[med-svn] [paleomix] 01/04: New upstream version 1.2.7
Andreas Tille
tille at debian.org
Tue Feb 14 15:37:17 UTC 2017
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository paleomix.
commit 346af031f1767c3408f4d3e28a45db5bff58d35d
Author: Andreas Tille <tille at debian.org>
Date: Tue Feb 14 16:01:50 2017 +0100
New upstream version 1.2.7
---
.gitignore | 23 +
CHANGES.md | 454 ++++++
MANIFEST.in | 19 +
README.rst | 11 +
bin/bam_pipeline | 43 +
bin/bam_rmdup_collapsed | 43 +
bin/conv_gtf_to_bed | 43 +
bin/paleomix | 43 +
bin/phylo_pipeline | 43 +
bin/trim_pipeline | 43 +
docs/Makefile | 192 +++
docs/_static/zonkey/incl_ts_0_tree_rooted.png | Bin 0 -> 18823 bytes
docs/_static/zonkey/incl_ts_0_tree_unrooted.png | Bin 0 -> 17608 bytes
docs/_static/zonkey/mito_phylo.png | Bin 0 -> 12751 bytes
docs/acknowledgements.rst | 8 +
docs/bam_pipeline/configuration.rst | 53 +
docs/bam_pipeline/filestructure.rst | 195 +++
docs/bam_pipeline/index.rst | 22 +
docs/bam_pipeline/makefile.rst | 716 ++++++++
docs/bam_pipeline/makefile.yaml | 162 ++
docs/bam_pipeline/overview.rst | 59 +
docs/bam_pipeline/requirements.rst | 54 +
docs/bam_pipeline/usage.rst | 520 ++++++
docs/conf.py | 286 ++++
docs/examples.rst | 106 ++
docs/index.rst | 39 +
docs/installation.rst | 115 ++
docs/introduction.rst | 21 +
docs/other_tools.rst | 119 ++
docs/phylo_pipeline/configuration.rst | 8 +
docs/phylo_pipeline/filestructure.rst | 7 +
docs/phylo_pipeline/index.rst | 27 +
docs/phylo_pipeline/makefile.rst | 10 +
docs/phylo_pipeline/overview.rst | 21 +
docs/phylo_pipeline/requirements.rst | 51 +
docs/phylo_pipeline/usage.rst | 219 +++
docs/references.rst | 28 +
docs/related.rst | 13 +
docs/troubleshooting/bam_pipeline.rst | 201 +++
docs/troubleshooting/common.rst | 114 ++
docs/troubleshooting/index.rst | 15 +
docs/troubleshooting/install.rst | 54 +
docs/troubleshooting/phylo_pipeline.rst | 8 +
docs/troubleshooting/zonkey_pipeline.rst | 6 +
docs/yaml.rst | 28 +
docs/zonkey_pipeline/configuration.rst | 47 +
docs/zonkey_pipeline/filestructure.rst | 68 +
docs/zonkey_pipeline/index.rst | 19 +
docs/zonkey_pipeline/overview.rst | 55 +
docs/zonkey_pipeline/panel.rst | 235 +++
docs/zonkey_pipeline/requirements.rst | 76 +
docs/zonkey_pipeline/usage.rst | 160 ++
examples | 1 +
licenses/gpl.txt | 674 ++++++++
licenses/mit.txt | 17 +
misc/setup_bam_pipeline_example.makefile.yaml | 35 +
misc/setup_bam_pipeline_example.sh | 57 +
misc/setup_phylo_pipeline_example.sh | 50 +
misc/skeleton.py | 37 +
misc/synthesize_reads.py | 406 +++++
paleomix/__init__.py | 63 +
paleomix/atomiccmd/__init__.py | 22 +
paleomix/atomiccmd/builder.py | 541 +++++++
paleomix/atomiccmd/command.py | 482 ++++++
paleomix/atomiccmd/pprint.py | 198 +++
paleomix/atomiccmd/sets.py | 191 +++
paleomix/common/__init__.py | 22 +
paleomix/common/bamfiles.py | 138 ++
paleomix/common/bedtools.py | 234 +++
paleomix/common/console.py | 99 ++
paleomix/common/fileutils.py | 344 ++++
paleomix/common/formats/__init__.py | 25 +
paleomix/common/formats/_common.py | 25 +
paleomix/common/formats/_graph.py | 295 ++++
paleomix/common/formats/fasta.py | 150 ++
paleomix/common/formats/fastq.py | 76 +
paleomix/common/formats/msa.py | 230 +++
paleomix/common/formats/newick.py | 357 ++++
paleomix/common/formats/phylip.py | 90 ++
paleomix/common/makefile.py | 901 +++++++++++
paleomix/common/procs.py | 108 ++
paleomix/common/rtools.py | 41 +
paleomix/common/sampling.py | 67 +
paleomix/common/sequences.py | 205 +++
paleomix/common/signals.py | 54 +
paleomix/common/system.py | 54 +
paleomix/common/testing.py | 112 ++
paleomix/common/text.py | 120 ++
paleomix/common/timer.py | 111 ++
paleomix/common/utilities.py | 312 ++++
paleomix/common/vcffilter.py | 411 +++++
paleomix/common/vcfwrap.py | 143 ++
paleomix/common/versions.py | 432 +++++
paleomix/config.py | 198 +++
paleomix/logger.py | 179 ++
paleomix/main.py | 236 +++
paleomix/node.py | 281 ++++
paleomix/nodegraph.py | 456 ++++++
paleomix/nodes/__init__.py | 22 +
paleomix/nodes/adapterremoval.py | 196 +++
paleomix/nodes/bedtools.py | 117 ++
paleomix/nodes/bowtie2.py | 147 ++
paleomix/nodes/bwa.py | 430 +++++
paleomix/nodes/commands.py | 375 +++++
paleomix/nodes/examl.py | 289 ++++
paleomix/nodes/formats.py | 257 +++
paleomix/nodes/gatk.py | 153 ++
paleomix/nodes/mafft.py | 93 ++
paleomix/nodes/mapdamage.py | 294 ++++
paleomix/nodes/misc.py | 54 +
paleomix/nodes/newick.py | 116 ++
paleomix/nodes/phylip.py | 188 +++
paleomix/nodes/picard.py | 299 ++++
paleomix/nodes/raxml.py | 350 ++++
paleomix/nodes/samtools.py | 203 +++
paleomix/nodes/sequences.py | 206 +++
paleomix/nodes/validation.py | 401 +++++
paleomix/pipeline.py | 492 ++++++
paleomix/resources/__init__.py | 75 +
.../bam_pipeline/000_data/ACGATA_L1_R1_01.fastq.gz | Bin 0 -> 37455 bytes
.../bam_pipeline/000_data/ACGATA_L1_R1_02.fastq.gz | Bin 0 -> 37120 bytes
.../bam_pipeline/000_data/ACGATA_L1_R1_03.fastq.gz | Bin 0 -> 37566 bytes
.../bam_pipeline/000_data/ACGATA_L1_R1_04.fastq.gz | Bin 0 -> 13322 bytes
.../bam_pipeline/000_data/ACGATA_L1_R2_01.fastq.gz | Bin 0 -> 37461 bytes
.../bam_pipeline/000_data/ACGATA_L1_R2_02.fastq.gz | Bin 0 -> 37220 bytes
.../bam_pipeline/000_data/ACGATA_L1_R2_03.fastq.gz | Bin 0 -> 37533 bytes
.../bam_pipeline/000_data/ACGATA_L1_R2_04.fastq.gz | Bin 0 -> 13408 bytes
.../000_data/ACGATA_L2/reads.collapsed.gz | Bin 0 -> 109199 bytes
.../ACGATA_L2/reads.collapsed.truncated.gz | Bin 0 -> 20 bytes
.../ACGATA_L2/reads.singleton.truncated.gz | Bin 0 -> 20 bytes
.../bam_pipeline/000_data/GCTCTG_L1_R1_01.fastq.gz | Bin 0 -> 37021 bytes
.../bam_pipeline/000_data/GCTCTG_L1_R1_02.fastq.gz | Bin 0 -> 31401 bytes
.../bam_pipeline/000_data/GCTCTG_L1_R1_03.fastq.gz | Bin 0 -> 11443 bytes
.../bam_pipeline/000_data/TGCTCA_L1_R1_01.fastq.gz | Bin 0 -> 37160 bytes
.../bam_pipeline/000_data/TGCTCA_L1_R1_02.fastq.gz | Bin 0 -> 37211 bytes
.../bam_pipeline/000_data/TGCTCA_L1_R1_03.fastq.gz | Bin 0 -> 34271 bytes
.../bam_pipeline/000_data/TGCTCA_L2_R1_01.fastq.gz | Bin 0 -> 36990 bytes
.../bam_pipeline/000_data/TGCTCA_L2_R1_02.fastq.gz | Bin 0 -> 36916 bytes
.../bam_pipeline/000_data/TGCTCA_L2_R1_03.fastq.gz | Bin 0 -> 3594 bytes
.../bam_pipeline/000_data/TGCTCA_L2_R2_01.fastq.gz | Bin 0 -> 37054 bytes
.../bam_pipeline/000_data/TGCTCA_L2_R2_02.fastq.gz | Bin 0 -> 36999 bytes
.../bam_pipeline/000_data/TGCTCA_L2_R2_03.fastq.gz | Bin 0 -> 3580 bytes
.../examples/bam_pipeline/000_makefile.yaml | 177 ++
.../examples/bam_pipeline/000_prefixes/rCRS.fasta | 239 +++
.../nature_protocols/alignment/000_makefile.yaml | 337 ++++
.../nature_protocols/alignment/000_prefixes/README | 6 +
.../alignment/000_prefixes/setup.sh | 29 +
.../alignment/000_rawreads/06_3928A/000_ENA | 7 +
.../alignment/000_rawreads/06_3928A/000_README | 2 +
.../alignment/000_rawreads/DDR7602/000_ENA | 2 +
.../alignment/000_rawreads/DDR7602/000_README | 2 +
.../alignment/000_rawreads/LBUS5/000_ENA | 2 +
.../alignment/000_rawreads/LBUS5/000_README | 2 +
.../alignment/000_rawreads/M-0182896/000_ENA | 6 +
.../alignment/000_rawreads/M-0182896/000_README | 2 +
.../alignment/000_rawreads/NL07434/000_ENA | 8 +
.../alignment/000_rawreads/NL07434/000_README | 2 +
.../alignment/000_rawreads/P13527/000_ENA | 5 +
.../alignment/000_rawreads/P13527/000_README | 2 +
.../alignment/000_rawreads/P13626/000_ENA | 5 +
.../alignment/000_rawreads/P13626/000_README | 2 +
.../alignment/000_rawreads/P17777/000_ENA | 8 +
.../alignment/000_rawreads/P17777/000_README | 2 +
.../alignment/000_rawreads/Pi1845A/000_ENA | 10 +
.../alignment/000_rawreads/Pi1845A/000_README | 2 +
.../alignment/000_rawreads/Pi1889/000_ENA | 16 +
.../alignment/000_rawreads/Pi1889/000_README | 2 +
.../alignment/000_rawreads/setup.sh | 39 +
.../alignment/M-0182896.Pi_mito.coverage | 40 +
.../alignment/M-0182896.Pi_mito.depths | 38 +
.../alignment/M-0182896.Pi_nucl.coverage | 40 +
.../alignment/M-0182896.Pi_nucl.depths | 38 +
.../example_results/alignment/M-0182896.summary | 301 ++++
.../alignment/Pi1845A.Pi_mito.coverage | 31 +
.../alignment/Pi1845A.Pi_mito.depths | 29 +
.../Pi1845A_id_CATAGA/3pGtoA_freq.txt | 26 +
.../Pi1845A_id_CATAGA/5pCtoT_freq.txt | 26 +
.../Fragmisincorporation_plot.pdf | Bin 0 -> 19656 bytes
.../Pi1845A_id_CATAGA/Length_plot.pdf | Bin 0 -> 9982 bytes
.../Pi1845A_id_CATAGA/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf | Bin 0 -> 6287 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8206 bytes
.../Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf | Bin 0 -> 370079 bytes
.../Pi1845A_id_CATAGA/dnacomp.txt | 324 ++++
.../Pi1845A_id_CATAGA/dnacomp_genome.csv | 2 +
.../Pi1845A_id_CATAGA/lgdistribution.txt | 152 ++
.../Pi1845A_id_CATAGA/misincorporation.txt | 284 ++++
.../Pi1845A_id_CGCTAT/3pGtoA_freq.txt | 26 +
.../Pi1845A_id_CGCTAT/5pCtoT_freq.txt | 26 +
.../Fragmisincorporation_plot.pdf | Bin 0 -> 20090 bytes
.../Pi1845A_id_CGCTAT/Length_plot.pdf | Bin 0 -> 10973 bytes
.../Pi1845A_id_CGCTAT/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf | Bin 0 -> 6452 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8178 bytes
.../Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf | Bin 0 -> 379890 bytes
.../Pi1845A_id_CGCTAT/dnacomp.txt | 324 ++++
.../Pi1845A_id_CGCTAT/dnacomp_genome.csv | 2 +
.../Pi1845A_id_CGCTAT/lgdistribution.txt | 229 +++
.../Pi1845A_id_CGCTAT/misincorporation.txt | 284 ++++
.../alignment/Pi1845A.Pi_nucl.coverage | 31 +
.../alignment/Pi1845A.Pi_nucl.depths | 29 +
.../Pi1845A_id_CATAGA/3pGtoA_freq.txt | 26 +
.../Pi1845A_id_CATAGA/5pCtoT_freq.txt | 26 +
.../Fragmisincorporation_plot.pdf | Bin 0 -> 19401 bytes
.../Pi1845A_id_CATAGA/Length_plot.pdf | Bin 0 -> 10034 bytes
.../Pi1845A_id_CATAGA/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf | Bin 0 -> 6408 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8177 bytes
.../Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf | Bin 0 -> 360701 bytes
.../Pi1845A_id_CATAGA/dnacomp.txt | 324 ++++
.../Pi1845A_id_CATAGA/dnacomp_genome.csv | 2 +
.../Pi1845A_id_CATAGA/lgdistribution.txt | 155 ++
.../Pi1845A_id_CATAGA/misincorporation.txt | 284 ++++
.../Pi1845A_id_CGCTAT/3pGtoA_freq.txt | 26 +
.../Pi1845A_id_CGCTAT/5pCtoT_freq.txt | 26 +
.../Fragmisincorporation_plot.pdf | Bin 0 -> 19641 bytes
.../Pi1845A_id_CGCTAT/Length_plot.pdf | Bin 0 -> 11894 bytes
.../Pi1845A_id_CGCTAT/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf | Bin 0 -> 6299 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8217 bytes
.../Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf | Bin 0 -> 375213 bytes
.../Pi1845A_id_CGCTAT/dnacomp.txt | 324 ++++
.../Pi1845A_id_CGCTAT/dnacomp_genome.csv | 2 +
.../Pi1845A_id_CGCTAT/lgdistribution.txt | 300 ++++
.../Pi1845A_id_CGCTAT/misincorporation.txt | 284 ++++
.../example_results/alignment/Pi1845A.summary | 186 +++
.../alignment/Pi1889.Pi_mito.coverage | 34 +
.../alignment/Pi1889.Pi_mito.depths | 32 +
.../Pi1889_id_CTTGTA/3pGtoA_freq.txt | 26 +
.../Pi1889_id_CTTGTA/5pCtoT_freq.txt | 26 +
.../Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf | Bin 0 -> 19770 bytes
.../Pi1889_id_CTTGTA/Length_plot.pdf | Bin 0 -> 11922 bytes
.../Pi1889_id_CTTGTA/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf | Bin 0 -> 6375 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8246 bytes
.../Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf | Bin 0 -> 368693 bytes
.../Pi1889_id_CTTGTA/dnacomp.txt | 324 ++++
.../Pi1889_id_CTTGTA/dnacomp_genome.csv | 2 +
.../Pi1889_id_CTTGTA/lgdistribution.txt | 317 ++++
.../Pi1889_id_CTTGTA/misincorporation.txt | 284 ++++
.../Pi1889_id_GGCTAC/3pGtoA_freq.txt | 26 +
.../Pi1889_id_GGCTAC/5pCtoT_freq.txt | 26 +
.../Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf | Bin 0 -> 19729 bytes
.../Pi1889_id_GGCTAC/Length_plot.pdf | Bin 0 -> 12059 bytes
.../Pi1889_id_GGCTAC/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf | Bin 0 -> 6418 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8246 bytes
.../Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf | Bin 0 -> 359446 bytes
.../Pi1889_id_GGCTAC/dnacomp.txt | 324 ++++
.../Pi1889_id_GGCTAC/dnacomp_genome.csv | 2 +
.../Pi1889_id_GGCTAC/lgdistribution.txt | 325 ++++
.../Pi1889_id_GGCTAC/misincorporation.txt | 284 ++++
.../Pi1889_id_TAGCTT/3pGtoA_freq.txt | 26 +
.../Pi1889_id_TAGCTT/5pCtoT_freq.txt | 26 +
.../Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf | Bin 0 -> 19863 bytes
.../Pi1889_id_TAGCTT/Length_plot.pdf | Bin 0 -> 12089 bytes
.../Pi1889_id_TAGCTT/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf | Bin 0 -> 6490 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8293 bytes
.../Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf | Bin 0 -> 370359 bytes
.../Pi1889_id_TAGCTT/dnacomp.txt | 324 ++++
.../Pi1889_id_TAGCTT/dnacomp_genome.csv | 2 +
.../Pi1889_id_TAGCTT/lgdistribution.txt | 327 ++++
.../Pi1889_id_TAGCTT/misincorporation.txt | 284 ++++
.../alignment/Pi1889.Pi_nucl.coverage | 34 +
.../alignment/Pi1889.Pi_nucl.depths | 32 +
.../Pi1889_id_CTTGTA/3pGtoA_freq.txt | 26 +
.../Pi1889_id_CTTGTA/5pCtoT_freq.txt | 26 +
.../Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf | Bin 0 -> 19632 bytes
.../Pi1889_id_CTTGTA/Length_plot.pdf | Bin 0 -> 12144 bytes
.../Pi1889_id_CTTGTA/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf | Bin 0 -> 6514 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8070 bytes
.../Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf | Bin 0 -> 343001 bytes
.../Pi1889_id_CTTGTA/dnacomp.txt | 324 ++++
.../Pi1889_id_CTTGTA/dnacomp_genome.csv | 2 +
.../Pi1889_id_CTTGTA/lgdistribution.txt | 325 ++++
.../Pi1889_id_CTTGTA/misincorporation.txt | 284 ++++
.../Pi1889_id_GGCTAC/3pGtoA_freq.txt | 26 +
.../Pi1889_id_GGCTAC/5pCtoT_freq.txt | 26 +
.../Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf | Bin 0 -> 19582 bytes
.../Pi1889_id_GGCTAC/Length_plot.pdf | Bin 0 -> 12195 bytes
.../Pi1889_id_GGCTAC/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf | Bin 0 -> 6394 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8078 bytes
.../Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf | Bin 0 -> 345771 bytes
.../Pi1889_id_GGCTAC/dnacomp.txt | 324 ++++
.../Pi1889_id_GGCTAC/dnacomp_genome.csv | 2 +
.../Pi1889_id_GGCTAC/lgdistribution.txt | 328 ++++
.../Pi1889_id_GGCTAC/misincorporation.txt | 284 ++++
.../Pi1889_id_TAGCTT/3pGtoA_freq.txt | 26 +
.../Pi1889_id_TAGCTT/5pCtoT_freq.txt | 26 +
.../Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf | Bin 0 -> 19634 bytes
.../Pi1889_id_TAGCTT/Length_plot.pdf | Bin 0 -> 12227 bytes
.../Pi1889_id_TAGCTT/Runtime_log.txt | 4 +
.../Stats_out_MCMC_correct_prob.csv | 25 +
.../Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf | Bin 0 -> 6435 bytes
.../Stats_out_MCMC_iter_summ_stat.csv | 45 +
.../Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf | Bin 0 -> 8078 bytes
.../Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf | Bin 0 -> 342782 bytes
.../Pi1889_id_TAGCTT/dnacomp.txt | 324 ++++
.../Pi1889_id_TAGCTT/dnacomp_genome.csv | 2 +
.../Pi1889_id_TAGCTT/lgdistribution.txt | 328 ++++
.../Pi1889_id_TAGCTT/misincorporation.txt | 284 ++++
.../example_results/alignment/Pi1889.summary | 236 +++
.../example_results/profiling/M.0182896.NO.UDG.txt | 14 +
.../example_results/profiling/M.0182896.UDG.txt | 14 +
.../example_results/profiling/M.0182896.UDGa.txt | 29 +
.../example_results/profiling/M.0182896.UDGb.txt | 21 +
.../example_results/profiling/M.0182896.UDGc.txt | 37 +
.../profiling/Pi1845A.id.CGCTAT.txt | 75 +
.../example_results/profiling/Pi1889.id.CTTGTA.txt | 61 +
.../example_results/profiling/Pi1889.id.GGCTAC.txt | 61 +
.../example_results/profiling/Pi1889.id.TAGCTT.txt | 71 +
.../krona/Figure_Krona_M.0182896.all.html | 177 ++
.../profiling/krona/Figure_Krona_Pi1845A.all.html | 252 +++
.../profiling/krona/Figure_Krona_Pi1889.all.html | 327 ++++
.../profiling/krona/Krona_M.0182896.all.txt | 16 +
.../profiling/krona/Krona_Pi1845A.all.txt | 22 +
.../profiling/krona/Krona_Pi1845A.pdf | Bin 0 -> 48840 bytes
.../profiling/krona/Krona_Pi1889.all.txt | 40 +
.../profiling/krona/M.0182896.all.txt | 50 +
.../example_results/profiling/krona/Pi1889.all.txt | 100 ++
.../profiling/results/Barplot_class.pdf | Bin 0 -> 4997 bytes
.../profiling/results/Clustering_genus.pdf | Bin 0 -> 5037 bytes
.../profiling/results/Distances_genus.txt | 10 +
.../profiling/results/Diversity_genus.txt | 9 +
.../profiling/results/Heatmap_genus.pdf | Bin 0 -> 8533 bytes
.../profiling/results/PCA_genus.pdf | Bin 0 -> 5918 bytes
.../profiling/results/PCOA_genus.pdf | Bin 0 -> 30106 bytes
.../profiling/results/Potato_merged.csv | 126 ++
.../profiling/results/Potato_merged_profiles.csv | 144 ++
.../results/Relative_abundances_genus.csv | 32 +
.../profiling/results/Taxon_count_genus.txt | 9 +
.../nature_protocols/phylogeny/000_makefile.yaml | 178 ++
.../phylogeny/select_highly_covered_genes.py | 68 +
.../phylogeny/summarize_heterozygosity.py | 129 ++
.../profiling/build_all_profiles.sh | 42 +
.../nature_protocols/profiling/build_profile.sh | 103 ++
.../profiling/metagenomic_profile.R | 118 ++
.../phylo_pipeline/alignment/000_makefile.yaml | 169 ++
.../alignment/000_prefixes/bonobo.fasta | 239 +++
.../alignment/000_prefixes/chimpanzee.fasta | 239 +++
.../alignment/000_prefixes/gorilla.fasta | 237 +++
.../alignment/000_prefixes/rCRS.fasta | 239 +++
.../alignment/000_prefixes/rCRS.fasta.fai | 1 +
.../000_prefixes/sumatran_orangutan.fasta | 238 +++
.../000_prefixes/white_handed_gibbon.fasta | 276 ++++
.../examples/phylo_pipeline/alignment/setup.sh | 30 +
.../phylo_pipeline/phylogeny/000_makefile.yaml | 177 ++
.../phylogeny/data/regions/rCRS.non_coding.bed | 12 +
.../data/regions/rCRS.protein_coding.CDS.bed | 13 +
.../resources/examples/phylo_pipeline/setup.sh | 1 +
.../examples/phylo_pipeline/synthesize_reads.py | 1 +
paleomix/resources/reports/zonkey/report.css | 267 +++
paleomix/resources/rscripts/common/requires.r | 21 +
paleomix/resources/rscripts/zonkey/admixture.r | 102 ++
paleomix/resources/rscripts/zonkey/coverage.r | 112 ++
paleomix/resources/rscripts/zonkey/pca.r | 83 +
paleomix/resources/rscripts/zonkey/tinytree.r | 532 ++++++
paleomix/resources/rscripts/zonkey/treemix.r | 651 ++++++++
paleomix/tools/__init__.py | 22 +
paleomix/tools/bam_pipeline/__init__.py | 23 +
paleomix/tools/bam_pipeline/config.py | 109 ++
paleomix/tools/bam_pipeline/makefile.py | 838 ++++++++++
paleomix/tools/bam_pipeline/mkfile.py | 385 +++++
paleomix/tools/bam_pipeline/nodes.py | 139 ++
paleomix/tools/bam_pipeline/parts/__init__.py | 7 +
paleomix/tools/bam_pipeline/parts/lane.py | 310 ++++
paleomix/tools/bam_pipeline/parts/library.py | 255 +++
paleomix/tools/bam_pipeline/parts/prefix.py | 106 ++
paleomix/tools/bam_pipeline/parts/reads.py | 117 ++
paleomix/tools/bam_pipeline/parts/sample.py | 41 +
paleomix/tools/bam_pipeline/parts/statistics.py | 210 +++
paleomix/tools/bam_pipeline/parts/summary.py | 463 ++++++
paleomix/tools/bam_pipeline/parts/target.py | 35 +
paleomix/tools/bam_pipeline/paths.py | 85 +
paleomix/tools/bam_pipeline/pipeline.py | 305 ++++
paleomix/tools/bam_pipeline/remap.py | 291 ++++
paleomix/tools/bam_pipeline/trim_pipeline.py | 28 +
paleomix/tools/bam_stats/__init__.py | 0
paleomix/tools/bam_stats/common.py | 183 +++
paleomix/tools/bam_stats/coverage.py | 191 +++
paleomix/tools/cat.py | 104 ++
paleomix/tools/cleanup.py | 386 +++++
paleomix/tools/coverage.py | 193 +++
paleomix/tools/depths.py | 404 +++++
paleomix/tools/duphist.py | 107 ++
paleomix/tools/ena.py | 499 ++++++
paleomix/tools/factory.py | 67 +
paleomix/tools/genotype.py | 524 ++++++
paleomix/tools/gtf_to_bed.py | 325 ++++
paleomix/tools/phylo_pipeline/__init__.py | 22 +
paleomix/tools/phylo_pipeline/config.py | 167 ++
paleomix/tools/phylo_pipeline/makefile.py | 757 +++++++++
paleomix/tools/phylo_pipeline/mkfile.py | 233 +++
paleomix/tools/phylo_pipeline/parts/__init__.py | 22 +
paleomix/tools/phylo_pipeline/parts/genotype.py | 388 +++++
paleomix/tools/phylo_pipeline/parts/msa.py | 98 ++
paleomix/tools/phylo_pipeline/parts/paml.py | 170 ++
paleomix/tools/phylo_pipeline/parts/phylo.py | 260 +++
paleomix/tools/phylo_pipeline/pipeline.py | 137 ++
paleomix/tools/rmdup_collapsed.py | 182 +++
paleomix/tools/sample_pileup.py | 248 +++
paleomix/tools/vcf_filter.py | 85 +
paleomix/tools/vcf_to_fasta.py | 362 +++++
paleomix/tools/zonkey/__init__.py | 21 +
paleomix/tools/zonkey/build_db.py | 347 ++++
paleomix/tools/zonkey/build_mito.py | 203 +++
paleomix/tools/zonkey/build_tped.py | 352 ++++
paleomix/tools/zonkey/common.py | 81 +
paleomix/tools/zonkey/config.py | 365 +++++
paleomix/tools/zonkey/database.py | 562 +++++++
paleomix/tools/zonkey/parts/__init__.py | 21 +
paleomix/tools/zonkey/parts/admixture.py | 171 ++
paleomix/tools/zonkey/parts/common.py | 65 +
paleomix/tools/zonkey/parts/mitochondria.py | 100 ++
paleomix/tools/zonkey/parts/nuclear.py | 755 +++++++++
paleomix/tools/zonkey/parts/report.py | 869 ++++++++++
paleomix/tools/zonkey/parts/summary.py | 532 ++++++
paleomix/tools/zonkey/pipeline.py | 458 ++++++
paleomix/ui.py | 469 ++++++
paleomix/yaml/CHANGES | 140 ++
paleomix/yaml/LICENSE | 19 +
paleomix/yaml/PKG-INFO | 38 +
paleomix/yaml/README | 35 +
paleomix/yaml/__init__.py | 31 +
paleomix/yaml/lib2/__init__.py | 310 ++++
paleomix/yaml/lib2/composer.py | 139 ++
paleomix/yaml/lib2/constructor.py | 678 ++++++++
paleomix/yaml/lib2/dumper.py | 62 +
paleomix/yaml/lib2/emitter.py | 1140 +++++++++++++
paleomix/yaml/lib2/error.py | 75 +
paleomix/yaml/lib2/events.py | 86 +
paleomix/yaml/lib2/loader.py | 40 +
paleomix/yaml/lib2/nodes.py | 49 +
paleomix/yaml/lib2/parser.py | 589 +++++++
paleomix/yaml/lib2/reader.py | 190 +++
paleomix/yaml/lib2/representer.py | 484 ++++++
paleomix/yaml/lib2/resolver.py | 225 +++
paleomix/yaml/lib2/scanner.py | 1457 +++++++++++++++++
paleomix/yaml/lib2/serializer.py | 111 ++
paleomix/yaml/lib2/tokens.py | 104 ++
pylint.conf | 286 ++++
setup.py | 107 ++
tests/atomiccmd_test/__init__.py | 22 +
tests/atomiccmd_test/builder_test.py | 715 ++++++++
tests/atomiccmd_test/command_test.py | 873 ++++++++++
tests/atomiccmd_test/pprint_test.py | 396 +++++
tests/atomiccmd_test/sets_test.py | 340 ++++
tests/bwa/README | 4 +
tests/bwa/run.sh | 153 ++
tests/bwa/testcases/case_01a/README | 1 +
tests/bwa/testcases/case_01a/prefix.fasta | 2 +
tests/bwa/testcases/case_01a/reads1.fasta | 2 +
tests/bwa/testcases/case_01a/reads2.fasta | 2 +
tests/bwa/testcases/case_01b/README | 1 +
tests/bwa/testcases/case_01b/prefix.fasta | 7 +
tests/bwa/testcases/case_01b/reads1.fasta | 2 +
tests/bwa/testcases/case_01b/reads2.fasta | 2 +
tests/bwa/testcases/case_02/README | 1 +
tests/bwa/testcases/case_02/prefix.fasta | 3 +
tests/bwa/testcases/case_02/reads1.fasta | 3 +
tests/bwa/testcases/case_02/reads2.fasta | 2 +
tests/bwa/testcases/case_02/run.sh | 20 +
tests/bwa/testcases/case_03/README | 15 +
tests/bwa/testcases/case_03/prefix.fasta | 6 +
tests/bwa/testcases/case_03/reads.fasta | 2 +
tests/bwa/testcases/case_03/run.sh | 17 +
tests/bwa/testcases/case_04/README | 1 +
tests/bwa/testcases/case_04/prefix.fasta | 4 +
tests/bwa/testcases/case_04/reads.fasta | 2 +
tests/bwa/testcases/case_04/results.sam | 3 +
tests/bwa/testcases/case_04/run.sh | 19 +
tests/common_tests/__init__.py | 22 +
tests/common_tests/bedtools_tests.py | 242 +++
tests/common_tests/fileutils_test.py | 952 +++++++++++
tests/common_tests/formats_tests/__init__.py | 22 +
tests/common_tests/formats_tests/fasta_test.py | 296 ++++
tests/common_tests/formats_tests/msa_test.py | 436 +++++
tests/common_tests/formats_tests/newick_tests.py | 635 ++++++++
tests/common_tests/formats_tests/phylip_test.py | 186 +++
tests/common_tests/makefile_test.py | 1709 ++++++++++++++++++++
tests/common_tests/sampling_tests.py | 132 ++
tests/common_tests/sequences_test.py | 228 +++
tests/common_tests/signals_test.py | 56 +
tests/common_tests/text_tests.py | 289 ++++
tests/common_tests/utilities_test.py | 727 +++++++++
tests/common_tests/versions_tests.py | 724 +++++++++
tests/data/alignments/library_1.bam | Bin 0 -> 50182 bytes
tests/data/alignments/library_2.bam | Bin 0 -> 59061 bytes
tests/data/empty_file_1 | 0
tests/data/empty_file_2 | 0
tests/data/empty_file_3 | 0
tests/data/fasta_file.fasta | 4 +
tests/data/fasta_file.fasta.bz2 | Bin 0 -> 82 bytes
tests/data/fasta_file.fasta.gz | Bin 0 -> 70 bytes
tests/data/non_empty_file_1 | 1 +
tests/data/non_empty_file_2 | 1 +
tests/data/non_empty_file_3 | 1 +
tests/data/rCRS.fasta | 239 +++
tests/data/rCRS.fasta.fai | 1 +
tests/data/raw_reads/pe_reads_R1_001.fastq.gz | Bin 0 -> 7780 bytes
tests/data/raw_reads/pe_reads_R1_002.fastq.gz | Bin 0 -> 8663 bytes
tests/data/raw_reads/pe_reads_R2_001.fastq.gz | Bin 0 -> 8065 bytes
tests/data/raw_reads/pe_reads_R2_002.fastq.gz | Bin 0 -> 8766 bytes
tests/data/raw_reads/se_reads_R1_001.fastq.gz | Bin 0 -> 6645 bytes
tests/data/raw_reads/se_reads_R1_002.fastq.gz | Bin 0 -> 6048 bytes
tests/data/sim_reads/mate_1.fastq.gz | Bin 0 -> 36994 bytes
tests/data/sim_reads/mate_2.fastq.gz | Bin 0 -> 36909 bytes
tests/data/simple.yaml | 3 +
tests/data/timestamp_a_older | 0
tests/data/timestamp_a_younger | 0
tests/data/timestamp_b_older | 0
tests/data/timestamp_b_younger | 0
tests/node_test.py | 579 +++++++
tests/nodegraph_test.py | 146 ++
tests/run | 27 +
tests/tools_test/factory_test.py | 121 ++
tox.ini | 23 +
538 files changed, 69178 insertions(+)
diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..2295b43
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,23 @@
+*.swp
+.\#*
+\#*
+*~
+*.py[cod]
+
+.coverage
+MANIFEST
+
+# Packages
+dist
+build
+sdist
+
+tests/runs
+tests/links/
+
+*.egg/
+*.egg-info/
+.eggs
+.tox
+
+docs/_build
diff --git a/CHANGES.md b/CHANGES.md
new file mode 100644
index 0000000..fe86ba8
--- /dev/null
+++ b/CHANGES.md
@@ -0,0 +1,454 @@
+# Change log
+
+
+## [1.2.7] - 2017-01-03
+### Added
+ - PALEOMIX now includes the 'Zonkey' pipeline, a pipeline for
+ detecting equine F1 hybrids from archeological remains. Usage
+ is described in the documentation.
+
+
+### Changed
+ - The wrongly named per-sample option 'Gender' in the phylogenetic
+ pipeline makefile has been replaced with a 'Sex' option. This does not
+ break backwards compatibility, and makefiles using the old name will still
+ work correctly.
+ - The 'RescaleQualities' option has been merged with the 'mapDamage' Feature
+ in the BAM pipeline makefile. The 'mapDamage' feature now takes the options
+ 'plot', 'model', and 'rescale', allowing more fine-grained control.
+
+
+### Fixed
+ - Fixed the phylogenetic pipeline complaining about missing sample genders
+ (now sex) if no regions of interest had been specified. The pipeline will
+ now complain about there being no regions of interest, instead.
+ - The 'random sampling' genotyper would misinterpret mapping qualities 10
+ (encoded as '+') and 12 (encoded as '-') as indels, resulting in the
+ genotyping failing. These mapping qualities are now correctly ignored.
+
+
+## [1.2.6] - 2016-10-12
+### Changed
+ - PALEOMIX now uses the 'setproctitle' for better compatibility; installing
+ / upgrading PALEOMIX using pip (or equivalent tools) should automatically
+ install this dependency.
+
+### Fixed
+ - mapDamage plots should not require indexed BAMs; this fixed missing file
+ errors for some makefile configurations.
+ - Version check for java now works correctly for OpenJDK JVMs.
+ - Pressing 'l' or 'L' to list the currently running tasks now correctly
+ reports the total runtime of the pipeline, rather than 0s.
+ - Fixed broken version-check in setup.py breaking on versions of python
+ older than 2.7, preventing a meaningful message (patch by beeso018).
+ - The total runtime is now correctly reported when pressing the 'l' key
+ during execution of a pipeline.
+ - The logger will automatically create the output directory if this does
+ not already exist; previously logged messages could cause the pipeline
+ to fail, even if these were not in themselves fatal.
+ - Executables required for version checks are now included in
+ the prior checks for missing executables, to avoid version-checks failing
+ due to missing executables.
+
+### Added
+ - PALEOMIX will attempt to automatically limit the per-process maximum
+ number of file-handles used when invoking Picard tools, in order
+ to prevent failures due to exceeding the system limits (ulimit -n).
+
+
+## [1.2.5] - 2015-06-06
+### Changed
+ - Improved information capture when a node raises an unexpected exception,
+ mainly for nodes implementing their own 'run' function (not CommandNodes).
+ - Improved printing of the state of output files when using the command-line
+ option --list-output-files. Outdated files are now always listed as
+ outdated, where previously these could be listed as 'Missing' if the task
+ in question was queued to be run next.
+ - Don't attempt to validate prefixes when running 'trim_pipeline'; note that
+ the structure of the Prefix section the makefile still has to be valid.
+ - Reverted commit normalizing the strand of unmapped reads.
+ - The commands 'paleomix coverage' and 'paleomix depths' now accept records
+ lacking read-group information by default; these are recorded as <NA> in the
+ sample and library columns. It is further possible to ignore all read-group
+ information using the --ignore-readgroups command-line option.
+ - The 'bam_pipeline mkfile' command now does limited validation of input
+ 'SampleSheet.csv', prints generated targets sorted alphabetically, and
+ automatically generates unique names for identically named lanes. Finally,
+ the target template is not included when automatically generating a makefile.
+ - The 'coverage' and 'depth' commands are now capable of processing files
+ containing reads with and without read-groups, without requiring the use
+ of the --ignore-readgroups command-line option. Furthermore, reads for
+ which the read-group is missing in the BAM header are treated as if no
+ readgroup was specified for that read.
+ - The 'coverage' and 'depth' command now checks that input BAM files are
+ sorted during startup and while processing a file.
+ - Normalized information printed by different progress UIs (--progress-ui),
+ and included the maximum number of threads allowed.
+ - Restructured CHANGELOG based on http://keepachangelog.com/
+
+### Fixed
+ - Fixed mislabeling of BWA nodes; all were labeled as 'SE'.
+ - Terminate read duplication checks when reaching the trailing, unmapped
+ reads; this fixes uncontrolled memory growth when an alignment produces a
+ large number of unmapped reads.
+ - Fixed the pipeline demanding the existence of files from lanes that had
+ been entirely excluded due to ExcludeReads settings.
+ - Fixed some tasks needlessly depending on BAM files being indexed (e.g.
+ depth histograms of a single BAM), resulting in missing file errors for
+ certain makefile configurations.
+ - Fixed per-prefix scan for duplicate input data not being run if no BAMs
+ were set to be generated in the makefile, i.e. if both 'RawBAM' and
+ 'RealignedBAM' was set to 'off'.
+
+### Deprecated
+ - Removed the BAM file from the bam_pipeline example, and added deprecation
+ warning; support for including preexisting BAMs will be removed in a
+ future version of PALEOMIX.
+
+
+## [1.2.4] - 2015-03-14
+### Added
+ - Included PATH in 'pipe.errors' file, to assist debugging of failed nodes.
+
+### Fixed
+ - Fix regression causing 'fixmate' not to be run on paired-end reads. This
+ would occasionally cause paired-end mapping to fail during validation.
+
+
+## [1.2.3] - 2015-03-11
+### Added
+ - Added the ability to the pipelines to output the list of input files
+ required for a given makefile, excluding any file built by the pipeline
+ itself. Use the --list-input-files command-line option to view these.
+
+### Changed
+ - Updated 'bam_pipeline' makefile template; prefixes and targets are
+ described more explicitly, and values for the prefix are commented out by
+ default. The 'Label' option is not included in the template, as it is
+ considered deprecated.
+ - Allow the 'trim_pipeline' to be run on a makefile without any prefixes;
+ this eases use of this pipeline in the case where a mapping is not wanted.
+ - Improved handling of unmapped reads in 'paleomix cleanup'; additional
+ flags (in particular 0x2; proper alignment) are now cleared if the mate is
+ unmapped, and unmapped reads are always represented on the positive strand
+ (clearing 0x4 and / or 0x20).
+
+
+## [1.2.2] - 2015-03-10
+### Added
+ - Documented work-arounds for problem caused when upgrading an old version of
+ PALEOMIX (< 1.2.0) by using 'pip' to install a newer version, in which all
+ command-line aliases invoke the same tool.
+ - Added expanded description of PALEOMIX to README file.
+ - The tool 'paleomix vcf_filter' can now clear any existing value in the
+ FILTER column, and only record the result of running the filters
+ implemented by this tool. This behavior may be enabled by running
+ vcf_filter with the command-line option '--reset-filter yes'.
+
+### Changed
+ - Improved parsing of 'depths' histograms when running the phylogenetic
+ pipeline genotyping step with 'MaxDepth: auto'; mismatches between the
+ sample name in the table and in the makefile now only cause a warning,
+ allowing for the common case where file depths were manually recalculated
+ (and --target was not set), or where files were renamed.
+ - The tool 'paleomix rmdup_collapsed' now assumes that ALL single-end reads
+ (flag 0x1 not set) are collapsed. This ensures that pre-collapsed reads
+ used in the pipeline are correctly filtered. Furthermore, reads without
+ quality scores will be filtered, but only selected as the unique
+ representative for a set of potential duplicates if no reads have quality
+ scores. In that case, a random read is selected among the candidates.
+
+### Fixed
+ - Fixed failure during mapping when using SAMTools v1.x.
+
+
+## [1.2.1] - 2015-03-08
+### Changed
+ - Remove dependency on BEDTools from the Phylogenetic pipeline.
+ - Change paleomix.__version__ to follow PEP 0396.
+
+### Fixed
+ - Stop 'phylo_pipeline makefile' from always printing help text.
+ - Fixed bug causing the phylo_pipeline to throw exception if no additional
+ command-line arguments were given.
+ - Allow simulation of reads for phylogenetic pipeline example to be executed
+ when PALEOMIX is run from a virtual environment.
+
+
+## [1.2.0] - 2015-02-24
+This is a major revision of PALEOMIX, mainly focused on reworking the internals
+of the PALEOMIX framework, as well as cleaning up several warts in the BAM
+pipeline. As a result, the default makefile has changed in a number of ways,
+but backwards compatibility is still retained with older makefiles, with one
+exception. Where previously the 'FilterUnmappedReads' would only be in effect
+when 'MinQuality' was set to 0, this option is now independent of the
+'MinQuality' option.
+
+In addition, it is now possible to install PALEOMIX via Pypi, as described in
+the (partially) updated documentation now hosted on ReadTheDocs.
+
+### Changed
+ - Initial version of updated documentation hosted on ReadTheDocs, to replace
+ documentation currently hosted on the repository wiki.
+ - mapDamage files and models are now only kept in the
+ {Target}.{Prefix}.mapDamage folder to simplify the file-structure;
+ consequently, re-scaling can be re-done with different parameters by
+ re-running the model step in these folders.
+ - Rework BWA backtrack mapping to be carried out in two steps; this requires
+ saving the .sai files (and hence more disk-space used by intermediate
+ files, which can be removed afterwards), but allows better control over
+ thread and memory usage.
+ - Validate paths in BAM makefiles, to ensure that these can be parsed, and
+ that these do not contain keys other than '{Pair}'.
+ - The mapping-quality filter in the BAM pipeline / 'cleanup' command now only
+ applies to mapped reads; consequently, setting a non-zero mapq value, and
+ setting 'FilterUnmappedReads' to 'no' will not result in unmapped reads
+ being filtered.
+ - Improved the cleanup of BAM records following mapping, to better ensure
+ that the resulting records follow the recommendations in the SAM spec. with
+ regards to what fields / flags are set.
+ - Configuration files are now expected to be located in ~/.paleomix or
+ /etc/paleomix rather than ~/.pypeline and /etc/pypeline. To ensure
+ backwards compatibility, ~/.pypeline will be migrated when a pipeline is
+ first run, and replaced with a symbolic link to the new location.
+ Furthermore, files in /etc/pypeline are still read, but settings in
+ /etc/paleomix take precedence.
+ - When parsing GTF files with 'gtf_to_bed', use either the attribute
+ 'gene_type' or 'gene_biotype', defaulting to the value 'unknown_genetype'
+ if neither attribute can be found; also support reading of gz / bz2 files.
+ - The "ExcludeReads" section of the BAM Pipeline makefile is now a dictionary
+ rather than a list of strings. Furthermore, 'Singleton' reads are now considered
+ separately from 'Single'-end reads, and may be excluded independently of
+ those. This does not break backwards compatibility, but as a consequence
+ 'Single' includes both single-end and singleton reads when using old
+ makefiles.
+ - Added command-line option --nth-sample to the 'vcf_to_fasta' command,
+ allowing FASTA construction from multi-sample VCFs; furthermore, if no BED
+ file is specified, the entire genotype is constructed assuming that the VCF
+ header is present.
+ - Modify the FASTA indexing node so that SAMTools v0.1.x and v1.x can be used
+ (added workaround for missing feature in v1.x).
+ - The "Features" section of the BAM Pipeline makefile is now a dictionary
+ rather than a list of strings, and spaces have been removed from feature
+ names. This does not break backwards compatibility.
+ - EXaML v3.0+ is now required; the name of the examl parser executable is
+ required to be 'parse-examl' (previously expected to be 'examlParser'),
+ following the name used by EXaML v3.0+.
+ - Pysam v0.8.3+ is now required.
+ - AdapterRemoval v2.1.5+ is now required; it is now possible to provide a
+ list of adapter sequences using --adapter-list, and to specify the number
+ of threads uses by AdapterRemoval via the --adapterremoval-max-threads
+ command-line option.
+ - Renamed module from 'pypeline' to 'paleomix' to avoid conflicts.
+ - Improved handling of FASTQ paths containing wildcards in the BAM pipeline,
+ including additional checks to catch unequal numbers of files for paired-
+ end reads.
+ - Switch to setuptools in preparation for PyPI registration.
+ - Avoid separate indexing of intermediate BAMs when possible, reducing the
+ total number of steps required for typical runs.
+ - Restructure tests, removing (mostly unused) node tests.
+ - Reworked sub-command handling to enable migration to setup-tools, and
+ improved the safety of invoking these from the pipeline itself.
+ - The output of "trim_pipeline mkfile" now includes the section for
+ AdapterRemoval, which was previously mistakenly omitted.
+ - Increased the speed of the checks for duplicate input data (i.e. the same
+ FASTQ record(s) included multiple times in one or more files) by ~4x.
+
+### Added
+ - Paleomix v1.2.0 is now available via Pypi ('pip install paleomix').
+ - Added command 'paleomix ena', which is designed to ease the preparation of
+ FASTQ reads previously recorded in a BAM pipeline makefile for submission
+ to the European Nucleotide Archive; this command is currently unstable, and
+ not available by default (see comments in 'main.py').
+ - Exposed 'bam_pipeline remap' command, which eases re-mapping the hits
+ identified against one prefix against other prefixes.
+ - Added validation of BED files supplied to the BAM pipeline, and expand
+ validation of BED files supplied to the Phylogenetic pipeline, to catch
+ some cases that may cause unexpected behavior or failure during runtime.
+ - Support SAMTools v1.x in the BAM pipeline; note, however, that the
+ phylogenetic pipeline still requires SAMTools v0.1.19, due to major changes
+ to BCFTools 1.x, which is not yet supported.
+ - Modified 'bam_cleanup' to support SAMTools 1.x; SAMTools v0.1.19 or v1.x+
+ is henceforth required by this tool.
+ - The gender 'NA' may now be used for samples for which no filtering of sex
+ chromosomes is to be carried out, and defaults to an empty set of
+ chromosomes unless explicitly overridden.
+ - Pipeline examples are now available following installation via the commands
+ "bam_pipeline example" and "phylo_pipeline example", which copy the example
+ files to a folder specified by the user.
+ - Added ability to specify the maximum number of threads used by GATK;
+ currently only applicable for training of indel realigner.
+
+### Fixed
+ - Ensured that only a single header is generated when using multiple threads
+ during genotyping, in order to avoid issues with programs unable to handle
+ multiple headers.
+ - Information / error messages are now more consistently logged to stderr, to
+ better ensure that results printed to stdout are not mixed with those.
+ - Fixed bug which could cause the data duplication detection to fail when
+ unmapped reads were included.
+ - Fixed default values not being shown for 'vcf_filter --help'.
+ - Fix 'vcf_filter' when using pysam v0.8.4; would raise exception due to
+ changes to the VCF record class.
+
+### Removed
+ - Removed the 'paleomix zip' command, as this is no longer needed thanks to
+ built-in gzip / bzip2 support in AdapterRemoval v2.
+ - Removed commandline options --allow-missing-input-files,
+ --list-orphan-files, --target, and --list-targets.
+
+
+## [1.1.1] - 2015-10-10
+
+### Changed
+ - Detect the presence of carriage-returns ('\r') in FASTA files used as
+ prefixes; these cause issues with some tools, and files should be converted
+ using e.g. 'dos2unix' first.
+
+### Fixed
+ - Minor fix to help-text displayed as part of running information.
+
+### Deprecated
+ - AdapterRemoval v1.x is now considered deprecated, and support will be
+ dropped shortly. Please upgrade to v2.1 or later, which can be found at
+ https://github.com/MikkelSchubert/adapterremoval
+
+### Removed
+ - Dropped support for Picard tools versions prior to 1.124; this was
+ necessitated by Picard tools merging into a single jar for all commands. This
+ jar (picard.jar) is expected to be located in the --jar-root folder.
+
+
+## [1.1.0] - 2015-09-08
+### Added
+ - Check that regions of interest specified in PhylogeneticInference section
+ corresponds to those specified earlier in the makefile.
+ - Added the ability to automatically read MaxReadDepth values from
+ depth-histograms generated by the BAM pipeline to the genotyping step.
+ - Add support for BWA algorithms "bwasw" and "mem", which are recommended for
+ longer sequencing reads. The default remains the "backtrack" algorithm.
+ - Include list of filters in 'vcf_filter' output and renamed these to be
+ compatible with GATK (using ':' instead of '=').
+ - Support for genotyping entire BAM (once, and only once), even if only a set
+ of regions are to be called; this is useful in the context of larger
+ projects, and when multiple overlapping regions are to be genotyped.
+ - Added validation of FASTA files for the BAM pipeline, in order to catch
+ several types of errors that may lead to failure during mapping.
+ - Added options to BAM / Phylo pipelines for writing Dot-file of the full
+ dependency tree of a pipeline.
+ - Added the ability to change the number of threads, and more, while the
+ pipeline is running. Currently, already running tasks are not terminated if
+ the maximum number of threads is decreased. Press 'h' during runtime to
+ list commands.
+ - Support for AdapterRemoval v2.
+ - Allow the -Xmx option for Java to be overridden by the user.
+
+### Changed
+ - Prohibit whitespace and parentheses in prefix paths; these cause problems
+ with Bowtie2, due to the wrapper script used by this program.
+ - Allow "*" as the name for prefixes, when selecting prefixes by wildcards.
+ - Rework genotyping step to improve performance when genotyping sparse
+ regions (e.g. genes), and to allow transparent parallelization.
+ - Require BWA 0.5.9, 0.5.10, 0.6.2, or 0.7.9+ for BWA backtrack; other
+ versions have never been tested, or are known to contain bugs that result
+ in invalid BAM files.
+ - The memory limit is no longer increased for 32-bit JREs by default, as the
+ value used by the pipeline exceeded the maximum for this architecture.
+ - Improved verification of singleton-filtering settings in makefiles.
+ - Reworked the 'sample_pileup' command, to reduce the memory usage for larger
+ regions (e.g. entire chromosomes) by an order of magnitude. Also fixed some
+ inconsistency in the calculation of distance to indels, resulting in some
+ changes in results.
+ - Changed 'gtf_to_bed' to group by the gene biotype, instead of the source.
+
+### Fixed
+ - Fixed a bug preventing new tasks from being started immediately after a
+ task had failed; new tasks would only be started once a task had finished,
+ or no running tasks were left.
+ - Fixed MaxDepth calculation being limited to depths in the range 0 .. 200.
+ - Added workaround for bug in Pysam, which caused parsing of some GTF files
+ to fail if these contained unquoted values (e.g. "exon_number 2;").
+ - Fixed bug causing some tasks to not be re-run if the input file changed.
+ - Fixed off-by-one error for coverages near the end of regions / contigs.
+ - Ensure that the correct 'paleomix' wrapper script is called when invoking
+ the various other tools, even if this is not located in the current PATH.
+ - Parse newer SAMTools / BCFTools version strings, so that a meaningful
+ version check failure can be reported, as these versions are not supported
+ yet due to missing functionality.
+ - Fix potential deadlock in the genotyping tool, which could occur if either
+ of the invoked commands failed to start or crashed / were killed during
+ execution.
+ - Fixed error in which summary files could not be generated if two (or more)
+ prefixes using the same label contained contigs with overlapping names but
+ different sizes.
+ - Fixed problems calculating coverage, depths, and others, when using a
+ user-provided BED without a name column.
+ - Improved termination of child-processes, when the pipeline is interrupted.
+
+### Deprecated
+ - The 'mkfile' command has been renamed to 'makefile' for both pipelines; the
+ old command is still supported, but considered deprecated.
+
+### Removed
+ - Dropped support for the "verbose" terminal output due to excessive
+ verbosity (yes, really). The new default is "running" (previously called
+ "quiet"), which shows a list of currently running nodes at every update.
+
+
+## [1.0.1] - 2014-04-30
+### Added
+ - Add 'paleomix' command, which provides interface for the various tools
+ included in the PALEOMIX pipeline; this reduces the number of executables
+ exposed by the pipeline, and allows for prerequisite checks to be done in
+ one place.
+ - Added warning if HomozygousContigs contains contigs not included in any of
+ the prefixes specified in the makefile.
+
+### Changed
+ - Reworking version checking; add checks for JRE version (1.6+), for GATK
+ (to check that the JRE can run it), and improved error messages for
+ unidentified and / or outdated versions, and reporting of version numbers
+ and requirements.
+ - Dispose of hsperfdata_* folders created by certain JREs when using a
+ custom temporary directory, when running Picard tools.
+ - Cleanup of error-message displayed if Pysam version is outdated.
+ - Ensure that file-handles are closed in the main process before subprocess
+ execution, to ensure that these receive SIGPIPE upon broken pipes.
+ - Improvements to handling of implicit empty lists in makefiles; it is now
+ no longer required to explicitly specify an empty list. Thus, the following
+ is equivalent assuming that the pipeline expects a list:
+ ExplicitEmptyList: []
+ ImplicitEmptyList:
+ - Tweak makefile templates; the phylo makefile now specifies Male/Female
+ genders with chrM and chrX; for the BAM pipeline the ROIs sub-tree and
+ Label is commented out by default, as these are optional.
+ - Reduced start-up time for bigger pipelines.
+
+### Fixed
+ - Fix manifest, ensuring that all files are included in source distribution.
+ - Fix regression in coverage / depths, which would fail if invoked for
+ specific regions of interest.
+ - Fix bug preventing Padding from being set to zero when genotyping.
+
+
+## [1.0.0] - 2014-04-16
+### Changed
+ - Switching to more traditional version-number tracking.
+
+
+
+[Unreleased]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.7...HEAD
+[1.2.7]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.6...v1.2.7
+[1.2.6]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.5...v1.2.6
+[1.2.5]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.4...v1.2.5
+[1.2.4]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.3...v1.2.4
+[1.2.3]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.2...v1.2.3
+[1.2.2]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.1...v1.2.2
+[1.2.1]: https://github.com/MikkelSchubert/paleomix/compare/v1.2.0...v1.2.1
+[1.2.0]: https://github.com/MikkelSchubert/paleomix/compare/v1.1.1...v1.2.0
+[1.1.1]: https://github.com/MikkelSchubert/paleomix/compare/v1.1.0...v1.1.1
+[1.1.0]: https://github.com/MikkelSchubert/paleomix/compare/v1.0.1...v1.1.0
+[1.0.1]: https://github.com/MikkelSchubert/paleomix/compare/v1.0.0...v1.0.1
+[1.0.0]: https://github.com/MikkelSchubert/paleomix/compare/v1.0.0-RC...v1.0.0
+
+
diff --git a/MANIFEST.in b/MANIFEST.in
new file mode 100644
index 0000000..4e60312
--- /dev/null
+++ b/MANIFEST.in
@@ -0,0 +1,19 @@
+include README.rst
+include CHANGES.md
+include MANIFEST.in
+include pylint.conf
+include licenses/gpl.txt
+include licenses/mit.txt
+include paleomix/yaml/CHANGES
+include paleomix/yaml/LICENSE
+include paleomix/yaml/PKG-INFO
+include paleomix/yaml/README
+
+# Examples
+recursive-include paleomix/resources *
+
+# Misc tools
+recursive-include misc *.py *.sh *.yaml
+
+# Tests
+recursive-include tests *
diff --git a/README.rst b/README.rst
new file mode 100644
index 0000000..20b6444
--- /dev/null
+++ b/README.rst
@@ -0,0 +1,11 @@
+*********************
+The PALEOMIX pipeline
+*********************
+
+The PALEOMIX pipeline is a set of pipelines and tools designed to aid the rapid processing of High-Throughput Sequencing (HTS) data, starting from de-multiplexed reads from one or more samples, through sequence processing and alignment, followed by genotyping and phylogenetic inference on the samples. In addition, PALEOMIX aids in metagenomic analysis of the extracts. The pipeline has been designed with ancient DNA (aDNA) in mind, and includes several features especially useful for the a [...]
+
+For a detailed description of the pipeline, please refer to `PALEOMIX <http://geogenetics.ku.dk/publications/paleomix>`_ website and the `documentation <http://paleomix.readthedocs.io/>`_; for questions, bug reports, and/or suggestions, use the `GitHub tracker <https://github.com/MikkelSchubert/paleomix/issues/>`_, or contact Mikkel Schubert at `MSchubert at snm.ku.dk <mailto:MSchubert at snm.ku.dk>`_.
+
+The PALEOMIX pipeline has been published in Nature Protocols; if you make use of (parts of) the pipeline in your work, then please cite
+
+ Schubert M, Ermini L, Sarkissian CD, Jónsson H, Ginolhac A, Schaefer R, Martin MD, Fernández R, Kircher M, McCue M, Willerslev E, and Orlando L. "**Characterization of ancient and modern genomes by SNP detection and phylogenomic and metagenomic analysis using PALEOMIX**". Nat Protoc. 2014 May;9(5):1056-82. doi: `10.1038/nprot.2014.063 <http://dx.doi.org/10.1038/nprot.2014.063>`_. Epub 2014 Apr 10. PubMed PMID: `24722405 <http://www.ncbi.nlm.nih.gov/pubmed/24722405>`_.
diff --git a/bin/bam_pipeline b/bin/bam_pipeline
new file mode 100755
index 0000000..fb40386
--- /dev/null
+++ b/bin/bam_pipeline
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Legacy script for invoking the PALEOMIX command 'bam_pipeline'; main scripts
+are otherwise created by setuptools during the installation.
+"""
+import sys
+
+try:
+ import paleomix
+except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write("""Error importing required PALEOMIX module 'paleomix':
+ - %s
+
+Please make sure that PYTHONPATH points to the location of the 'paleomix'
+module. This may be done permanently by appending the following to your
+~/.bashrc file (if using Bash):
+ export PYTHONPATH=${PYTHONPATH}:/path/to/paleomix/checkout/...
+""" % (error,))
+ sys.exit(1)
+
+if __name__ == '__main__':
+ sys.exit(paleomix.run_bam_pipeline())
diff --git a/bin/bam_rmdup_collapsed b/bin/bam_rmdup_collapsed
new file mode 100755
index 0000000..0472802
--- /dev/null
+++ b/bin/bam_rmdup_collapsed
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Legacy script for invoking the PALEOMIX command "rmdup_collapsed"; main scripts
+are otherwise created by setuptools during the installation.
+"""
+import sys
+
+try:
+ import paleomix
+except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write("""Error importing required PALEOMIX module 'paleomix':
+ - %s
+
+Please make sure that PYTHONPATH points to the location of the 'paleomix'
+module. This may be done permanently by appending the following to your
+~/.bashrc file (if using Bash):
+ export PYTHONPATH=${PYTHONPATH}:/path/to/paleomix/checkout/...
+""" % (error,))
+ sys.exit(1)
+
+if __name__ == '__main__':
+ sys.exit(paleomix.run_rmdup_collapsed())
diff --git a/bin/conv_gtf_to_bed b/bin/conv_gtf_to_bed
new file mode 100755
index 0000000..145e5f6
--- /dev/null
+++ b/bin/conv_gtf_to_bed
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Legacy script for invoking the PALEOMIX command "gtf_to_bed"; main scripts
+are otherwise created by setuptools during the installation.
+"""
+import sys
+
+try:
+ import paleomix
+except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write("""Error importing required PALEOMIX module 'paleomix':
+ - %s
+
+Please make sure that PYTHONPATH points to the location of the 'paleomix'
+module. This may be done permanently by appending the following to your
+~/.bashrc file (if using Bash):
+ export PYTHONPATH=${PYTHONPATH}:/path/to/paleomix/checkout/...
+""" % (error,))
+ sys.exit(1)
+
+if __name__ == '__main__':
+ sys.exit(paleomix.run_gtf_to_bed())
diff --git a/bin/paleomix b/bin/paleomix
new file mode 100755
index 0000000..06b0a33
--- /dev/null
+++ b/bin/paleomix
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Legacy script for invoking PALEOMIX; main scripts are otherwise created by
+setuptools during the installation.
+"""
+import sys
+
+try:
+ import paleomix
+except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write("""Error importing required PALEOMIX module 'paleomix':
+ - %s
+
+Please make sure that PYTHONPATH points to the location of the 'paleomix'
+module. This may be done permanently by appending the following to your
+~/.bashrc file (if using Bash):
+ export PYTHONPATH=${PYTHONPATH}:/path/to/paleomix/checkout/...
+""" % (error,))
+ sys.exit(1)
+
+if __name__ == '__main__':
+ sys.exit(paleomix.run())
diff --git a/bin/phylo_pipeline b/bin/phylo_pipeline
new file mode 100755
index 0000000..4b1c1bd
--- /dev/null
+++ b/bin/phylo_pipeline
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Legacy script for invoking the PALEOMIX command "phylo_pipeline"; main scripts
+are otherwise created by setuptools during the installation.
+"""
+import sys
+
+try:
+ import paleomix
+except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write("""Error importing required PALEOMIX module 'paleomix':
+ - %s
+
+Please make sure that PYTHONPATH points to the location of the 'paleomix'
+module. This may be done permanently by appending the following to your
+~/.bashrc file (if using Bash):
+ export PYTHONPATH=${PYTHONPATH}:/path/to/paleomix/checkout/...
+""" % (error,))
+ sys.exit(1)
+
+if __name__ == '__main__':
+ sys.exit(paleomix.run_phylo_pipeline())
diff --git a/bin/trim_pipeline b/bin/trim_pipeline
new file mode 100755
index 0000000..1a909e2
--- /dev/null
+++ b/bin/trim_pipeline
@@ -0,0 +1,43 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""
+Legacy script for invoking the PALEOMIX command "trim_pipeline"; main scripts
+are otherwise created by setuptools during the installation.
+"""
+import sys
+
+try:
+ import paleomix
+except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write("""Error importing required PALEOMIX module 'paleomix':
+ - %s
+
+Please make sure that PYTHONPATH points to the location of the 'paleomix'
+module. This may be done permanently by appending the following to your
+~/.bashrc file (if using Bash):
+ export PYTHONPATH=${PYTHONPATH}:/path/to/paleomix/checkout/...
+""" % (error,))
+ sys.exit(1)
+
+if __name__ == '__main__':
+ sys.exit(paleomix.run_trim_pipeline())
diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..a2b922f
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,192 @@
+# Makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line.
+SPHINXOPTS =
+SPHINXBUILD = sphinx-build
+PAPER =
+BUILDDIR = _build
+
+# User-friendly check for sphinx-build
+ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1)
+$(error The '$(SPHINXBUILD)' command was not found. Make sure you have Sphinx installed, then set the SPHINXBUILD environment variable to point to the full path of the '$(SPHINXBUILD)' executable. Alternatively you can add the directory with the executable to your PATH. If you don't have Sphinx installed, grab it from http://sphinx-doc.org/)
+endif
+
+# Internal variables.
+PAPEROPT_a4 = -D latex_paper_size=a4
+PAPEROPT_letter = -D latex_paper_size=letter
+ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+# the i18n builder cannot share the environment and doctrees with the others
+I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
+
+.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest coverage gettext
+
+help:
+ @echo "Please use \`make <target>' where <target> is one of"
+ @echo " html to make standalone HTML files"
+ @echo " dirhtml to make HTML files named index.html in directories"
+ @echo " singlehtml to make a single large HTML file"
+ @echo " pickle to make pickle files"
+ @echo " json to make JSON files"
+ @echo " htmlhelp to make HTML files and a HTML help project"
+ @echo " qthelp to make HTML files and a qthelp project"
+ @echo " applehelp to make an Apple Help Book"
+ @echo " devhelp to make HTML files and a Devhelp project"
+ @echo " epub to make an epub"
+ @echo " latex to make LaTeX files, you can set PAPER=a4 or PAPER=letter"
+ @echo " latexpdf to make LaTeX files and run them through pdflatex"
+ @echo " latexpdfja to make LaTeX files and run them through platex/dvipdfmx"
+ @echo " text to make text files"
+ @echo " man to make manual pages"
+ @echo " texinfo to make Texinfo files"
+ @echo " info to make Texinfo files and run them through makeinfo"
+ @echo " gettext to make PO message catalogs"
+ @echo " changes to make an overview of all changed/added/deprecated items"
+ @echo " xml to make Docutils-native XML files"
+ @echo " pseudoxml to make pseudoxml-XML files for display purposes"
+ @echo " linkcheck to check all external links for integrity"
+ @echo " doctest to run all doctests embedded in the documentation (if enabled)"
+ @echo " coverage to run coverage check of the documentation (if enabled)"
+
+clean:
+ rm -rf $(BUILDDIR)/*
+
+html:
+ $(SPHINXBUILD) -b html $(ALLSPHINXOPTS) $(BUILDDIR)/html
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/html."
+
+dirhtml:
+ $(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
+ @echo
+ @echo "Build finished. The HTML pages are in $(BUILDDIR)/dirhtml."
+
+singlehtml:
+ $(SPHINXBUILD) -b singlehtml $(ALLSPHINXOPTS) $(BUILDDIR)/singlehtml
+ @echo
+ @echo "Build finished. The HTML page is in $(BUILDDIR)/singlehtml."
+
+pickle:
+ $(SPHINXBUILD) -b pickle $(ALLSPHINXOPTS) $(BUILDDIR)/pickle
+ @echo
+ @echo "Build finished; now you can process the pickle files."
+
+json:
+ $(SPHINXBUILD) -b json $(ALLSPHINXOPTS) $(BUILDDIR)/json
+ @echo
+ @echo "Build finished; now you can process the JSON files."
+
+htmlhelp:
+ $(SPHINXBUILD) -b htmlhelp $(ALLSPHINXOPTS) $(BUILDDIR)/htmlhelp
+ @echo
+ @echo "Build finished; now you can run HTML Help Workshop with the" \
+ ".hhp project file in $(BUILDDIR)/htmlhelp."
+
+qthelp:
+ $(SPHINXBUILD) -b qthelp $(ALLSPHINXOPTS) $(BUILDDIR)/qthelp
+ @echo
+ @echo "Build finished; now you can run "qcollectiongenerator" with the" \
+ ".qhcp project file in $(BUILDDIR)/qthelp, like this:"
+ @echo "# qcollectiongenerator $(BUILDDIR)/qthelp/PALEOMIX.qhcp"
+ @echo "To view the help file:"
+ @echo "# assistant -collectionFile $(BUILDDIR)/qthelp/PALEOMIX.qhc"
+
+applehelp:
+ $(SPHINXBUILD) -b applehelp $(ALLSPHINXOPTS) $(BUILDDIR)/applehelp
+ @echo
+ @echo "Build finished. The help book is in $(BUILDDIR)/applehelp."
+ @echo "N.B. You won't be able to view it unless you put it in" \
+ "~/Library/Documentation/Help or install it in your application" \
+ "bundle."
+
+devhelp:
+ $(SPHINXBUILD) -b devhelp $(ALLSPHINXOPTS) $(BUILDDIR)/devhelp
+ @echo
+ @echo "Build finished."
+ @echo "To view the help file:"
+ @echo "# mkdir -p $$HOME/.local/share/devhelp/PALEOMIX"
+ @echo "# ln -s $(BUILDDIR)/devhelp $$HOME/.local/share/devhelp/PALEOMIX"
+ @echo "# devhelp"
+
+epub:
+ $(SPHINXBUILD) -b epub $(ALLSPHINXOPTS) $(BUILDDIR)/epub
+ @echo
+ @echo "Build finished. The epub file is in $(BUILDDIR)/epub."
+
+latex:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo
+ @echo "Build finished; the LaTeX files are in $(BUILDDIR)/latex."
+ @echo "Run \`make' in that directory to run these through (pdf)latex" \
+ "(use \`make latexpdf' here to do that automatically)."
+
+latexpdf:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through pdflatex..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+latexpdfja:
+ $(SPHINXBUILD) -b latex $(ALLSPHINXOPTS) $(BUILDDIR)/latex
+ @echo "Running LaTeX files through platex and dvipdfmx..."
+ $(MAKE) -C $(BUILDDIR)/latex all-pdf-ja
+ @echo "pdflatex finished; the PDF files are in $(BUILDDIR)/latex."
+
+text:
+ $(SPHINXBUILD) -b text $(ALLSPHINXOPTS) $(BUILDDIR)/text
+ @echo
+ @echo "Build finished. The text files are in $(BUILDDIR)/text."
+
+man:
+ $(SPHINXBUILD) -b man $(ALLSPHINXOPTS) $(BUILDDIR)/man
+ @echo
+ @echo "Build finished. The manual pages are in $(BUILDDIR)/man."
+
+texinfo:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo
+ @echo "Build finished. The Texinfo files are in $(BUILDDIR)/texinfo."
+ @echo "Run \`make' in that directory to run these through makeinfo" \
+ "(use \`make info' here to do that automatically)."
+
+info:
+ $(SPHINXBUILD) -b texinfo $(ALLSPHINXOPTS) $(BUILDDIR)/texinfo
+ @echo "Running Texinfo files through makeinfo..."
+ make -C $(BUILDDIR)/texinfo info
+ @echo "makeinfo finished; the Info files are in $(BUILDDIR)/texinfo."
+
+gettext:
+ $(SPHINXBUILD) -b gettext $(I18NSPHINXOPTS) $(BUILDDIR)/locale
+ @echo
+ @echo "Build finished. The message catalogs are in $(BUILDDIR)/locale."
+
+changes:
+ $(SPHINXBUILD) -b changes $(ALLSPHINXOPTS) $(BUILDDIR)/changes
+ @echo
+ @echo "The overview file is in $(BUILDDIR)/changes."
+
+linkcheck:
+ $(SPHINXBUILD) -b linkcheck $(ALLSPHINXOPTS) $(BUILDDIR)/linkcheck
+ @echo
+ @echo "Link check complete; look for any errors in the above output " \
+ "or in $(BUILDDIR)/linkcheck/output.txt."
+
+doctest:
+ $(SPHINXBUILD) -b doctest $(ALLSPHINXOPTS) $(BUILDDIR)/doctest
+ @echo "Testing of doctests in the sources finished, look at the " \
+ "results in $(BUILDDIR)/doctest/output.txt."
+
+coverage:
+ $(SPHINXBUILD) -b coverage $(ALLSPHINXOPTS) $(BUILDDIR)/coverage
+ @echo "Testing of coverage in the sources finished, look at the " \
+ "results in $(BUILDDIR)/coverage/python.txt."
+
+xml:
+ $(SPHINXBUILD) -b xml $(ALLSPHINXOPTS) $(BUILDDIR)/xml
+ @echo
+ @echo "Build finished. The XML files are in $(BUILDDIR)/xml."
+
+pseudoxml:
+ $(SPHINXBUILD) -b pseudoxml $(ALLSPHINXOPTS) $(BUILDDIR)/pseudoxml
+ @echo
+ @echo "Build finished. The pseudo-XML files are in $(BUILDDIR)/pseudoxml."
diff --git a/docs/_static/zonkey/incl_ts_0_tree_rooted.png b/docs/_static/zonkey/incl_ts_0_tree_rooted.png
new file mode 100644
index 0000000..1b57a19
Binary files /dev/null and b/docs/_static/zonkey/incl_ts_0_tree_rooted.png differ
diff --git a/docs/_static/zonkey/incl_ts_0_tree_unrooted.png b/docs/_static/zonkey/incl_ts_0_tree_unrooted.png
new file mode 100644
index 0000000..63b01bc
Binary files /dev/null and b/docs/_static/zonkey/incl_ts_0_tree_unrooted.png differ
diff --git a/docs/_static/zonkey/mito_phylo.png b/docs/_static/zonkey/mito_phylo.png
new file mode 100644
index 0000000..7d91379
Binary files /dev/null and b/docs/_static/zonkey/mito_phylo.png differ
diff --git a/docs/acknowledgements.rst b/docs/acknowledgements.rst
new file mode 100644
index 0000000..997b9c9
--- /dev/null
+++ b/docs/acknowledgements.rst
@@ -0,0 +1,8 @@
+================
+Acknowledgements
+================
+
+The PALEOMIX pipeline has been developed by researchers at the `Orlando Group`_ at the `Centre for GeoGenetics`_, University of Copenhagen, Denmark. Its development was supported by the Danish Council for Independent Research, Natural Sciences (FNU); the Danish National Research Foundation (DNRF94); a Marie-Curie Career Integration grant (FP7 CIG-293845); the Lundbeck foundation (R52-A5062).
+
+.. _Orlando Group: http://geogenetics.ku.dk/research_groups/palaeomix_group/
+.. _Centre for GeoGenetics: http://geogenetics.ku.dk/
\ No newline at end of file
diff --git a/docs/bam_pipeline/configuration.rst b/docs/bam_pipeline/configuration.rst
new file mode 100644
index 0000000..261cd40
--- /dev/null
+++ b/docs/bam_pipeline/configuration.rst
@@ -0,0 +1,53 @@
+.. highlight:: ini
+.. _bam_configuration:
+
+
+Configuring the BAM pipeline
+============================
+
+The BAM pipeline exposes a number of options, including the maximum number of threads used, and the maximum number of threads used for individual programs, the location of JAR files, and more. These may be set using the corresponding command-line options (e.g. --max-threads). However, it is also possible to set default values for such options, including on a per-host basis. This is accomplished by executing the following command, in order to generate a configuration file at ~/.paleomix/bam_ [...]
+
+.. code-block:: bash
+
+ $ paleomix bam_pipeline --write-config
+
+
+The resulting file contains a list of options which can be overwritten::
+
+ [Defaults]
+ max_threads = 16
+ log_level = warning
+ jar_root = /home/username/install/jar_root
+ bwa_max_threads = 1
+ progress_ui = quiet
+ temp_root = /tmp/username/bam_pipeline
+ jre_options =
+ bowtie2_max_threads = 1
+ ui_colors = on
+
+.. note::
+ Options in the configuration file correspond directly to command-line options for the BAM pipeline, with two significant differences: The leading dashes (--) are removed and any remaining dashes are changed to underscores (_); as an example, the command-line option --max-threads becomes max\_threads in the configuration file, as shown above.
+
+These values will be used by the pipeline, unless the corresponding option is also supplied on the command-line. I.e. if "max_threads" is set to 4 in the "bam_pipeline.ini" file, but the pipeline is run using "paleomix bam_pipeline --max-threads 10", then the max threads value is set to 10.
+
+.. note::
+ If no value is given for --max-threads in ini-file or on the command-line, then the maximum number of threads is set to the number of CPUs available for the current host.
+
+It is furthermore possible to set specific options depending on the current host-name. Assuming that the pipeline was run on multiple servers sharing a single home folder, one might set the maximum number of threads on a per-server basis as follows::
+
+ [Defaults]
+ max_threads = 32
+ [BigServer]
+ max_threads = 64
+ [SmallServer]
+ max_threads = 16
+
+
+The names used (here "BigServer" and "SmallServer") should correspond to the hostname, i.e. the value returned by the "hostname" command:
+
+.. code-block:: bash
+
+ $ hostname
+ BigServer
+
+Any value set in the section matching the name of the current host will take precedence over the 'Defaults' section, but can still be overridden by specifying the same option on the command-line.
\ No newline at end of file
diff --git a/docs/bam_pipeline/filestructure.rst b/docs/bam_pipeline/filestructure.rst
new file mode 100644
index 0000000..fb83c86
--- /dev/null
+++ b/docs/bam_pipeline/filestructure.rst
@@ -0,0 +1,195 @@
+.. highlight:: Yaml
+.. _bam_filestructure:
+
+File structure
+==============
+
+The following section explains the file structure of the BAM pipeline example project (see :ref:`examples`), which results if that project is executed::
+
+ ExampleProject: # Target name
+ Synthetic_Sample_1: # Sample name
+ ACGATA: # Library 1
+ Lane_1: 000_data/ACGATA_L1_R{Pair}_*.fastq.gz
+ Lane_2:
+ Single: 000_data/ACGATA_L2/reads.singleton.truncated.gz
+ Collapsed: 000_data/ACGATAr_L2/reads.collapsed.gz
+ CollapsedTruncated: 000_data/ACGATA_L2/reads.collapsed.truncated.gz
+
+ GCTCTG: # Library 2
+ Lane_1: 000_data/GCTCTG_L1_R1_*.fastq.gz
+ Lane_2: rCRS: 000_data/GCTCTG_L2.bam
+
+ TGCTCA: # Library 3
+ Options:
+ SplitLanesByFilenames: no
+
+ Lane_1: 000_data/TGCTCA_L1_R1_*.fastq.gz
+ Lane_2: 000_data/TGCTCA_L2_R{Pair}_*.fastq.gz
+
+Once executed, this example is expected to generate the following result files,
+depending on which options are enabled:
+
+* ExampleProject.rCRS.bam
+* ExampleProject.rCRS.bai
+* ExampleProject.rCRS.realigned.bam
+* ExampleProject.rCRS.realigned.bai
+* ExampleProject.rCRS.coverage
+* ExampleProject.rCRS.depths
+* ExampleProject.rCRS.duphist
+* ExampleProject.rCRS.mapDamage
+* ExampleProject.summary
+
+As well as a single folder containing intermediate results:
+
+* ExampleProject/
+
+
+.. warning::
+ Please be aware that the internal file structure of PALEOMIX may change between major revisions (e.g. v1.1 to 1.2), but is not expected to change between minor revisions (v1.1.1 to v1.1.2). Consequently, if you wish to re-run an old project with the PALEOMIX pipeline, it is recommended to either use the same version of PALEOMIX, or remove the folder containing intermediate files before starting (see below), in order to ensure that analyses are re-run from scratch.
+
+
+Primary results
+---------------
+
+These files are the main results generated by the PALEOMIX pipeline:
+
+**ExampleProject.rCRS.bam** and **ExampleProject.rCRS.bai**
+
+    Final BAM file, which has not been realigned using the GATK Indel Realigner, and its index file (.bai), created using "samtools index". If rescaling has been enabled, this BAM will contain reads processed by mapDamage.
+
+**ExampleProject.rCRS.realigned.bam** and **ExampleProject.rCRS.realigned.bai**
+
+    BAM file realigned using the GATK Indel Realigner, and its index file (.bai), created using "samtools index". If rescaling has been enabled, this BAM will contain reads processed by mapDamage.
+
+**ExampleProject.rCRS.mapDamage/**
+
+    Per-library analyses generated using mapDamage2.0. If rescaling is enabled, these folders also include the model files generated for each library. See the `mapDamage2.0 documentation`_ for a description of these files.
+
+**ExampleProject.rCRS.coverage**
+
+ Coverage statistics generated using the 'paleomix coverage' command. These include per sample, per library and per contig / chromosome breakdowns.
+
+**ExampleProject.rCRS.depths**
+
+    Depth-histogram generated using the 'paleomix depths' command. As with the coverage, this information is broken down by sample, library, and contig / chromosome.
+
+**ExampleProject.rCRS.duphist**
+
+    Per-library histograms of PCR duplicates; for use with `preseq`_ [Daley2013]_ to estimate the remaining molecular complexity of these libraries. Please refer to the original PALEOMIX publication [Schubert2014]_ for more information.
+
+**ExampleProject.summary**
+
+    A summary table, which is created for each target if enabled in the makefile. This table contains a summary of the project, including the number / types of reads processed, average coverage, and other statistics broken down by prefix, sample, and library.
+
+.. warning::
+    Some statistics will be missing if pre-trimmed reads are included in the makefile, since PALEOMIX relies on the output from the adapter trimming software to collect these values.
+
+
+Intermediate results
+--------------------
+
+Internally, the BAM pipeline uses a simple file structure which corresponds to the visual structure of the makefile. For each target (in this case "ExampleProject") a folder of the same name is created in the directory in which the makefile is executed. This folder contains a folder containing the trimmed / collapsed reads, and a folder for each prefix (in this case, only "rCRS"), as well as some additional files used in certain analytical steps (see below):
+
+.. code-block:: bash
+
+ $ ls ExampleProject/
+ reads/
+ rCRS/
+ [...]
+
+
+Trimmed reads
+^^^^^^^^^^^^^
+
+Each of these folders in turn contains a directory structure that corresponds to the names of the samples, libraries, and lanes, shown here for Lane_1 in library ACGATA. If the option "SplitLanesByFilenames" is enabled (as shown here), several numbered folders may be created for each lane, using a 3-digit postfix:
+
+.. code-block:: bash
+
+ ExampleProject/
+ reads/
+ Synthetic_Sample_1/
+ ACGATA/
+ Lane_1_001/
+ Lane_1_002/
+ Lane_1_003/
+ [...]
+
+The contents of the lane folders contains the output of AdapterRemoval, with most filenames corresponding to the read-types listed in the makefile under the option "ExcludeReads":
+
+.. code-block:: bash
+
+ $ ls ExampleProject/reads/Synthetic_Sample_1/ACGATA/Lane_1_001/
+ reads.settings # Settings / statistics file generated by AdapterRemoval
+ reads.discarded.bz2 # Low-quality or short reads
+ reads.truncated.bz2 # Single-ended reads following adapter-removal
+ reads.collapsed.bz2 # Paired-ended reads collapsed into single reads
+ reads.collapsed.truncated.bz2 # Collapsed reads trimmed at either termini
+ reads.pair1.truncated.bz2 # The first mate read of paired reads
+ reads.pair2.truncated.bz2 # The second mate read of paired reads
+ reads.singleton.truncated.bz2 # Paired-ended reads for which one mate was discarded
+
+
+If the reads were pre-trimmed (as is the case for Lane_2 of the library ACGATA), then a single file is generated to signal that the reads have been validated (attempting to detect invalid quality scores and/or file formats):
+
+.. code-block:: bash
+
+ $ ls ExampleProject/reads/Synthetic_Sample_1/ACGATA/Lane_2/
+ reads.pretrimmed.validated
+
+The .validated file is an empty file marking the successful validation of pre-trimmed reads. If the validation fails with a false positive, creating this file for the lane in question allows one to bypass the validation step.
+
+
+Mapped reads (BAM format)
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The file-structure used for mapped reads is similar to that described for the trimmed reads, but includes a larger number of files. Using lane "Lane_1" of library "ACGATA" as an example, the following files are created in each folder for that lane, with each type of reads represented (collapsed, collapsedtruncated, paired, and single) depending on the lane type (SE or PE):
+
+.. code-block:: bash
+
+ $ ls ExampleProject/rCRS/Synthetic_Sample_1/ACGATA/Lane_1_001/
+ collapsed.bai # Index file used for accessing the .bam file
+ collapsed.bam # The mapped reads in BAM format
+ collapsed.coverage # Coverage statistics
+ collapsed.validated # Log-file from Picard ValidateSamFile indicating marking that the .bam file has been validated
+ [...]
+
+For each library, two sets of files are created in the folder corresponding to the sample; these correspond to the way in which duplicates are filtered, with one method for "normal" reads (paired and single-ended reads), and one method for "collapsed" reads (taking advantage of the fact that both external coordinates of the mapping are informative). Note however, that "collapsedtruncated" reads are included among normal reads, as at least one of the external coordinates are unreliable fo [...]
+
+.. code-block:: bash
+
+ ExampleProject/
+ rCRS/
+ Synthetic_Sample_1/
+ ACGATA.duplications_checked
+ ACGATA.rmdup.*.bai
+ ACGATA.rmdup.*.bam
+ ACGATA.rmdup.*.coverage
+ ACGATA.rmdup.*.validated
+
+With the exception of the "duplicates_checked" file, these correspond to the files created in the lane folder. The "duplicates_checked" file marks the successful completion of a validation step which attempts to detect data duplication due to the inclusion of the same reads / files multiple times (not PCR duplicates!).
+
+If rescaling is enabled, a set of files is created for each library, containing the BAM file generated using the mapDamage2.0 quality rescaling functionality, but otherwise corresponding to the files described above:
+
+.. code-block:: bash
+
+ ExampleProject/
+ rCRS/
+ Synthetic_Sample_1/
+ ACGATA.rescaled.bai
+ ACGATA.rescaled.bam
+ ACGATA.rescaled.coverage
+ ACGATA.rescaled.validated
+
+Finally, the resulting BAMs for each library (rescaled or not) are merged (optionally using GATK) and validated. This results in the creation of the following files in the target folder:
+
+.. code-block:: bash
+
+ ExampleProject/
+ rCRS.validated # Signifies that the "raw" BAM has been validated
+ rCRS.realigned.validated # Signifies that the realigned BAM has been validated
+ rCRS.intervals # Intervals selected by the GATK IndelRealigner during training
+ rCRS.duplications_checked # Similar to above, but catches duplicates across samples / libraries
+
+
+.. _mapDamage2.0 documentation: http://ginolhac.github.io/mapDamage/\#a7
+.. _preseq: http://smithlabresearch.org/software/preseq/
diff --git a/docs/bam_pipeline/index.rst b/docs/bam_pipeline/index.rst
new file mode 100644
index 0000000..bb8635f
--- /dev/null
+++ b/docs/bam_pipeline/index.rst
@@ -0,0 +1,22 @@
+.. _bam_pipeline:
+
+BAM Pipeline
+============
+
+**Table of Contents:**
+
+.. toctree::
+
+ overview.rst
+ requirements.rst
+ configuration.rst
+ usage.rst
+ makefile.rst
+ filestructure.rst
+
+
+The BAM Pipeline is a pipeline designed for the processing of demultiplexed high-throughput sequencing (HTS) data, primarily that generated from Illumina high-throughput sequencing platforms.
+
+The pipeline carries out trimming of adapter sequences, filtering of low quality reads, merging of overlapping mate-pairs to reduce the error rate, mapping of reads against one or more reference genomes / sequences, filtering of PCR duplicates, analyses of / correction for post-mortem DNA damage, estimation of average coverages and depth-of-coverage histograms, and more. To ensure the correctness of the results, the pipeline invokes frequent validation of intermediate results (*e.g [...]
+
+To allow tailoring of the process to the needs of individual projects, many features may be disabled, and the behavior of most programs can be tweaked to suit the specifics of a given project, down to and including only carrying out trimming of FASTQ reads, to facilitate use in other contexts.
diff --git a/docs/bam_pipeline/makefile.rst b/docs/bam_pipeline/makefile.rst
new file mode 100644
index 0000000..067f8b4
--- /dev/null
+++ b/docs/bam_pipeline/makefile.rst
@@ -0,0 +1,716 @@
+.. highlight:: YAML
+.. _bam_makefile:
+
+Makefile description
+====================
+
+.. contents::
+
+The following sections review the options available in the BAM pipeline makefiles. As described in the :ref:`bam_usage` section, a default makefile may be generated using the 'paleomix bam\_pipeline makefile' command. For clarity, the location of options in subsections is specified by concatenating the names using '\:\:' as a separator. Thus, in the following (simplified example), the 'UseSeed' option (line 13) would be referred to as 'Options \:\: Aligners \:\: BWA \:\: UseSeed':
+
+.. code-block:: yaml
+ :emphasize-lines: 13
+ :linenos:
+
+ Options:
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # May be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+
+
+Specifying command-line options
+-------------------------------
+
+For several programs it is possible to directly specify command-line options; this is accomplished in one of 3 ways; firstly, for command-line options that take a single value, this is accomplished simply by specifying the option and value as any other option. For example, if we wish to supply the option --mm 5 to AdapterRemoval, then we would list it as "--mm: 5" (all other options omitted for brevity)::
+
+ AdapterRemoval:
+ --mm: 5
+
+For options that do not take any values, such as the AdapterRemoval '--trimns' (enabling the trimming of Ns in the reads), these are specified either as "--trimns: ", with the value left blank, or as "--trimns: yes". The following are therefore equivalent::
+
+ AdapterRemoval:
+ --trimns: # Method 1
+ --trimns: yes # Method 2
+
+In some cases the BAM pipeline will enable features by default, but still allow these to be overridden. In those cases, the feature can be disabled by setting the value to 'no' (without quotes), as shown here::
+
+ AdapterRemoval:
+ --trimns: no
+
+If you need to provide the text "yes" or "no" as the value for an option, it is necessary to put these in quotes::
+
+ --my-option: "yes"
+ --my-option: "no"
+
+In some cases it is possible or even necessary to specify an option multiple times. Due to the way YAML works, this is not possible to do so directly. Instead, the pipeline allows multiple instances of the same option by providing these as a list::
+
+ --my-option:
+ - "yes"
+ - "no"
+ - "maybe"
+
+The above will be translated into calling the program in question with the options "--my-option yes --my-option no --my-option maybe".
+
+
+Options section
+---------------
+
+By default, the 'Options' section of the makefile contains the following:
+
+.. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+
+
+Options: General
+^^^^^^^^^^^^^^^^
+
+**Options \:\: Platform**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 7
+ :lines: 7-8
+
    The sequencing platform used to generate the sequencing data; this information is recorded in the resulting BAM file, and may be used by downstream tools. The `SAM/BAM specification`_ lists the valid platforms, which currently include 'CAPILLARY', 'HELICOS', 'ILLUMINA', 'IONTORRENT', 'LS454', 'ONT', 'PACBIO', and 'SOLID'.
+
+**Options \:\: QualityOffset**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 9
+ :lines: 9-13
+
+ The QualityOffset option refers to the starting ASCII value used to encode `Phred quality-scores`_ in user-provided FASTQ files, with the possible values of 33, 64, and 'Solexa'. For most modern data, this will be 33, corresponding to ASCII characters in the range '!' to 'J'. Older data is often encoded using the offset 64, corresponding to ASCII characters in the range '@' to 'h', and more rarely using Solexa quality-scores, which represent a different scheme than Phred scores, and [...]
+
+**Options \:\: SplitLanesByFilenames**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 14
+ :lines: 14-17
+
+ This option influences how the BAM pipeline handles lanes that include multiple files. By default (corresponding to a value of 'yes'), the pipeline will process individual files in parallel, potentially allowing for greater throughput. If set to 'no', all files in a lane are merged during processing, resulting in a single set of trimmed reads per lane. The only effect of this option on the final result is a greater number of read-groups specified in the final BAM files. See the :ref: [...]
+
+ .. warning::
+ This option is deprecated, and will be removed in future versions of PALEOMIX.
+
+
+**Options \:\: CompressionFormat**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 18
+ :lines: 18-19
+
+
+ This option determines which type of compression is carried out on trimmed FASTQ reads; if set to 'gz', reads are gzip compressed, and if set to 'bz2', reads are compressed using bzip2. This option has no effect on the final results, but may be used to trade off space (gz) for some additional runtime (bz2).
+
+ .. warning::
+ This option is deprecated, and may be removed in future versions of PALEOMIX.
+
+
+**Options \:\: PCRDuplicates**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 72
+ :lines: 72-79
+
+ This option determines how the BAM pipeline handles PCR duplicates following the mapping of trimmed reads. At present, 3 possible options are available. The first option is 'filter', which corresponds to running Picard MarkDuplicates and 'paleomix rmdup_collapsed' on the input files, and removing any read determined to be a PCR duplicate; the second option 'mark' functions like the 'filter' option, except that reads are not removed from the output, but instead the read flag is marked [...]
+
+
+Options: Adapter Trimming
+^^^^^^^^^^^^^^^^^^^^^^^^^
+
+The "AdapterRemoval" subsection allows for options that are applied when AdapterRemoval is applied to the FASTQ reads supplied by the user. For a more detailed description of command-line options, please refer to the `AdapterRemoval documentation`_. A few particularly important options are described here:
+
+**Options \:\: AdapterRemoval \:\: --adapter1** and **Options \:\: AdapterRemoval \:\: --adapter2**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 23
+ :lines: 23-25
+
+
+ These two options are used to specify the adapter sequences used to identify and trim reads that contain adapter contamination. Thus, the sequence provided for --adapter1 is expected to be found in the mate 1 reads, and the sequence specified for --adapter2 is expected to be found in the mate 2 reads. In both cases, these should be specified as in the orientation that appear in these files (i.e. it should be possible to grep the files for these, assuming that the reads were long enou [...]
+
+
+ .. note::
+ As of version AdapterRemoval 2.1, it is possible to use multiple threads to speed up trimming of adapter sequences. This is accomplished not by setting the --threads command-line option in the makefile, but by supplying the --adapterremoval-max-threads option to the BAM pipeline itself:
+
+ .. code-block:: bash
+
+ $ paleomix bam_pipeline run makefile.yaml --adapterremoval-max-threads 2
+
+ .. warning::
+ Older versions of PALEOMIX may use the --pcr1 and --pcr2 options instead of --adapter1 and --adapter2; for new projects, using --adapter1 and --adapter2 is strongly recommended, due to the simpler schematics (described above). If your project uses the --pcr1 and --pcr2 options, then refer to the `AdapterRemoval documentation`_ information for how to proceed!
+
+
+**Options \:\: AdapterRemoval \:\: --mm**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 28
+ :lines: 28
+
+ Sets the fraction of mismatches allowed when aligning reads / adapter sequences. If the specified value (MM) is greater than 1, this is calculated as 1 / MM, otherwise the value is used directly. To set, replace the default value as desired::
+
+ --mm: 3 # Maximum mismatch rate of 1 / 3
+ --mm: 5 # Maximum mismatch rate of 1 / 5
+ --mm: 0.2 # Maximum mismatch rate of 1 / 5
+
+
+**Options \:\: AdapterRemoval \:\: --minlength**
+
    The minimum length required after read merging, adapter trimming, and base-quality trimming; resulting reads shorter than this length are discarded, and thereby excluded from further analyses by the pipeline. A value of at least 25 bp is recommended to cut down on the rate of spurious alignments; if possible, a value of 30 bp may be used to greatly reduce the fraction of spurious alignments, with smaller gains for greater minimums [Schubert2012]_.
+
+ .. warning::
        The default value used by PALEOMIX for '--minlength' (25 bp) differs from the default value for AdapterRemoval (15 bp). Thus, if a minimum length of 15 bp is desired, it is necessary to state so explicitly in the makefile; simply commenting out this command-line argument is not sufficient.
+
+
+**Options \:\: AdapterRemoval \:\: --collapse**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 31
+ :lines: 31
+
+ If enabled, AdapterRemoval will attempt to combine overlapping paired-end reads into a single (potentially longer) sequence. This has at least two advantages, namely that longer reads allow for less ambiguous alignments against the target reference genome, and that the fidelity of the overlapping region (potentially the entire read) is improved by selecting the highest quality base when discrepancies are observed. The names of reads thus merged are prefixed with either 'M\_' or 'MT\_ [...]
+
+ --collapse: yes # Option enabled
+ --collapse: no # Option disabled
+
+ .. note::
        This option may be combined with the 'ExcludeReads' option (see below), to either eliminate or select for short inserts, depending on the expectations from the experiment. I.e. for ancient samples, where most inserts should be short enough to allow collapsing (< 2x read length - 11, by default), excluding paired (uncollapsed) and singleton reads may help reduce the fraction of exogenous DNA mapped.
+
+
+**Options \:\: AdapterRemoval \:\: --trimns**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 32
+ :lines: 32
+
+ If set to 'yes' (without quotes), AdapterRemoval will trim uncalled bases ('N') from the 5' and 3' end of the reads. Trimming will stop at the first called base ('A', 'C', 'G', or 'T'). If both --trimns and --trimqualities are enabled, then consecutive stretches of Ns and / or low-quality bases are trimmed from the 5' and 3' end of the reads. To disable, set the option to 'no' (without quotes)::
+
+ --trimns: yes # Option enabled
+ --trimns: no # Option disabled
+
+
+**Options \:\: AdapterRemoval \:\: --trimqualities**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 33
+ :lines: 33
+
+ If set to 'yes' (without quotes), AdapterRemoval will trim low-quality bases from the 5' and 3' end of the reads. Trimming will stop at the first base which is greater than the (Phred encoded) minimum quality score specified using the command-line option --minquality. This value defaults to 2. If both --trimns and --trimqualities are enabled, then consecutive stretches of Ns and / or low-quality bases are trimmed from the 5' and 3' end of the reads. To disable, set the option to 'no' [...]
+
+ --trimqualities: yes # Option enabled
+ --trimqualities: no # Option disabled
+
+
+Options: Short read aligners
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+This section allows selection between supported short read aligners (currently BWA [Li2009a]_ and Bowtie2 [Langmead2012]_), as well as setting options for these, individually:
+
+.. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 35
+ :lines: 35-39
+
+
+To select a mapping program, set the 'Program' option appropriately::
+
+ Program: BWA # Using BWA to map reads
+ Program: Bowtie2 # Using Bowtie2 to map reads
+
+
+Options: Short read aligners - BWA
+""""""""""""""""""""""""""""""""""
+
+ The following options are applied only when running the BWA short read aligner; see the section "Options: Short read aligners" above for how to select this aligner.
+
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 40
+ :lines: 40-54
+
+ **Options \:\: Aligners \:\: BWA \:\: Algorithm**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 42
+ :lines: 42-44
+
+ The mapping algorithm to use; options are 'backtrack' (corresponding to 'bwa aln'), 'bwasw', and 'mem'. Additional command-line options may be specified for these. Algorithms are selected as follows::
+
+ Algorithm: backtrack # 'Backtrack' algorithm, using the command 'bwa aln'
+ Algorithm: bwasw # 'SW' algorithm for long queries, using the command 'bwa bwasw'
+ Algorithm: mem # 'mem' algorithm, using the command 'bwa mem'
+
+ .. warning::
+
+ Alignment algorithms 'bwasw' and 'mem' currently cannot be used with input data that is encoded using QualityOffset 64 or 'Solexa'. This is a limitation of PALEOMIX, and will be resolved in future versions. In the mean time, this can be circumvented by converting FASTQ reads to the standard quality-offset 33, using for example `seqtk`_.
+
+
+ **Options \:\: Aligners \:\: BWA \:\: MinQuality**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 45
+ :lines: 45-46
+
        Specifies the minimum mapping quality of alignments produced by BWA. Any aligned read with a quality score below this value is removed during the mapping process. Note that while unmapped reads have a quality of zero, these are not excluded by a non-zero 'MinQuality' value. To filter unmapped reads, use the option 'FilterUnmappedReads' (see below). To set this option, replace the default value with a desired minimum::
+
+ MinQuality: 0 # Keep all hits
+ MinQuality: 25 # Keep only hits where mapping-quality >= 25
+
+ **Options \:\: Aligners \:\: BWA \:\: FilterUnmappedReads**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 47
+ :lines: 47-48
+
+ Specifies wether or not unmapped reads (reads not aligned to a target sequence) are to be retained in the resulting BAM files. If set to 'yes' (without quotes), all unmapped reads are discarded during the mapping process, while setting the option to 'no' (without quotes) retains these reads in the BAM. By convention, paired reads in which one mate is unmapped are assigned the same chromosome and position, while no chromosome / position are assigned to unmapped single-end reads. T [...]
+
+ FilterUnmappedReads: yes # Remove unmapped reads during alignment
+ FilterUnmappedReads: no # Keep unmapped reads
+
+ **Options \:\: Aligners \:\: BWA \:\: \***
+
+ Additional command-line options may be specified for the selected alignment algorithm, as described in the "Specifying command-line options" section above. See also the examples listed for Bowtie2 below. Note that for the 'backtrack' algorithm, it is only possible to specify options for the 'bwa aln' call.
+
+
+
+Options: Short read aligners - Bowtie2
+""""""""""""""""""""""""""""""""""""""
+ The following options are applied only when running the Bowtie2 short read aligner; see the section "Options: Short read aligners" above for how to select this aligner.
+
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 56
+ :lines: 56-70
+
+ **Options \:\: Aligners \:\: Bowtie2 \:\: MinQuality**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 58
+ :lines: 58-59
+
+ See 'Options \:\: Aligners \:\: BWA \:\: MinQuality' above.
+
+ **Options \:\: Aligners \:\: Bowtie2 \:\: FilterUnmappedReads**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 60
+ :lines: 60-61
+
+ See 'Options \:\: Aligners \:\: BWA \:\: FilterUnmappedReads' above.
+
    **Options \:\: Aligners \:\: Bowtie2 \:\: \***
+
+ Additional command-line options may be specified for Bowtie2, as described in the "Specifying command-line options" section above. Please refer to the `Bowtie2 documentation`_ for more information about available command-line options.
+
+
+Options: mapDamage plots and rescaling
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 80
+ :lines: 80-86
+
+ This subsection is used to specify options for mapDamage2.0, when plotting *post-mortem* DNA damage, when building models of the *post-mortem* damage, and when rescaling quality scores to account for this damage. In order to enable plotting, modeling, or rescaling of quality scores, please see the 'mapDamage' option in the 'Features' section below.
+
+ .. note::
+ It may be worthwhile to tweak mapDamage parameters before building a model of *post-mortem* DNA damage; this may be accomplished by running the pipeline without rescaling, running with the 'mapDamage' feature set to 'plot' (with or without quotes), inspecting the plots generated per-library, and then tweaking parameters as appropriate, before setting 'RescaleQualities' to 'model' (with or without quotes).
+
+ Disabling the construction of the final BAMs may be accomplished by setting the features 'RawBam' and 'RealignedBAM' to 'no' (without quotes) in the 'Features' section (see below), and then setting the desired option to yes again after enabling rescaling and adding the desired options to the mapDamage section.
+
+ Should you wish to change the modeling and rescaling parameters, after having already run the pipeline with RescaleQualities enabled, simply remove the mapDamage files generated for the relevant libraries (see the :ref:`bam_filestructure` section).
+
+ .. warning::
+ Rescaling requires a certain minimum number of C>T and G>A substitutions, before it is possible to construct a model of *post-mortem* DNA damage. If mapDamage fails with an error indicating that "DNA damage levels are too low", then it is necessary to disable rescaling for that library to continue.
+
+
+**Options \:\: mapDamage \:\: --downsample**
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 84
+ :lines: 84-86
+
    By default the BAM pipeline only samples 100k reads for use in constructing mapDamage plots; in our experience, this is sufficient for accurate plots and models. If no downsampling is to be done, this value can be set to 0 to disable this feature::
+
+ --downsample: 100000 # Sample 100 thousand reads
+ --downsample: 1000000 # Sample 1 million reads
+ --downsample: 0 # No downsampling
+
+
+**Options \:\: mapDamage \:\: \***
+
+ Additional command-line options may be supplied to mapDamage, just like the '--downsample' parameter, as described in the "Specifying command-line options" section above. These are used during plotting and rescaling (if enabled).
+
+
+Options: Excluding read-types
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 88
+ :lines: 88-102
+
    During the adapter-trimming and read-merging step, AdapterRemoval will generate a selection of different read types. This option allows certain read-types to be excluded from further analyses. In particular, it may be useful to exclude non-collapsed (paired and singleton) reads when processing (ancient) DNA for which only short inserts are expected, since this may help exclude exogenous DNA. The following read types are currently recognized:
+
+ *Single*
+ Single-end reads; these are the (trimmed) reads generated from supplying single-end FASTQ files to the pipeline.
+
+ *Paired*
+ Paired-end reads; these are the (trimmed) reads generated from supplying paired-end FASTQ files to the pipeline, but covering only the subset of paired reads for which *both* mates were retained, and which were not merged into a single read (if --collapse is set for AdapterRemoval).
+
+ *Singleton*
        Paired-end reads; these are (trimmed) reads generated from supplying paired-end FASTQ files to the pipeline, but covering only those reads in which one of the two mates was discarded due to either the '--maxns', the '--minlength', or the '--maxlength' options supplied to AdapterRemoval. Consequently, these reads are mapped and PCR-duplicate filtered in single-end mode.
+
+ *Collapsed*
+ Paired-end reads, for which the sequences overlapped, and which were consequently merged by AdapterRemoval into a single sequence (enabled by the --collapse command-line option). These sequences are expected to represent the complete insert, and while they are mapped in single-end mode, PCR duplicate filtering is carried out in a manner that treats these as paired reads. Note that all collapsed reads are tagged by prefixing the read name with 'M\_'.
+
+ *CollapsedTruncated*
+ Paired-end reads (like *Collapsed*), which were trimmed due to the '--trimqualities' or the '--trimns' command-line options supplied to AdapterRemoval. Consequently, and as these sequences represent the entire insert, these reads are mapped and PCR-duplicate filtered in single-end mode. Note that all collapsed, truncated reads are tagged by prefixing the read name with 'MT\_'.
+
+ To enable / disable exclusion of a read type, set the value for the appropriate type to 'yes' or 'no' (without quotes)::
+
+ Singleton: no # Singleton reads are NOT excluded
+ Singleton: yes # Singleton reads are excluded
+
+
+Options: Optional features
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+ .. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 104
+ :lines: 104-127
+
+ This section lists several optional features, in particular determining which BAM files and which summary statistics are generated when running the pipeline. Currently, the following options are available:
+
+ *RawBAM*
+ If enabled, the pipeline will generate a final BAM, which is NOT processed using the GATK Indel Realigner (see below), following all other processing steps.
+
+ *RealignedBAM*
+ If enabled, the pipeline will generate a final BAM, which is processed using the GATK Indel Realigner [McKenna2010]_, in order to improve the alignment near indels, by performing a multiple sequence alignment in regions containing putative indels.
+
+ *mapDamage*
+ The 'mapDamage' option accepts four possible values: 'no', 'plot', 'model', and 'rescale'. By default value ('plot'), will cause mapDamage to be run in order to generate simple plots of the *post-mortem* DNA damage rates, as well as base composition plots, and more. If set to 'model', mapDamage will firstly generate the plots described for 'plot', but also construct models of DNA damage parameters, as described in [Jonsson2013]_. Note that a minimum amount of DNA damage is requir [...]
+
+ *Coverage*
+ If enabled, a table summarizing the number of hits, the number of aligned bases, bases inserted, and bases deleted, as well as the mean coverage, is generated for each reference sequence, stratified by sample, library, and contig.
+
+ *Depths*
+ If enabled, a table containing a histogram of the depth of coverage, ranging from 0 to 200, is generated for each reference sequence, stratified by sample, library, and contig. These files may further be used by the Phylogenetic pipeline, in order to automatically select a maximum read depth during SNP calling (see the :ref:`phylo_usage` section for more information).
+
+ *Summary*
+ If enabled, a single summary table will be generated per target, containing information about the number of reads processed, hits and fraction of PCR duplicates (per prefix and per library), and much more.
+
+ *DuplicateHist*
+ If enabled, a histogram of the estimated number of PCR duplicates observed per DNA fragment is generated per library. This may be used with the 'preseq' program in order to estimate the (remaining) complexity of a given library, and thereby direct future sequencing efforts [Daley2013]_.
+
    For a description of where files are placed, refer to the :ref:`bam_filestructure` section. It is possible to run the BAM pipeline without any of these options enabled, and this may be useful in certain cases (if only the statistics or per-library BAMs are needed). To enable / disable a feature, set the value for that feature to 'yes' or 'no' (without quotes)::
+
+ Summary: no # Do NOT generate a per-target summary table
+ Summary: yes # Generate a per-target summary table
+
+
+Prefixes section
+----------------
+
+.. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 129
+ :lines: 129-149
+
+
+Reference genomes used for mapping are specified by listing these (one or more) in the 'Prefixes' section. Each reference genome is associated with a name (used in summary statistics and as part of the resulting filenames), and the path to a FASTA file which contains the reference genome. Several other options are also available, but only the name and the 'Path' value are required, as shown here for several examples::
+
+ # Map of prefixes by name, each having a Path key, which specifies the
+ # location of the BWA/Bowtie2 index, and optional label, and an option
+ # set of regions for which additional statistics are produced.
+ Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ MyPrefix1:
+ # Path to FASTA file containing reference genome; must end with '.fasta'
+ Path: /path/to/genomes/file_1.fasta
+ MyPrefix2:
+ Path: /path/to/genomes/file_2.fasta
+ MyPrefix3:
+ Path: /path/to/genomes/AE008922_1.fasta
+
+Each sample in the makefile is mapped against each prefix, and BAM files are generated according to the enabled 'Features' (see above). In addition to the path, two other options are available per prefix, namely the 'Label' and 'RegionsOfInterest', which are described below.
+
+.. warning::
+ FASTA files used in the BAM pipeline *must* be named with a .fasta file extension. Furthermore, if alignments are to be carried out against the human nuclear genome, chromosomes MUST be ordered by their number for GATK to work! See the `GATK FAQ`_ for more information.
+
+
+Regions of interest
+^^^^^^^^^^^^^^^^^^^
+
+It is possible to specify one or more "regions of interest" for a particular reference genome. Doing so results in the production of coverage and depth tables being generated for those regions (if these features are enabled, see above), as well as additional information in the summary table (if enabled, see above).
+
+Such regions are specified using a BED file containing one or more regions; in particular, the first three columns (name, 0-based start coordinate, and 1-based end coordinate) are required, with the 4th column (the name) being optional. Strand information (the 6th column) is not used, but must still be valid according to the BED format.
+
+If these regions are named, statistics are merged by these names (essentially treating them as pseudo contigs), while regions are merged by contig. Thus, it is important to ensure that names are unique if statistics are desired for every single region individually.
+
+Specifying regions of interest is accomplished by providing a name and a path for each set of regions of interest under the 'RegionsOfInterest' section for a given prefix::
+
+ # Produce additional coverage / depth statistics for a set of
+ # regions defined in a BED file; if no names are specified for the
+ # BED records, results are named after the chromosome / contig.
+ RegionsOfInterest:
+ MyRegions: /path/to/my_regions.bed
+ MyOtherRegions: /path/to/my_other_regions.bed
+
+The following is a simple example of such a BED file, for an alignment against the rCRS (`NC_012920.1`_)::
+
+ NC_012920_1 3306 4262 region_a
+ NC_012920_1 4469 5510 region_b
+ NC_012920_1 5903 7442 region_a
+
+In this case, the resulting tables will contain information about two different regions, namely region\_a (2495 bp, resulting from merging the two individual regions specified), and region\_b (1041 bp). The order of lines in this file does not matter.
+
+
+Adding multiple prefixes
+^^^^^^^^^^^^^^^^^^^^^^^^
+
+In cases where it is necessary to map samples against a large number of reference genomes, it may become impractical to add these to the makefile by hand. To allow such use-cases, it is possible to specify the location of the reference genomes via a path containing wild-cards, and letting the BAM pipeline collect these automatically. For the following example, we assume that we have a folder '/path/to/genomes', which contains our reference genomes:
+
+.. code-block:: bash
+
+ $ ls /path/to/genomes
+ AE000516_2.fasta
+ AE004091_2.fasta
+ AE008922_1.fasta
+ AE008923_1.fasta
+
+To automatically add these (4) reference genomes to the makefile, we would add a prefix as follows::
+
+ # Map of prefixes by name, each having a Path key, which specifies the
+ # location of the BWA/Bowtie2 index, and optional label, and an option
+ # set of regions for which additional statistics are produced.
+ Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ MyGenomes*:
+ # Path to .fasta file containing a set of reference sequences.
+ Path: /path/to/genomes/*.fasta
+
+There are two components to this, namely the name of the pseudo-prefix which *must* end with a star (\*), and the path which may contain one or more wild-cards. If the prefix name does not end with a star, the BAM pipeline will simply treat the path as a regular path. In this particular case, the BAM pipeline will perform the equivalent of 'ls /path/to/genomes/\*.fasta', and then add each file it has located using the filename without extensions as the name of the prefix. In other words, [...]
+
+ # Map of prefixes by name, each having a Path key, which specifies the
+ # location of the BWA/Bowtie2 index, and optional label, and an option
+ # set of regions for which additional statistics are produced.
+ Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ AE000516_2:
+ Path: /path/to/genomes/AE000516_2.fasta
+ AE004091_2:
+ Path: /path/to/genomes/AE004091_2.fasta
+ AE008922_1:
+ Path: /path/to/genomes/AE008922_1.fasta
+ AE008923_1:
+ Path: /path/to/genomes/AE008923_1.fasta
+
+A makefile including such prefixes is executed as any other makefile.
+
+.. note::
+ The name provided for the pseudo-prefix (here 'MyGenomes') is not used by the pipeline, and can instead be used to document the nature of the files being included.
+
+.. warning::
+ Just like regular prefixes, it is required that the filename of the reference genome ends with '.fasta'. However, the pipeline will attempt to add *any* file found using the provided path with wildcards, and care should therefore be taken to avoid including non-FASTA files. For example, if the path '/path/to/genomes/\*' was used instead of '/path/to/genomes/\*.fasta', this would cause the pipeline to abort due to the inclusion of (for example) non-FASTA index files generated at this [...]
+
+
+Prefix labels
+^^^^^^^^^^^^^
+.. code-block:: yaml
+
+ Prefixes:
+ # Uncomment and replace 'NAME_OF_PREFIX' with name of the prefix; this name
+ # is used in summary statistics and as part of output filenames.
+ # NAME_OF_PREFIX:
+ # ...
+
+ # (Optional) Uncomment and replace 'LABEL' with one of 'nuclear',
+ # 'mitochondrial', 'chloroplast', 'plasmid', 'bacterial', or 'viral'.
+ # Label: LABEL
+
+The label option for prefixes allow a prefix to be classified according to one of several categories, currently including 'nuclear', 'mitochondrial', 'chloroplast', 'plasmid', 'bacterial', and 'viral'. This is only used when generating the .summary files (if the 'Summary' feature is enabled), in which the label is used instead of the prefix name, and the results for prefixes with the same label are combined.
+
+.. warning::
+ Labels are deprecated, and will either be removed in future versions of PALEOMIX, or significantly changed.
+
+
+Targets section
+---------------
+.. literalinclude:: makefile.yaml
+ :language: yaml
+ :linenos:
+ :lineno-start: 152
+ :lines: 152-
+
+In the BAM pipeline, the term 'Target' is used to refer not to a particular sample (though in typical usage a target includes just one sample), but rather one or more samples to be processed together to generate a BAM file per prefix (see above). A sample included in a target may likewise contain one or more libraries, for each of which one or more sets of FASTQ reads are specified.
+
+The following simplified example, derived from the makefile constructed as part of :ref:`bam_usage` section exemplifies this:
+
+.. code-block:: yaml
+ :linenos:
+
+ # Target name; all output files uses this name as a prefix
+ MyFilename:
+ # Sample name; used to tag data for segregation in downstream analyses
+ MySample:
+ # Library name; used to tag data for segregation in downstream analyses
+ TGCTCA:
+ # Lane / run names and paths to FASTQ files
+ Lane_1: 000_data/TGCTCA_L1_*.fastq.gz
+ Lane_2: 000_data/TGCTCA_L2_R{Pair}_*.fastq.gz
+
+
+*Target name*
+ The first top section of this target (line 1, 'MyFilename') constitute the target name. This name is used as part of summary statistics and, more importantly, determined the first part of name of files generated as part of the processing of data specified for this target. Thus, in this example all files and folders generated during the processing of this target will start with 'MyFilename'; for example, the summary table normally generated from running the pipeline will be placed in [...]
+
+*Sample name*
+ The subsections listed in the 'Target' section (line 2, 'MySample') constitute the (biological) samples included in this target; in the vast majority of analyses, you will have only a single sample per target, and in that case it is considered good practice to use the same name for both the target and the sample. A single target can, however, contain any number of samples, the data for which are tagged according to the names given in the makefile, using the SAM/BAM readgroup ('RG') tags.
+
+*Library name*
+ The subsections listed in the 'Sample' section (line 3, 'TGCTCA') constitute the sequencing libraries constructed during the extraction and library building for the current sample. For modern samples, there is typically only a single library per sample, but more complex sequencing projects (modern and ancient) may involve any number of libraries constructed from one or more extracts. It is very important that libraries be listed correctly (see below).
+
+ .. warning::
        Note that the BAM pipeline imposes the restriction that each library name specified for a target must be unique, even if these are located in two different samples. This restriction may be removed in future versions of PALEOMIX.
+
+*Lane name*
    The subsections of each library are used to specify the names of individual lanes (sequencing runs), and the location of the corresponding input FASTQ files.
+
+In addition to these target (sub)sections, it is possible to specify 'Options' for individual targets, samples, and libraries, similarly to how this is done globally at the top of the makefile. This is described below.
+
+.. warning::
+ It is very important that lanes are assigned to their corresponding libraries in the makefile; while it is possible to simply record every sequencing run / lane under a single library and run the pipeline like that, this will result in several unintended side effects: Firstly, the BAM pipeline uses the library information to ensure that PCR duplicates are filtered correctly. Wrongly grouping together lanes will result either in the loss of sequences which are not, in fact, PCR duplic [...]
+
+
+
+Including already trimmed reads
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In some cases it is useful to include FASTQ reads that have already been trimmed for adapter sequences. While this is not recommended in general, as it may introduce systematic bias if some data has been processed differently than the remaining FASTQ reads, the BAM pipeline makes it simple to incorporate both 'raw' and trimmed FASTQ reads, and to ensure that these integrate in the pipeline.
+
+To include already trimmed reads, these are specified as values belonging to a lane, using the same names for read-types as in the 'ExcludeReads' option (see above). The following minimal example demonstrates this:
+
+.. code-block:: yaml
+ :linenos:
+
+ MyFilename:
+ MySample:
+ ACGATA:
+ # Regular lane, containing reads that are not already trimmed
+ Lane_1: 000_data/ACGATA_L1_R{Pair}_*.fastq.gz
+
+ # Lane containing pre-trimmed reads of each type
+ Lane_2:
+ # Single-end reads
+ Single: /path/to/single_end_reads.fastq.gz
+
+ # Paired-end reads where one mate has been discarded
+ Singleton: /path/to/singleton_reads.fastq.gz
+
+ # Paired end reads; note that the {Pair} key is required,
+ # just like with raw, paired-end reads
+ Paired: /path/to/paired_end_{Pair}.fastq.gz
+
+ # Paired-end reads merged into a single sequence
+ Collapsed: /path/to/collapsed.fastq.gz
+
+ # Paired-end reads merged into a single sequence, and then truncated
+ CollapsedTruncated: /path/to/collapsed_truncated.fastq.gz
+
+The above examples show how each type of reads are to be listed, but it is not necessary to specify more than a single type of pre-trimmed reads in the makefile.
+
+.. note::
    Including already trimmed reads currently results in the absence of some summary statistics in the .summary file, namely the number of raw reads, as well as trimming statistics, since the BAM pipeline currently relies on AdapterRemoval to collect these statistics.
+
+Overriding global settings
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In addition to the 'Options' section included, by default, at the beginning of every makefile, it is possible to specify / override options at a Target, Sample, and Library level. This allows, for example, that different adapter sequences be specified for each library generated for a sample, or options that should only be applied to a particular sample among several included in a makefile. The following demonstration uses the makefile constructed as part of :ref:`bam_usage` section as the base:
+
+.. code-block:: yaml
+ :linenos:
+ :emphasize-lines: 2-7, 10-14, 20-23
+
+ MyFilename:
+ # These options apply to all samples with this filename
+ Options:
+ # In this example, we override the default adapter sequences
+ AdapterRemoval:
+ --adapter1: AGATCGGAAGAGC
+ --adapter2: AGATCGGAAGAGC
+
+ MySample:
+ # These options apply to libraries 'ACGATA', 'GCTCTG', and 'TGCTCA'
+ Options:
+ # In this example, we assume that FASTQ files for our libraries
+ # include Phred quality scores encoded with offset 64.
+ QualityOffset: 64
+
+ ACGATA:
+ Lane_1: 000_data/ACGATA_L1_R{Pair}_*.fastq.gz
+
+ GCTCTG:
+ # These options apply to 'Lane_1' in the 'GCTCTG' library
+ Options:
+ # It is possible to override options we have previously overridden
+ QualityOffset: 33
+
+ Lane_1: 000_data/GCTCTG_L1_*.fastq.gz
+
+ TGCTCA:
+ Lane_1: 000_data/TGCTCA_L1_*.fastq.gz
+ Lane_2: 000_data/TGCTCA_L2_R{Pair}_*.fastq.gz
+
+
+In this example, we have overwritten options at 3 places:
+
+* The first place (lines 2 - 7) will be applied to *all* samples, libraries, and lanes in this target, unless subsequently overridden. In this example, we have set a new pair of adapter sequences, which we wish to use for these data.
+
+* The second place (lines 10 - 14) are applied to the sample 'MySample' that we have included in this target, and consequently applies to all libraries specified for this sample ('ACGATA', 'GCTCTG', and 'TGCTCA'). In most cases you will only have a single sample, and so it will not make a difference whether or not you override options for the entire target (e.g. lines 3 - 8), or just for that sample (e.g. lines 11-15).
+
+* Finally, the third place (lines 20 - 23) demonstrate how options can be overridden for a particular library. In this example, we have chosen to override an option (for this library only!) we previously overrode for that sample (the 'QualityOffset' option).
+
+.. note:: It is currently not possible to override options for a single lane, it is only possible to override options for all lanes in a library.
+
+.. warning::
+ It is currently not possible to set the 'Features' except in the global 'Options' section at the top of the Makefile; this limitation will be removed in future versions of PALEOMIX.
+
+
+
+.. _AdapterRemoval documentation: https://github.com/MikkelSchubert/adapterremoval
+.. _Bowtie2 documentation: http://bowtie-bio.sourceforge.net/bowtie2/manual.shtml
+.. _GATK FAQ: http://www.broadinstitute.org/gatk/guide/article?id=1204
+.. _NC_012920.1: http://www.ncbi.nlm.nih.gov/nuccore/251831106
+.. _Phred quality-scores: https://en.wikipedia.org/wiki/FASTQ_format#Quality
+.. _SAM/BAM specification: http://samtools.sourceforge.net/SAM1.pdf
+.. _seqtk: https://github.com/lh3/seqtk
diff --git a/docs/bam_pipeline/makefile.yaml b/docs/bam_pipeline/makefile.yaml
new file mode 100644
index 0000000..43056c4
--- /dev/null
+++ b/docs/bam_pipeline/makefile.yaml
@@ -0,0 +1,162 @@
+# Default options.
+# Can also be specified for a set of samples, libraries, and lanes,
+# by including the "Options" hierarchy at the same level as those
+# samples, libraries, or lanes below. This does not include
+# "Features", which may only be specified globally.
+Options:
+ # Sequencing platform, see SAM/BAM reference for valid values
+ Platform: Illumina
+ # Quality offset for Phred scores, either 33 (Sanger/Illumina 1.8+)
+ # or 64 (Illumina 1.3+ / 1.5+). For Bowtie2 it is also possible to
+ # specify 'Solexa', to handle reads on the Solexa scale. This is
+ # used during adapter-trimming and sequence alignment
+ QualityOffset: 33
+ # Split a lane into multiple entries, one for each (pair of) file(s)
+ # found using the search-string specified for a given lane. Each
+ # lane is named by adding a number to the end of the given barcode.
+ SplitLanesByFilenames: yes
+ # Compression format for FASTQ reads; 'gz' for GZip, 'bz2' for BZip2
+ CompressionFormat: bz2
+
+ # Settings for trimming of reads, see AdapterRemoval man-page
+ AdapterRemoval:
+ # Adapter sequences, set and uncomment to override defaults
+# --adapter1: AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG
+# --adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+ # Some BAM pipeline defaults differ from AR defaults;
+ # To override, change these value(s):
+ --mm: 3
+ --minlength: 25
+ # Extra features enabled by default; change 'yes' to 'no' to disable
+ --collapse: yes
+ --trimns: yes
+ --trimqualities: yes
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # May be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+ # Additional command-line options may be specified for the "aln"
+ # call(s), as described below for Bowtie2 below.
+
+ # Settings for mappings performed using Bowtie2
+ Bowtie2:
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Examples of how to add additional command-line options
+# --trim5: 5
+# --trim3: 5
+ # Note that the colon is required, even if no value is specified
+ --very-sensitive:
+ # Example of how to specify multiple values for an option
+# --rg:
+# - CN:SequencingCenterNameHere
+# - DS:DescriptionOfReadGroup
+
+ # Mark / filter PCR duplicates. If set to 'filter', PCR duplicates are
+ # removed from the output files; if set to 'mark', PCR duplicates are
+ # flagged with bit 0x400, and not removed from the output files; if set to
+ # 'no', the reads are assumed to not have been amplified. Collapsed reads
+ # are filtered using the command 'paleomix rmdup_collapsed', while "normal"
+ # reads are filtered using Picard MarkDuplicates.
+ PCRDuplicates: filter
+
+ # Command-line options for mapDamage; note that the long-form
+ # options are expected; --length, not -l, etc. Uncomment the
+ # "mapDamage" line adding command-line options below.
+ mapDamage:
+ # By default, the pipeline will downsample the input to 100k hits
+ # when running mapDamage; remove to use all hits
+ --downsample: 100000
+
+ # Set to 'yes' to exclude a type of trimmed reads from alignment / analysis;
+ # possible read-types reflect the output of AdapterRemoval
+ ExcludeReads:
+ # Exclude single-end reads (yes / no)?
+ Single: no
+ # Exclude non-collapsed paired-end reads (yes / no)?
+ Paired: no
+ # Exclude paired-end reads for which the mate was discarded (yes / no)?
+ Singleton: no
+ # Exclude overlapping paired-ended reads collapsed into a single sequence
+ # by AdapterRemoval (yes / no)?
+ Collapsed: no
+ # Like 'Collapsed', but only for collapsed reads truncated due to the
+ # presence of ambiguous or low quality bases at read termini (yes / no).
+ CollapsedTruncated: no
+
+ # Optional steps to perform during processing.
+ Features:
+ # Generate BAM from the alignments without indel realignment (yes / no)
+ RawBAM: no
+ # Generate indel-realigned BAM using the GATK Indel realigner (yes / no)
+ RealignedBAM: yes
+ # To disable mapDamage, write 'no'; to generate basic mapDamage plots,
+ # write 'plot'; to build post-mortem damage models, write 'model',
+ # and to produce rescaled BAMs, write 'rescale'. The 'model' option
+ # includes the 'plot' output, and the 'rescale' option includes both
+ # 'plot' and 'model' results. All analyses are carried out per library.
+ mapDamage: plot
+ # Generate coverage information for the raw BAM (wo/ indel realignment).
+ # If one or more 'RegionsOfInterest' have been specified for a prefix,
+ # additional coverage files are generated for each alignment (yes / no)
+ Coverage: yes
+ # Generate histogram of number of sites with a given read-depth, from 0
+ # to 200. If one or more 'RegionsOfInterest' have been specified for a
+ # prefix, additional histograms are generated for each alignment (yes / no)
+ Depths: yes
+ # Generate summary table for each target (yes / no)
+ Summary: yes
+ # Generate histogram of PCR duplicates, for use with PreSeq (yes / no)
+ DuplicateHist: no
+
+
+# Map of prefixes by name, each having a Path key, which specifies the
+# location of the BWA/Bowtie2 index, and optional label, and an option
+# set of regions for which additional statistics are produced.
+Prefixes:
+ # Replace 'NAME_OF_PREFIX' with name of the prefix; this name
+ # is used in summary statistics and as part of output filenames.
+ NAME_OF_PREFIX:
+ # Replace 'PATH_TO_PREFIX' with the path to .fasta file containing the
+ # references against which reads are to be mapped. Using the same name
+ # as filename is strongly recommended (e.g. /path/to/Human_g1k_v37.fasta
+ # should be named 'Human_g1k_v37').
+ Path: PATH_TO_PREFIX
+
+ # (Optional) Uncomment and replace 'PATH_TO_BEDFILE' with the path to a
+ # .bed file listing extra regions for which coverage / depth statistics
+ # should be calculated; if no names are specified for the BED records,
+ # results are named after the chromosome / contig. Change 'NAME' to the
+ # name to be used in summary statistics and output filenames.
+# RegionsOfInterest:
+# NAME: PATH_TO_BEDFILE
+
+
+# Mapping targets are specified using the following structure. Uncomment and
+# replace 'NAME_OF_TARGET' with the desired prefix for filenames.
+#NAME_OF_TARGET:
+ # Uncomment and replace 'NAME_OF_SAMPLE' with the name of this sample.
+# NAME_OF_SAMPLE:
+ # Uncomment and replace 'NAME_OF_LIBRARY' with the name of this library.
+# NAME_OF_LIBRARY:
+ # Uncomment and replace 'NAME_OF_LANE' with the name of this lane,
+ # and replace 'PATH_WITH_WILDCARDS' with the path to the FASTQ files
+ # to be trimmed and mapped for this lane (may include wildcards).
+# NAME_OF_LANE: PATH_WITH_WILDCARDS
diff --git a/docs/bam_pipeline/overview.rst b/docs/bam_pipeline/overview.rst
new file mode 100644
index 0000000..ee4c105
--- /dev/null
+++ b/docs/bam_pipeline/overview.rst
@@ -0,0 +1,59 @@
+Overview of analytical steps
+============================
+
+During a typical analysis, the BAM pipeline will proceed through the following steps. Note that the exact order in which each step is carried out during execution is not necessarily as shown below, since the exact steps depend on the user settings, and since the pipeline will automatically run steps as soon as possible:
+
+
+1. Initial steps
+
+ 1. Each prefix (reference sequences in FASTA format) is indexed using either "bwa index" or "bowtie-build", depending on the configuration used.
+
+ 2. Each prefix is indexed using "samtools faidx".
+
+ 3. A sequence dictionary is built for each prefix using Picard BuildSequenceDictionary.jar
+
+2. Preprocessing of reads
+
+ 1. Adapter sequences, low quality bases and ambiguous bases are trimmed; overlapping paired-end reads are merged, and short reads are filtered using AdapterRemoval [Lindgreen2012]_.
+
+3. Mapping of reads
+
+ 1. Processed reads resulting from the adapter-trimming / read-collapsing step above are mapped using the chosen aligner (BWA or Bowtie2). The resulting alignments are tagged using the information specified in the makefile (sample, library, lane, etc.).
+
+ 2. The records of the resulting BAM are updated using "samtools fixmate" to ensure that PE reads contain the correct information about the mate read.
+
+ 3. The BAM is sorted using "samtools sort", indexed using "samtools index" (if required based on the current configuration), and validated using Picard ValidateSamFile.jar.
+
+ 4. Finally, the records are updated using "samtools calmd" to ensure consistent reporting of the number of mismatches relative to the reference genome (BAM tag 'NM').
+
+4. Processing of preexisting BAM files
+
+ 1. Any preexisting BAM files are re-tagged using Picard 'AddOrReplaceReadGroups.jar' to match the tagging of other reads processed by the pipeline.
+
+ 2. The resulting BAM is sorted, updated using "samtools calmd", indexed using "samtools index" (if required), and validated using Picard ValidateSamFile.jar.
+
+5. Filtering of duplicates, rescaling of quality scores, and validation
+
+ 1. If enabled, PCR duplicates are filtered using Picard MarkDuplicates.jar (for SE and PE reads) and "paleomix rmdup_collapsed" (for collapsed reads; see the :ref:`other_tools` section). PCR filtering is carried out per library.
+
+ 2. If "Rescaling" is enabled, quality scores of bases that are potentially the result of *post-mortem* DNA damage are recalculated using mapDamage2.0 [Jonsson2013]_.
+
+ 3. The resulting BAMs are indexed and validated using Picard ValidateSamFile.jar. Mapped reads at each position of the alignments are compared using the query name, sequence, and qualities. If a match is found, it is assumed to represent a duplication of input data (see :ref:`troubleshooting_bam`).
+
+6. Generation of final BAMs
+
+ 1. If the "Raw BAM" feature is enabled, each BAM in the previous step is merged into a final BAM file.
+
+ 2. If the "Realigned BAM" feature is enabled, each BAM generated in the previous step is merged, and GATK IndelRealigner is used to perform local realignment around indels, to improve downstream analyses. The resulting BAM is updated using "samtools calmd" as above.
+
+7. Statistics
+
+ 1. If the "Summary" feature is enabled, a single summary table is generated for each target. This table summarizes the input data in terms of the raw number of reads, the number of reads following filtering / collapsing, the fraction of reads mapped to each prefix, the fraction of reads filtered as duplicates, and more.
+
+ 2. Coverage statistics are calculated for the intermediate and final BAM files using "paleomix coverage", depending on makefile settings. Statistics are calculated genome-wide and for any regions of interest specified by the user.
+
+ 3. Depth histograms are calculated using "paleomix depths", similar to coverage statistics, these statistics are genome-wide and for any regions of interest specified by the user.
+
+ 4. If the "mapDamage" feature or "Rescaling" is enabled, mapDamage plots are generated; if rescaling is enabled, a model of the post-mortem DNA damage is also generated.
+
+ 5. If the "DuplicateHist" feature is enabled, histograms of PCR duplicates are estimated for each library, for use with the 'preseq' tool [Daley2013]_, to estimate the complexity of the libraries.
diff --git a/docs/bam_pipeline/requirements.rst b/docs/bam_pipeline/requirements.rst
new file mode 100644
index 0000000..62152e5
--- /dev/null
+++ b/docs/bam_pipeline/requirements.rst
@@ -0,0 +1,54 @@
+.. highlight:: Bash
+.. _bam_requirements:
+
+
+Software requirements
+=====================
+
+In addition to the requirements listed in the :ref:`installation` section, the BAM pipeline requires that several other pieces of software be installed:
+
+* `AdapterRemoval`_ v2.1+ [Lindgreen2012]_
+* `SAMTools`_ v0.1.18+ [Li2009b]_
+* `Picard Tools`_ v1.124+
+
+The Picard Tools JAR-file (picard.jar) is expected to be located in ~/install/jar_root/ by default, but this behavior may be changed using either the --jar-root command-line option, or via the global configuration file (see section :ref:`bam_configuration`).
+
+Furthermore, one or both of the following sequence aligners must be installed:
+
+ * `Bowtie2`_ v2.1.0+ [Langmead2012]_
+ * `BWA`_ v0.5.9+ or v0.6.2+ or v0.7.9+ [Li2009a]_
+
+In addition, the following packages are used by default, but can be omitted if disabled during runtime:
+
+* `mapDamage`_ 2.0.2+ [Jonsson2013]_
+* `Genome Analysis ToolKit`_ [McKenna2010]_
+
+If mapDamage is used to perform rescaling of post-mortem DNA damage, then the GNU Scientific Library (GSL) and the R packages listed in the mapDamage installation instructions are required; these include 'inline', 'gam', 'Rcpp', 'RcppGSL' and 'ggplot2' (>=0.9.2). Use the following commands to verify that these packages have been correctly installed::
+
+ $ gsl-config
+ Usage: gsl-config [OPTION]
+ ...
+
+ $ mapDamage --check-R-packages
+ All R packages are present
+
+The GATK JAR is only required if the user wishes to carry out local realignment near indels (recommended), and is expected to be placed in the same folder as the Picard Tools JAR (see above).
+
+The example projects included in the PALEOMIX source distribution may be used to test that PALEOMIX and the BAM pipeline have been correctly installed. See the :ref:`examples` section for more information.
+
+In case of errors, please consult the :ref:`troubleshooting` section.
+
+
+Testing the pipeline
+--------------------
+
+An example project is included with the BAM pipeline, and it is recommended to run this project in order to verify that the pipeline and required applications have been correctly installed. See the :ref:`examples` section for a description of how to run this example project.
+
+
+.. _AdapterRemoval: https://github.com/MikkelSchubert/adapterremoval
+.. _Bowtie2: http://bowtie-bio.sourceforge.net/bowtie2/
+.. _BWA: http://bio-bwa.sourceforge.net/
+.. _mapDamage: http://ginolhac.github.io/mapDamage/
+.. _Genome Analysis ToolKit: http://www.broadinstitute.org/gatk/
+.. _SAMTools: https://samtools.github.io
+.. _Picard Tools: http://broadinstitute.github.io/picard/
\ No newline at end of file
diff --git a/docs/bam_pipeline/usage.rst b/docs/bam_pipeline/usage.rst
new file mode 100644
index 0000000..0e80b51
--- /dev/null
+++ b/docs/bam_pipeline/usage.rst
@@ -0,0 +1,520 @@
+.. highlight:: Yaml
+.. _bam_usage:
+
+Pipeline usage
+==============
+
+The following describes, step by step, the process of setting up a project for mapping FASTQ reads against a reference sequence using the BAM pipeline. For a detailed description of the configuration file (makefile) used by the BAM pipeline, please refer to the section :ref:`bam_makefile`, and for a detailed description of the files generated by the pipeline, please refer to the section :ref:`bam_filestructure`.
+
+The BAM pipeline is invoked using either the 'paleomix' command, which offers access to all tools included with PALEOMIX (see section :ref:`other_tools`), or using the (deprecated) alias 'bam_pipeline'. Thus, all commands in the following may take one of the following (equivalent) forms:
+
+.. code-block:: bash
+
+ $ paleomix bam_pipeline [...]
+ $ bam_pipeline [...]
+
+For the purpose of these instructions, we will make use of a tiny FASTQ data set included with PALEOMIX pipeline, consisting of synthetic FASTQ reads simulated against the human mitochondrial genome. To follow along, first create a local copy of the BAM pipeline example data:
+
+.. code-block:: bash
+
+ $ paleomix bam_pipeline example .
+
+This will create a folder named 'bam_pipeline' in the current folder, which contains the example FASTQ reads and a 'makefile' showcasing various features of the BAM pipeline ('000\_makefile.yaml'). We will make use of a subset of the data, but we will not make use of the makefile. The data we will use consists of 3 simulated ancient DNA libraries (independent amplifications), for which either one or two lanes have been simulated:
+
++-------------+------+------+---------------------------------+
+| Library | Lane | Type | Files |
++-------------+------+------+---------------------------------+
+| ACGATA | 1 | PE | 000_data/ACGATA\_L1\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+| GCTCTG | 1 | SE | 000_data/GCTCTG\_L1\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+| TGCTCA | 1 | SE | 000_data/TGCTCA\_L1\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+| | 2 | PE | 000_data/TGCTCA\_L2\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+
+
+.. warning::
+ The BAM pipeline largely relies on the existence of final and intermediate files in order to detect if a given analytical step has been carried out. Therefore, changes made to a makefile after the pipeline has already been run (even if not run to completion) may therefore not cause analytical steps affected by these changes to be re-run. If changes are to be made at such a point, it is typically necessary to manually remove affected intermediate files before running the pipeline agai [...]
+
+
+Creating a makefile
+-------------------
+
+As described in the :ref:`introduction`, the BAM pipeline operates based on 'makefiles', which serve to specify the location and structure of input data (samples, libraries, lanes, etc), and which specify which tasks are to be run and which settings are to be used. The makefiles are written using the human-readable YAML format, which may be edited using any regular text editor.
+
+For a brief introduction to the YAML format, please refer to the :ref:`yaml_intro` section, and for a detailed description of the BAM Pipeline makefile, please refer to section :ref:`bam_makefile`.
+
+To start a new project, we must first generate a makefile template using the following command, which for the purpose of this tutorial we place in the example folder:
+
+.. code-block:: bash
+
+ $ cd bam_pipeline/
+ $ paleomix bam_pipeline mkfile > makefile.yaml
+
+Once you open the resulting file ('makefile.yaml') in your text editor of choice, you will find that BAM pipeline makefiles are split into 3 major sections, representing 1) the default options used for processing the data; 2) the reference genomes against which reads are to be mapped; and 3) sets of input files for one or more samples which is to be processed.
+
+In a typical project, we will need to review the default options, add one or more reference genomes which we wish to target, and list the input data to be processed.
+
+
+Default options
+^^^^^^^^^^^^^^^
+
+The makefile starts with an "Options" section, which is applied to every set of input-files in the makefile unless explicitly overwritten for a given sample (this is described in the :ref:`bam_makefile` section). For the most part, the default values should be suitable for a given project, but special attention should be paid to the following options (colons indicate subsections):
+
+**Options\:Platform**
+
+ The sequencing platform used to generate the sequencing data; this information is recorded in the resulting BAM file, and may be used by downstream tools. The `SAM/BAM specification`_ lists the valid platforms, which currently include 'CAPILLARY', 'HELICOS', 'ILLUMINA', 'IONTORRENT', 'LS454', 'ONT', 'PACBIO', and 'SOLID'.
+
+**Options\:QualityOffset**
+
+ The QualityOffset option refers to the starting ASCII value used to encode `Phred quality-scores`_ in user-provided FASTQ files, with the possible values of 33, 64, and 'Solexa'. For most modern data, this will be 33, corresponding to ASCII characters in the range '!' to 'J'. Older data is often encoded using the offset 64, corresponding to ASCII characters in the range '@' to 'h', and more rarely using Solexa quality-scores, which represent a different scheme than Phred scores, and [...]
+
+.. warning::
+
+ By default, the adapter trimming software used by PALEOMIX expects quality-scores no higher than 41, corresponding to the ASCII character 'J' when encoded using offset 33. If the input-data contains quality-scores greater than this value, then it is necessary to specify the maximum value using the '--qualitymax' command-line option. See below.
+
+.. warning::
+
+ Presently, quality-offsets other than 33 are not supported when using the BWA 'mem' or the BWA 'bwasw' algorithms. To use these algorithms with quality-offset 64 data, it is therefore necessary to first convert these data to offset 33. This can be accomplished using the `seqtk`_ tool.
+
+**Options\:AdapterRemoval\:--adapter1**
+**Options\:AdapterRemoval\:--adapter2**
+
+These two options are used to specify the adapter sequences used to identify and trim reads that contain adapter contamination using AdapterRemoval. Thus, the sequence provided for --adapter1 is expected to be found in the mate 1 reads, and the sequence specified for --adapter2 is expected to be found in the mate 2 reads. In both cases, these should be specified as in the orientation that appear in these files (i.e. it should be possible to grep the files for these, assuming that the rea [...]
+
+
+**Aligners\:Program**
+
+ The short read alignment program to use to map the (trimmed) reads to the reference genome. Currently, users may choose between 'BWA' and 'Bowtie2', with additional options available for each program.
+
+**Aligners\:BWA\:MinQuality** and **Aligners\:Bowtie2\:MinQuality**
+
+ The minimum mapping quality of hits to retain during the mapping process. If this option is set to a non-zero value, any hits with a mapping quality below this value are removed from the resulting BAM file (this option does not apply to unmapped reads). If the final BAM should contain all reads in the input files, this option must be set to 0, and the 'FilterUnmappedReads' option set to 'no'.
+
+**Aligners\:BWA\:UseSeed**
+
+ Enable/disable the use of a seed region when mapping reads using the BWA 'backtrack' alignment algorithm (the default). Disabling this option may yield some improvements in the alignment of highly damaged ancient DNA, at the cost of significantly increasing the running time. As such, this option is not recommended for modern samples [Schubert2012]_.
+
+
+For the purpose of the example project, we need only change a few options. Since the reads were simulated using an Phred score offset of 33, there is no need to change the 'QualityOffset' option, and since the simulated adapter sequences matches the adapters that AdapterRemoval searches for by default, so we do not need to set eiter of '--adapter1' or '--adapter2'. We will, however, use the default mapping program (BWA) and algorithm ('backtrack'), but change the minimum mapping quality [...]
+
+.. code-block:: yaml
+ :emphasize-lines: 12
+ :linenos:
+ :lineno-start: 38
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 30
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Should be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+
+Since the data we will be mapping represents (simulated) ancient DNA, we will furthermore set the UseSeed option to 'no' (line 18), in order to recover a small additional amount of alignments during mapping (c.f. [Schubert2012]_):
+
+.. code-block:: yaml
+ :emphasize-lines: 18
+ :linenos:
+ :lineno-start: 38
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 30
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Should be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: no
+
+Once this is done, we can proceed to specify the location of the reference genome(s) that we wish to map our reads against.
+
+
+Reference genomes (prefixes)
+----------------------------
+
+Mapping is carried out using one or more reference genomes (or other sequences) in the form of FASTA files, which are indexed for use in read mapping (automatically, by the pipeline) using either the "bwa index" or "bowtie2-build" commands. Since sequence alignment index are generated at the location of these files, reference genomes are also referred to as "prefixes" in the documentation. In other words, using BWA as an example, the PALEOMIX pipeline will generate a index (prefix) of th [...]
+
+.. code-block:: bash
+
+ $ bwa index prefixes/my_genome.fa
+
+In addition to the BWA / Bowtie2 index, several other related files are also automatically generated, including a FASTA index file (.fai), and a sequence dictionary (.dict), which are required for various operations of the pipeline. These are similarly located at the same folder as the reference FASTA file. For a more detailed description, please refer to the :ref:`bam_filestructure` section.
+
+.. warning::
+ Since the pipeline automatically carries out indexing of the FASTA files, it therefore requires write-access to the folder containing the FASTA files. If this is not possible, one may simply create a local folder containing symbolic links to the original FASTA file(s), and point the makefile to this location. All automatically generated files will then be placed in this location.
+
+
+Specifying which FASTA file to align sequences against is accomplished by listing these in the "Prefixes" section in the makefile. For example, assuming that we had a FASTA file named "my\_genome.fasta" which is located in the folder "my\_prefixes", the following might be used::
+
+ Prefixes:
+ my_genome:
+ Path: my_prefixes/my_genome.fasta
+
+The name of the prefix (here 'my\_genome') will be used to name the resulting files and in various tables that are generated by the pipeline. Typical names include 'hg19', 'EquCab20', and other standard abbreviations for reference genomes, accession numbers, and the like. Multiple prefixes can be specified, but each name MUST be unique::
+
+ Prefixes:
+ my_genome:
+ Path: my_prefixes/my_genome.fasta
+ my_other_genome:
+ Path: my_prefixes/my_other_genome.fasta
+
+.. warning::
+ FASTA files used in the BAM pipeline *must* be named with a .fasta file extension. Furthermore, if alignments are to be carried out against the human nuclear genome, chromosomes MUST be ordered by their number for GATK to work! See the `GATK FAQ`_ for more information.
+
+In the case of this example project, we will be mapping our data against the revised Cambridge Reference Sequence (rCRS) for the human mitochondrial genome, which is included in the examples folder under '000\_prefixes', as a file named 'rCRS.fasta'. To add it to the makefile, locate the 'Prefixes' section below the 'Options' section, and update it as described above (lines 6 and 8):
+
+.. code-block:: yaml
+ :emphasize-lines: 6,8
+ :linenos:
+ :lineno-start: 125
+
+ # Map of prefixes by name, each having a Path key, which specifies the
+ # location of the BWA/Bowtie2 index, and optional label, and an option
+ # set of regions for which additional statistics are produced.
+ Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ rCRS:
+ # Path to .fasta file containing a set of reference sequences.
+ Path: 000_prefixes/rCRS.fasta
+
+Once this is done, we may specify the input data that we wish the pipeline to process for us.
+
+
+Specifying read data
+--------------------
+
+A single makefile may be used to process one or more samples, to generate one or more BAM files and supplementary statistics. In this project we will only deal with a single sample, which we accomplish by adding creating our own section at the end of the makefile. The first step is to determine the name for the files generated by the BAM pipeline. Specifically, we will specify a name which is prefixed to all output generated for our sample (here named 'MyFilename'), by adding the followi [...]
+
+.. code-block:: yaml
+ :linenos:
+ :lineno-start: 145
+
+ # You can also add comments like these to document your experiment
+ MyFilename:
+
+
+This first name, or grouping, is referred to as the target, and typically corresponds to the name of the sample being processed, though any name may do. The actual sample-name is specified next (it is possible, but uncommon, for a single target to contain multiple samples), and is used both in tables of summary statistics, and recorded in the resulting BAM files. This is accomplished by adding another line below the target name:
+
+.. code-block:: yaml
+ :linenos:
+ :lineno-start: 145
+
+ # You can also add comments like these to document your experiment
+ MyFilename:
+ MySample:
+
+Similarly, we need to specify the name of each library in our dataset. By convention, I often use the index used to construct the library as the library name (which allows for easy identification), but any name may be used for a library, provided that it is unique to that sample. As described near the start of this document, we are dealing with 3 libraries:
+
++-------------+------+------+---------------------------------+
+| Library | Lane | Type | Files |
++-------------+------+------+---------------------------------+
+| ACGATA | 1 | PE | 000_data/ACGATA\_L1\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+| GCTCTG | 1 | SE | 000_data/GCTCTG\_L1\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+| TGCTCA | 1 | SE | 000_data/TGCTCA\_L1\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+| | 2 | PE | 000_data/TGCTCA\_L2\_*.fastq.gz |
++-------------+------+------+---------------------------------+
+
+It is important to correctly specify the libraries, since the pipeline will not only use this information for summary statistics and record it in the resulting BAM files, but will also carry out filtering of PCR duplicates (and other analyses) on a per-library basis. Wrongly grouping together data will therefore result in a loss of useful alignments wrongly identified as PCR duplicates, or, similarly, in the inclusion of reads that should have been filtered as PCR duplicates. The library [...]
+
+.. code-block:: yaml
+ :linenos:
+ :lineno-start: 145
+
+ # You can also add comments like these to document your experiment
+ MyFilename:
+ MySample:
+ ACGATA:
+
+ GCTCTG:
+
+ TGCTCA:
+
+The final step involves specifying the location of the raw FASTQ reads that should be processed for each library, and consists of specifying one or more "lanes" of reads, each of which must be given a unique name. For single-end reads, this is accomplished simply by providing a path (with optional wildcards) to the location of the file(s). For example, for lane 1 of library GCTCTG, the files are located at 000_data/GCTCTG\_L1\_*.fastq.gz:
+
+.. code-block:: bash
+
+ $ ls 000_data/GCTCTG_L1_*.fastq.gz
+ 000_data/GCTCTG_L1_R1_01.fastq.gz
+ 000_data/GCTCTG_L1_R1_02.fastq.gz
+ 000_data/GCTCTG_L1_R1_03.fastq.gz
+
+We simply specify these paths for each of the single-end lanes, here using the lane number to name these (similar to the above, this name is used to tag the data in the resulting BAM file):
+
+.. code-block:: yaml
+ :linenos:
+ :lineno-start: 145
+
+ # You can also add comments like these to document your experiment
+ MyFilename:
+ MySample:
+ ACGATA:
+
+ GCTCTG:
+ Lane_1: 000_data/GCTCTG_L1_*.fastq.gz
+
+ TGCTCA:
+ Lane_1: 000_data/TGCTCA_L1_*.fastq.gz
+
+Specifying the location of paired-end data is slightly more complex, since the pipeline needs to be able to locate both files in a pair. This is accomplished by making the assumption that paired-end files are numbered as either mate 1 or mate 2, as shown here for 4 pairs of files with the common _R1 and _R2 labels:
+
+.. code-block:: bash
+
+ $ ls 000_data/ACGATA_L1_*.fastq.gz
+ 000_data/ACGATA_L1_R1_01.fastq.gz
+ 000_data/ACGATA_L1_R1_02.fastq.gz
+ 000_data/ACGATA_L1_R1_03.fastq.gz
+ 000_data/ACGATA_L1_R1_04.fastq.gz
+ 000_data/ACGATA_L1_R2_01.fastq.gz
+ 000_data/ACGATA_L1_R2_02.fastq.gz
+ 000_data/ACGATA_L1_R2_03.fastq.gz
+ 000_data/ACGATA_L1_R2_04.fastq.gz
+
+Knowing that the files contain a number specifying which file in a pair they correspond to, we can then construct a path that includes the keyword '{Pair}' in place of that number. For the above example, that path would therefore be '000_data/ACGATA\_L1\_R{Pair}_*.fastq.gz' (corresponding to '000_data/ACGATA\_L1\_R[12]_*.fastq.gz'):
+
+.. code-block:: yaml
+ :linenos:
+ :lineno-start: 145
+
+ # You can also add comments like these to document your experiment
+ MyFilename:
+ MySample:
+ ACGATA:
+ Lane_1: 000_data/ACGATA_L1_R{Pair}_*.fastq.gz
+
+ GCTCTG:
+ Lane_1: 000_data/GCTCTG_L1_*.fastq.gz
+
+ TGCTCA:
+ Lane_1: 000_data/TGCTCA_L1_*.fastq.gz
+ Lane_2: 000_data/TGCTCA_L2_R{Pair}_*.fastq.gz
+
+.. note::
+ Note that while the paths given here are relative to the location where the pipeline is run, it is also possible to provide absolute paths, should the files be located in an entirely different location.
+
+.. note::
+ At the time of writing, the PALEOMIX pipeline supports uncompressed, gzipped, and bzipped FASTQ reads. It is not necessary to use any particular file extension for these, as the compression method (if any) is detected automatically.
+
+
+The final makefile
+------------------
+
+Once we've completed the steps described above, the resulting makefile should look like the following, shown here with the modifications that we've made highlighted:
+
+.. code-block:: yaml
+ :emphasize-lines: 49,55,130,132,146-156
+ :linenos:
+
+ # -*- mode: Yaml; -*-
+ # Timestamp: 2016-02-04T10:53:59.906883
+ #
+ # Default options.
+ # Can also be specific for a set of samples, libraries, and lanes,
+ # by including the "Options" hierarchy at the same level as those
+ # samples, libraries, or lanes below. This does not include
+ # "Features", which may only be specific globally.
+ Options:
+ # Sequencing platform, see SAM/BAM reference for valid values
+ Platform: Illumina
+ # Quality offset for Phred scores, either 33 (Sanger/Illumina 1.8+)
+ # or 64 (Illumina 1.3+ / 1.5+). For Bowtie2 it is also possible to
+ # specify 'Solexa', to handle reads on the Solexa scale. This is
+ # used during adapter-trimming and sequence alignment
+ QualityOffset: 33
+ # Split a lane into multiple entries, one for each (pair of) file(s)
+ # found using the search-string specified for a given lane. Each
+ # lane is named by adding a number to the end of the given barcode.
+ SplitLanesByFilenames: yes
+ # Compression format for FASTQ reads; 'gz' for GZip, 'bz2' for BZip2
+ CompressionFormat: bz2
+
+ # Settings for trimming of reads, see AdapterRemoval man-page
+ AdapterRemoval:
+ # Adapter sequences, set and uncomment to override defaults
+ # --adapter1: AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG
+ # --adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+ # Some BAM pipeline defaults differ from AR defaults;
+ # To override, change these value(s):
+ --mm: 3
+ --minlength: 25
+ # Extra features enabled by default; change 'yes' to 'no' to disable
+ --collapse: yes
+ --trimns: yes
+ --trimqualities: yes
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 30
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Should be disabled ("no") for aDNA alignments, as post-mortem
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: no
+ # Additional command-line options may be specified for the "aln"
+ # call(s), as described below for Bowtie2 below.
+
+ # Settings for mappings performed using Bowtie2
+ Bowtie2:
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Examples of how to add additional command-line options
+ # --trim5: 5
+ # --trim3: 5
+ # Note that the colon is required, even if no value is specified
+ --very-sensitive:
+ # Example of how to specify multiple values for an option
+ # --rg:
+ # - CN:SequencingCenterNameHere
+ # - DS:DescriptionOfReadGroup
+
+ # Mark / filter PCR duplicates. If set to 'filter', PCR duplicates are
+ # removed from the output files; if set to 'mark', PCR duplicates are
+ # flagged with bit 0x400, and not removed from the output files; if set to
+ # 'no', the reads are assumed to not have been amplified. Collapsed reads
+ # are filtered using the command 'paleomix rmdup_duplicates', while "normal"
+ # reads are filtered using Picard MarkDuplicates.
+ PCRDuplicates: filter
+
+ # Carry out quality base re-scaling of libraries using mapDamage
+ # This will be done using the options set for mapDamage below
+ RescaleQualities: no
+
+ # Command-line options for mapDamage; note that the long-form
+ # options are expected; --length, not -l, etc. Uncomment the
+ # "mapDamage" line adding command-line options below.
+ mapDamage:
+ # By default, the pipeline will downsample the input to 100k hits
+ # when running mapDamage; remove to use all hits
+ --downsample: 100000
+
+ # Set to 'yes' exclude a type of trimmed reads from alignment / analysis;
+ # possible read-types reflect the output of AdapterRemoval
+ ExcludeReads:
+ Single: no # Single-ended reads / Orphaned paired-ended reads
+ Paired: no # Paired ended reads
+ Singleton: no # Paired reads for which the mate was discarded
+ Collapsed: no # Overlapping paired-ended reads collapsed into a
+ # single sequence by AdapterRemoval
+ CollapsedTruncated: no # Like 'Collapsed', except that the reads
+ # truncated due to the presence ambiguous
+ # bases or low quality bases at read termini.
+
+ # Optional steps to perform during processing
+ Features:
+ RawBAM: no # Generate BAM from the raw libraries (no indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.bam
+ RealignedBAM: yes # Generate indel-realigned BAM using the GATK Indel realigner
+ # Location: {Destination}/{Target}.{Genome}.realigned.bam
+ mapDamage: yes # Generate mapDamage plot for each (unrealigned) library
+ # Location: {Destination}/{Target}.{Genome}.mapDamage/{Library}/
+ Coverage: yes # Generate coverage information for the raw BAM (wo/ indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.coverage
+ Depths: yes # Generate histogram of number of sites with a given read-depth
+ # Location: {Destination}/{Target}.{Genome}.depths
+ Summary: yes # Generate summary table for each target
+ # Location: {Destination}/{Target}.summary
+ DuplicateHist: no # Generate histogram of PCR duplicates, for use with PreSeq
+ # Location: {Destination}/{Target}.{Genome}.duphist/{Library}/
+
+
+ # Map of prefixes by name, each having a Path key, which specifies the
+ # location of the BWA/Bowtie2 index, and optional label, and an option
+ # set of regions for which additional statistics are produced.
+ Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ rCRS:
+ # Path to .fasta file containing a set of reference sequences.
+ Path: 000_prefixes/rCRS.fasta
+
+ # Label for prefix: One of nuclear, mitochondrial, chloroplast,
+ # plasmid, bacterial, or viral. Is used in the .summary files.
+ # Label: ...
+
+ # Produce additional coverage / depth statistics for a set of
+ # regions defined in a BED file; if no names are specified for the
+ # BED records, results are named after the chromosome / contig.
+ # RegionsOfInterest:
+ # NAME: PATH_TO_BEDFILE
+
+
+ # You can also add comments like these to document your experiment
+ MyFilename:
+ MySample:
+ ACGATA:
+ Lane_1: 000_data/ACGATA_L1_R{Pair}_*.fastq.gz
+
+ GCTCTG:
+ Lane_1: 000_data/GCTCTG_L1_*.fastq.gz
+
+ TGCTCA:
+ Lane_1: 000_data/TGCTCA_L1_*.fastq.gz
+ Lane_2: 000_data/TGCTCA_L2_R{Pair}_*.fastq.gz
+
+
+With this makefile in hand, the pipeline may be executed using the following command:
+
+.. code-block:: bash
+
+ $ paleomix bam_pipeline run makefile.yaml
+
+The pipeline will run as many simultaneous processes as there are cores in the current system, but this behavior may be changed by using the '--max-threads' command-line option. Use the '--help' command-line option to view additional options available when running the pipeline. By default, output files are placed in the same folder as the makefile, but this behavior may be changed by setting the '--destination' command-line option. For this project, these files include the following:
+
+.. code-block:: bash
+
+ $ ls -d MyFilename*
+ MyFilename
+ MyFilename.rCRS.coverage
+ MyFilename.rCRS.depths
+ MyFilename.rCRS.mapDamage
+ MyFilename.rCRS.realigned.bai
+ MyFilename.rCRS.realigned.bam
+ MyFilename.summary
+
+The files include a table of the average coverages, a histogram of the per-site coverages (depths), a folder containing one set of mapDamage plots per library, and the final BAM file and its index (the .bai file), as well as a table summarizing the entire analysis. For a more detailed description of the files generated by the pipeline, please refer to the :ref:`bam_filestructure` section; should problems occur during the execution of the pipeline, then please verify that the makefile is [...]
+
+.. note::
+ The first item, 'MyFilename', is a folder containing intermediate files generated while running the pipeline, required due to the many steps involved in a typical analyses, and which also allows for the pipeline to resume should the process be interrupted. This folder will typically take up 3-4x the disk-space used by the final BAM file(s), and can safely be removed once the pipeline has run to completion, in order to reduce disk-usage.
+
+
+.. _SAM/BAM specification: http://samtools.sourceforge.net/SAM1.pdf
+.. _GATK FAQ: http://www.broadinstitute.org/gatk/guide/article?id=1204
+.. _seqtk: https://github.com/lh3/seqtk
+.. _Phred quality-scores: https://en.wikipedia.org/wiki/FASTQ_format#Quality
+.. _AdapterRemoval documentation: https://github.com/MikkelSchubert/adapterremoval
\ No newline at end of file
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..016c7cb
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,286 @@
+# -*- coding: utf-8 -*-
+#
+# PALEOMIX documentation build configuration file, created by
+# sphinx-quickstart on Mon Nov 30 22:47:26 2015.
+#
+# This file is execfile()d with the current directory set to its
+# containing dir.
+#
+# Note that not all possible configuration values are present in this
+# autogenerated file.
+#
+# All configuration values have a default; values that are commented out
+# serve to show the default.
+
+import sys
+import os
+import shlex
+
+# If extensions (or modules to document with autodoc) are in another directory,
+# add these directories to sys.path here. If the directory is relative to the
+# documentation root, use os.path.abspath to make it absolute, like shown here.
+#sys.path.insert(0, os.path.abspath('.'))
+
+# -- General configuration ------------------------------------------------
+
+# If your documentation needs a minimal Sphinx version, state it here.
+#needs_sphinx = '1.0'
+
+# Add any Sphinx extension module names here, as strings. They can be
+# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
+# ones.
+extensions = []
+
+# Add any paths that contain templates here, relative to this directory.
+templates_path = []
+
+# The suffix(es) of source filenames.
+# You can specify multiple suffix as a list of string:
+# source_suffix = ['.rst', '.md']
+source_suffix = '.rst'
+
+# The encoding of source files.
+#source_encoding = 'utf-8-sig'
+
+# The master toctree document.
+master_doc = 'index'
+
+# General information about the project.
+project = u'PALEOMIX'
+copyright = u'2015, Mikkel Schubert'
+author = u'Mikkel Schubert'
+
+# The version info for the project you're documenting, acts as replacement for
+# |version| and |release|, also used in various other places throughout the
+# built documents.
+#
+# The short X.Y version.
+version = u'1.2'
+# The full version, including alpha/beta/rc tags.
+release = u'1.2.7'
+
+# The language for content autogenerated by Sphinx. Refer to documentation
+# for a list of supported languages.
+#
+# This is also used if you do content translation via gettext catalogs.
+# Usually you set "language" from the command line for these cases.
+language = None
+
+# There are two options for replacing |today|: either, you set today to some
+# non-false value, then it is used:
+#today = ''
+# Else, today_fmt is used as the format for a strftime call.
+#today_fmt = '%B %d, %Y'
+
+# List of patterns, relative to source directory, that match files and
+# directories to ignore when looking for source files.
+exclude_patterns = ['_build']
+
+# The reST default role (used for this markup: `text`) to use for all
+# documents.
+#default_role = None
+
+# If true, '()' will be appended to :func: etc. cross-reference text.
+#add_function_parentheses = True
+
+# If true, the current module name will be prepended to all description
+# unit titles (such as .. function::).
+#add_module_names = True
+
+# If true, sectionauthor and moduleauthor directives will be shown in the
+# output. They are ignored by default.
+#show_authors = False
+
+# The name of the Pygments (syntax highlighting) style to use.
+pygments_style = 'sphinx'
+
+# A list of ignored prefixes for module index sorting.
+#modindex_common_prefix = []
+
+# If true, keep warnings as "system message" paragraphs in the built documents.
+#keep_warnings = False
+
+# If true, `todo` and `todoList` produce output, else they produce nothing.
+todo_include_todos = False
+
+
+# -- Options for HTML output ----------------------------------------------
+
+# The theme to use for HTML and HTML Help pages. See the documentation for
+# a list of builtin themes.
+html_theme = 'classic'
+
+# Theme options are theme-specific and customize the look and feel of a theme
+# further. For a list of options available for each theme, see the
+# documentation.
+#html_theme_options = {}
+
+# Add any paths that contain custom themes here, relative to this directory.
+#html_theme_path = []
+
+# The name for this set of Sphinx documents. If None, it defaults to
+# "<project> v<release> documentation".
+#html_title = None
+
+# A shorter title for the navigation bar. Default is the same as html_title.
+#html_short_title = None
+
+# The name of an image file (relative to this directory) to place at the top
+# of the sidebar.
+#html_logo = None
+
+# The name of an image file (within the static path) to use as favicon of the
+# docs. This file should be a Windows icon file (.ico) being 16x16 or 32x32
+# pixels large.
+#html_favicon = None
+
+# Add any paths that contain custom static files (such as style sheets) here,
+# relative to this directory. They are copied after the builtin static files,
+# so a file named "default.css" will overwrite the builtin "default.css".
+html_static_path = ['_static']
+
+# Add any extra paths that contain custom files (such as robots.txt or
+# .htaccess) here, relative to this directory. These files are copied
+# directly to the root of the documentation.
+#html_extra_path = []
+
+# If not '', a 'Last updated on:' timestamp is inserted at every page bottom,
+# using the given strftime format.
+#html_last_updated_fmt = '%b %d, %Y'
+
+# If true, SmartyPants will be used to convert quotes and dashes to
+# typographically correct entities.
+# Disabled as it also converts double-dashes in, for example, command-line
+# options into a single long-dash.
+html_use_smartypants = False
+
+# Custom sidebar templates, maps document names to template names.
+#html_sidebars = {}
+
+# Additional templates that should be rendered to pages, maps page names to
+# template names.
+#html_additional_pages = {}
+
+# If false, no module index is generated.
+#html_domain_indices = True
+
+# If false, no index is generated.
+#html_use_index = True
+
+# If true, the index is split into individual pages for each letter.
+#html_split_index = False
+
+# If true, links to the reST sources are added to the pages.
+#html_show_sourcelink = True
+
+# If true, "Created using Sphinx" is shown in the HTML footer. Default is True.
+#html_show_sphinx = True
+
+# If true, "(C) Copyright ..." is shown in the HTML footer. Default is True.
+#html_show_copyright = True
+
+# If true, an OpenSearch description file will be output, and all pages will
+# contain a <link> tag referring to it. The value of this option must be the
+# base URL from which the finished HTML is served.
+#html_use_opensearch = ''
+
+# This is the file name suffix for HTML files (e.g. ".xhtml").
+#html_file_suffix = None
+
+# Language to be used for generating the HTML full-text search index.
+# Sphinx supports the following languages:
+# 'da', 'de', 'en', 'es', 'fi', 'fr', 'hu', 'it', 'ja'
+# 'nl', 'no', 'pt', 'ro', 'ru', 'sv', 'tr'
+#html_search_language = 'en'
+
+# A dictionary with options for the search language support, empty by default.
+# Now only 'ja' uses this config value
+#html_search_options = {'type': 'default'}
+
+# The name of a javascript file (relative to the configuration directory) that
+# implements a search results scorer. If empty, the default will be used.
+#html_search_scorer = 'scorer.js'
+
+# Output file base name for HTML help builder.
+htmlhelp_basename = 'PALEOMIXdoc'
+
+# -- Options for LaTeX output ---------------------------------------------
+
+latex_elements = {
+# The paper size ('letterpaper' or 'a4paper').
+#'papersize': 'letterpaper',
+
+# The font size ('10pt', '11pt' or '12pt').
+#'pointsize': '10pt',
+
+# Additional stuff for the LaTeX preamble.
+#'preamble': '',
+
+# Latex figure (float) alignment
+#'figure_align': 'htbp',
+}
+
+# Grouping the document tree into LaTeX files. List of tuples
+# (source start file, target name, title,
+# author, documentclass [howto, manual, or own class]).
+latex_documents = [
+ (master_doc, 'PALEOMIX.tex', u'PALEOMIX Documentation',
+ u'Mikkel Schubert', 'manual'),
+]
+
+# The name of an image file (relative to this directory) to place at the top of
+# the title page.
+#latex_logo = None
+
+# For "manual" documents, if this is true, then toplevel headings are parts,
+# not chapters.
+#latex_use_parts = False
+
+# If true, show page references after internal links.
+#latex_show_pagerefs = False
+
+# If true, show URL addresses after external links.
+#latex_show_urls = False
+
+# Documents to append as an appendix to all manuals.
+#latex_appendices = []
+
+# If false, no module index is generated.
+#latex_domain_indices = True
+
+
+# -- Options for manual page output ---------------------------------------
+
+# One entry per manual page. List of tuples
+# (source start file, name, description, authors, manual section).
+man_pages = [
+ (master_doc, 'paleomix', u'PALEOMIX Documentation',
+ [author], 1)
+]
+
+# If true, show URL addresses after external links.
+#man_show_urls = False
+
+
+# -- Options for Texinfo output -------------------------------------------
+
+# Grouping the document tree into Texinfo files. List of tuples
+# (source start file, target name, title, author,
+# dir menu entry, description, category)
+texinfo_documents = [
+ (master_doc, 'PALEOMIX', u'PALEOMIX Documentation',
+ author, 'PALEOMIX', 'TODO',
+ 'Miscellaneous'),
+]
+
+# Documents to append as an appendix to all manuals.
+#texinfo_appendices = []
+
+# If false, no module index is generated.
+#texinfo_domain_indices = True
+
+# How to display URL addresses: 'footnote', 'no', or 'inline'.
+#texinfo_show_urls = 'footnote'
+
+# If true, do not generate a @detailmenu in the "Top" node's menu.
+#texinfo_no_detailmenu = False
diff --git a/docs/examples.rst b/docs/examples.rst
new file mode 100644
index 0000000..5abc5ac
--- /dev/null
+++ b/docs/examples.rst
@@ -0,0 +1,106 @@
+.. _examples:
+
+Example projects and data-sets
+==============================
+
+The PALEOMIX pipeline contains small example projects for the larger pipelines, which are designed to be executed in a short amount of time, and to help verify that the pipelines have been correctly installed.
+
+
+.. _examples_bam:
+
+BAM Pipeline example project
+----------------------------
+
+The example project for the BAM pipeline involves the processing of a small data set consisting of (simulated) ancient sequences derived from the human mitochondrial genome. The runtime of this project on a typical desktop or laptop ranges from around 1 minute to around 1 hour (when building of models of the ancient DNA damage patterns is enabled). To access this example project, use the 'example' command for the bam\_pipeline to copy the project files to a given directory (here, the cur [...]
+
+ $ paleomix bam_pipeline example .
+ $ cd bam_pipeline
+ $ paleomix bam_pipeline run 000_makefile.yaml
+
+By default, this example project includes the recalibration of quality scores for bases that are identified as putative *post-mortem* damage (see [Jonsson2013]_). However, this greatly increases the time needed to run this example. While it is recommended to run this step, this step may be disabled by setting the value of the 'RescaleQualities' option in the '000\_makefile.yaml' file to 'no'.
+
+Before:
+
+.. code-block:: yaml
+ :emphasize-lines: 3
+ :linenos:
+ :lineno-start: 83
+
+ # Carry out quality base re-scaling of libraries using mapDamage
+ # This will be done using the options set for mapDamage below
+ RescaleQualities: yes
+
+After:
+
+.. code-block:: yaml
+ :emphasize-lines: 3
+ :linenos:
+ :lineno-start: 83
+
+ # Carry out quality base re-scaling of libraries using mapDamage
+ # This will be done using the options set for mapDamage below
+ RescaleQualities: no
+
+The output generated by the pipeline is described in the :ref:`bam_filestructure` section. Please see the :ref:`troubleshooting` section if you run into problems running the pipeline.
+
+
+.. _examples_phylo:
+
+Phylogenetic Pipeline example project
+-------------------------------------
+
+The example project for the Phylogenetic pipeline involves the processing and mapping of a small data set consisting of (simulated) sequences derived from the human and primate mitochondrial genome, followed by the genotyping of gene sequences and the construction of a maximum likelihood phylogeny. Since this example project starts from raw reads, it therefore requires that the BAM pipeline has been correctly installed, as described in section :ref:`bam_requirements`). The runtime of thi [...]
+
+To access this example project, use the 'example' command for the phylo\_pipeline to copy the project files to a given directory (here, the current directory), and then run the 'setup.sh' script in the root directory, to generate the data set::
+
+ $ paleomix phylo_pipeline example .
+ $ cd phylo_pipeline
+ $ ./setup.sh
+
+Once the example data has been generated, the two pipelines may be executed::
+
+ $ cd alignment
+ $ bam_pipeline run 000_makefile.yaml
+ $ cd ../phylogeny
+ $ phylo_pipeline genotype+msa+phylogeny 000_makefile.yaml
+
+The output generated by the pipeline is described in the :ref:`phylo_filestructure` section. Please see the :ref:`troubleshooting` section if you run into problems running the pipeline.
+
+
+.. _examples_zonkey:
+
+Zonkey Pipeline example project
+-------------------------------
+
+The example project for the Zonkey pipeline is based on a synthetic hybrid between a Domestic donkey and an Arabian horse (obtained from [Orlando2013]_), using a low number of reads (1200). The runtime of these examples on a typical desktop or laptop ranges from around 30 minutes to around 1 hour, depending on your local configuration.
+
+To access this example project, download the Zonkey reference database (see the 'Prerequisites' section of the :ref:`zonkey_usage` page for instructions), and use the 'example' command for zonkey to copy the project files to a given directory. Here, the current directory is used; to place the example files in a different location, simply replace the '.' with the full path to the desired directory::
+
+ $ paleomix zonkey example database.tar .
+ $ cd zonkey_pipeline
+
+
+The example directory contains 3 BAM files; one containing a nuclear alignment ('nuclear.bam'); one containing a mitochondrial alignment ('mitochondrial.bam'); and one containing a combined nuclear and mitochondrial alignment ('combined.bam'). In addition, a sample table is included which shows how multiple samples may be specified and processed at once. Each of these may be run as follows::
+
+ # Process only the nuclear BAM;
+ # by default, results are saved in 'nuclear.zonkey'
+ $ paleomix zonkey run database.tar nuclear.bam
+
+ # Process only the mitochondrial BAM;
+ # by default, results are saved in 'mitochondrial.zonkey'
+ $ paleomix zonkey run database.tar mitochondrial.bam
+
+ # Process both the nuclear and the mitochondrial BAMs;
+ # note that it is necessary to specify an output directory
+ $ paleomix zonkey run database.tar nuclear.bam mitochondrial.bam results
+
+ # Process both the combined nuclear and the mitochondrial BAM;
+ # by default, results are saved in 'combined.zonkey'
+ $ paleomix zonkey run database.tar combined.bam
+
+ # Process multiple samples; the table corresponds to the four
+ # cases listed above.
+ $ paleomix zonkey run database.tar samples.txt
+
+
+Please see the :ref:`troubleshooting` section if you run into problems running the pipeline. The output generated by the pipeline is described in the :ref:`zonkey_filestructure` section.
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..83aa1c7
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,39 @@
+
+Welcome to PALEOMIX's documentation!
+====================================
+
+The PALEOMIX pipeline is a set of pipelines and tools designed to aid the rapid processing of High-Throughput Sequencing (HTS) data, starting from de-multiplexed reads from one or more samples, through sequence processing and alignment, followed by genotyping and phylogenetic inference on the samples. In addition, PALEOMIX aids in metagenomic analysis of the extracts. The pipeline has been designed with ancient DNA (aDNA) in mind, and includes several features especially useful for the a [...]
+
+If you make use of any part of the PALEOMIX pipeline and/or associated tools, then we ask that you kindly cite [Schubert2014]_.
+
+
+**Table of Contents:**
+
+.. toctree::
+ :maxdepth: 2
+
+ introduction.rst
+ installation.rst
+
+ bam_pipeline/index.rst
+ phylo_pipeline/index.rst
+ zonkey_pipeline/index.rst
+
+ other_tools.rst
+ examples.rst
+
+ troubleshooting/index.rst
+
+ yaml.rst
+ acknowledgements.rst
+ related.rst
+
+ references.rst
+
+
+
+Indices and tables
+==================
+
+* :ref:`genindex`
+* :ref:`search`
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..b2cf81d
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,115 @@
+.. highlight:: Bash
+.. _installation:
+
+
+Installation
+============
+
+The following instructions will install PALEOMIX for the current user, but does not include specific programs required by the pipelines. For pipeline specific instructions, refer to the requirements sections for the :ref:`BAM <bam_requirements>`, the :ref:`Phylogentic <phylo_requirements>`, and the :ref:`Zonkey <zonkey_requirements>` pipeline. The recommended way of installing PALEOMIX is by use of the `pip`_ package manager for Python. If Pip is not installed, then please consult the do [...]
+
+In addition to the `pip`_ package manager for Python, the pipelines require `Python`_ 2.7, and `Pysam`_ v0.8.3+, which in turn requires both Python and libz development files (see the :ref:`troubleshooting_install` section). When installing PALEOMIX using pip, Pysam is automatically installed as well. However, note that installing Pysam requires the zlib and Python 2.7 development files. On Debian based distributions, these may be installed as follows::
+
+ # apt-get install libz-dev python2.7-dev
+
+.. warning::
+ PALEOMIX has been developed for 64 bit systems, and has not been extensively tested on 32 bit systems!
+
+
+Regular installation
+--------------------
+
+The following command will install PALEOMIX, and the Python modules required to run it, for the current user only::
+
+ $ pip install --user paleomix
+
+To perform a system-wide installation, simply remove the --user option, and run as root::
+
+ $ sudo pip install paleomix
+
+To verify that the installation was carried out correctly, run the command 'paleomix'::
+
+ $ paleomix
+ PALEOMIX - pipelines and tools for NGS data analyses.
+ Version: v1.0.1
+
+ Usage: paleomix <command> [options]
+ [...]
+
+If the command fails, then please refer to the :ref:`troubleshooting` section.
+
+
+Self-contained installation
+---------------------------
+
+In some cases, it may be useful to make a self-contained installation of PALEOMIX, *e.g.* on shared servers. This is because Python modules that have been installed system-wide take precedence over user-installed modules (this is a limitation of Python itself), which may cause problems both with PALEOMIX itself, and with its Python dependencies.
+
+This is accomplished using `virtualenv`_ for Python, which may be installed using `pip`_ as follows::
+
+ $ pip install --user virtualenv
+
+or (for a system-wide installation)::
+
+ $ sudo pip install virtualenv
+
+
+The following example installs paleomix in a virtual environment located in *~/install/virtualenvs/paleomix*, but any location may be used::
+
+ $ virtualenv ~/install/virtualenvs/paleomix
+ $ source ~/install/virtualenvs/paleomix/bin/activate
+ $ (paleomix) pip install paleomix
+ $ (paleomix) deactivate
+
+
+Following successful completion of these commands, the paleomix tools will be accessible in the ~/install/virtualenvs/paleomix/bin/ folder. However, as this folder also contains a copy of Python itself, it is not recommended to add it to your PATH. Instead, simply link the paleomix commands to a folder in your PATH. This can, for example, be accomplished as follows::
+
+ $ mkdir ~/bin/
+ $ echo 'export PATH=~/bin:$PATH' >> ~/.bashrc
+ $ ln -s ~/install/virtualenvs/paleomix/bin/paleomix ~/bin/
+
+PALEOMIX also includes a number of optional shortcuts which may be used in place of running 'paleomix <command>' (for example, the command 'bam_pipeline' is equivalent to running 'paleomix bam_pipeline')::
+
+ $ ln -s ~/install/virtualenvs/paleomix/bin/bam_pipeline ~/bin/
+ $ ln -s ~/install/virtualenvs/paleomix/bin/conv_gtf_to_bed ~/bin/
+ $ ln -s ~/install/virtualenvs/paleomix/bin/phylo_pipeline ~/bin/
+ $ ln -s ~/install/virtualenvs/paleomix/bin/bam_rmdup_collapsed ~/bin/
+ $ ln -s ~/install/virtualenvs/paleomix/bin/trim_pipeline ~/bin/
+
+
+Upgrading an existing installation
+----------------------------------
+
+Upgrading an existing installation of PALEOMIX, installed using the methods described above, may also be accomplished using pip. To upgrade a regular installation, simply run pip install with the --upgrade option. For a user installation::
+
+ $ pip install --user --upgrade paleomix
+
+Or for a system-wide installation::
+
+ $ sudo pip install --upgrade paleomix
+
+To upgrade a self-contained installation, simply activate the environment before proceeding::
+
+ $ source ~/install/virtualenvs/paleomix/bin/activate
+ $ (paleomix) pip install --upgrade paleomix
+ $ (paleomix) deactivate
+
+
+Upgrading from PALEOMIX v1.1.x
+------------------------------
+
+When upgrading to v1.2.x or later from version 1.1.x or before, it is necessary to perform a manual installation the first time. This is accomplished by downloading and unpacking the desired version of PALEOMIX from the list of releases, and then invoking setup.py. For example::
+
+ $ wget https://github.com/MikkelSchubert/paleomix/archive/v1.2.4.tar.gz
+ $ tar xvzf v1.2.4.tar.gz
+ $ cd paleomix-1.2.4/
+ # Either for the current user:
+ $ python setup.py install --user
+ # Or, for all users:
+ $ sudo python setup.py install
+
+Once this has been done once, pip may be used to perform future upgrades as described above.
+
+
+.. _pip: https://pip.pypa.io/en/stable/
+.. _Pysam: https://github.com/pysam-developers/pysam/
+.. _Python: http://www.python.org/
+.. _virtualenv: https://virtualenv.readthedocs.org/en/latest/
\ No newline at end of file
diff --git a/docs/introduction.rst b/docs/introduction.rst
new file mode 100644
index 0000000..b4884c4
--- /dev/null
+++ b/docs/introduction.rst
@@ -0,0 +1,21 @@
+.. _introduction:
+
+============
+Introduction
+============
+
+The PALEOMIX pipeline is a set of pipelines and tools designed to enable the rapid processing of High-Throughput Sequencing (HTS) data from modern and ancient samples. Currently, PALEOMIX consists of 2 major pipelines, and one protocol described in [Schubert2014]_, as well as one as of yet unpublished pipeline:
+
+* **The BAM pipeline** operates on de-multiplexed NGS reads, and carries out the steps necessary to produce high-quality alignments against a reference sequence, ultimately outputting one or more annotated BAM files.
+
+* **The Metagenomic pipeline** is a protocol describing how to carry out metagenomic analyses on reads processed by the BAM pipeline, allowing for the characterisation of the metagenomic population of ancient samples. This protocol makes use of tools included with PALEOMIX.
+
+* **The Phylogenetic pipeline** carries out genotyping, multiple sequence alignment, and phylogenetic inference on a set of regions derived from BAM files (e.g. produced using the BAM Pipeline).
+
+* **The Zonkey Pipeline** is a smaller, experimental pipeline, for the detection of F1 hybrids in equids, based on low coverage nuclear genomes (as few as thousands of aligned reads) and (optionally) mitochondrial DNA.
+
+All pipelines operate through a mix of standard bioinformatics tools, including SAMTools [Li2009b]_, BWA [Li2009a]_, and more, as well as custom scripts written to support the pipelines. The automated pipelines have been designed to run analytical in parallel steps where possible, and to run with minimal user-intervention. To guard against failed steps and to allow easy debugging of failures, all analyses are run in individual temporary folders, all output is logged (though only retained [...]
+
+In order to faciliate automatic execution, and to ensure that analyses are documented and can be replicated easily, the BAM and the Phylogenetic Pipelines make use of configuration files (hence-forth "makefiles") in `YAML`_ format ; these are text files which describe a project in terms of input files, settings for programs run as part of the pipeline, and which steps to run. For an overview of the YAML format, refer to the included introduction to :ref:`yaml_intro`, or to the official ` [...]
+
+.. _YAML: http://www.yaml.org
diff --git a/docs/other_tools.rst b/docs/other_tools.rst
new file mode 100644
index 0000000..a69e72d
--- /dev/null
+++ b/docs/other_tools.rst
@@ -0,0 +1,119 @@
+.. _other_tools:
+
+Other tools
+===========
+
+On top of the pipelines described in the major sections of the documentation, the pipeline comes bundled with several other, smaller tools, all accessible via the 'paleomix' command. These tools are (briefly) described in this section.
+
+
+paleomix cleanup
+----------------
+
+.. TODO:
+.. paleomix cleanup -- Reads SAM file from STDIN, and outputs sorted,
+.. tagged, and filter BAM, for which NM and MD
+ tags have been updated.
+
+paleomix coverage
+-----------------
+
+.. TODO:
+.. paleomix coverage -- Calculate coverage across reference sequences
+.. or regions of interest.
+
+paleomix depths
+---------------
+
+.. TODO:
+.. paleomix depths -- Calculate depth histograms across reference
+.. sequences or regions of interest.
+
+paleomix duphist
+----------------
+
+.. TODO:
+.. paleomix duphist -- Generates PCR duplicate histogram; used with
+.. the 'Preseq' tool.
+
+paleomix rmdup_collapsed
+------------------------
+
+.. TODO:
+.. paleomix rmdup_collapsed -- Filters PCR duplicates for collapsed paired-
+.. ended reads generated by the AdapterRemoval
+ tool.
+
+paleomix genotype
+-----------------
+
+.. TODO:
+.. paleomix genotype -- Creates bgzipped VCF for a set of (sparse) BED
+.. regions, or for entire chromosomes / contigs
+.. using SAMTools / BCFTools.
+
+paleomix gtf_to_bed
+-------------------
+
+.. TODO:
+.. paleomix gtf_to_bed -- Convert GTF file to BED files grouped by
+.. feature (coding, RNA, etc).
+
+
+paleomix sample_pileup
+----------------------
+
+.. TODO:
+.. paleomix sample_pileup -- Randomly sample sites in a pileup to generate a
+.. FASTA sequence.
+
+.. warning::
+ This tool is deprecated, and will be removed in future versions of PALEOMIX.
+
+
+paleomix vcf_filter
+-------------------
+
+.. TODO:
+.. paleomix vcf_filter -- Quality filters for VCF records, similar to
+.. 'vcfutils.pl varFilter'.
+
+
+paleomix vcf_to_fasta
+---------------------
+.. The 'paleomix vcf\_to\_fasta' command is used to generate FASTA sequences from a VCF file, based either on a set of BED coordinates provided by the user, or for the entire genome covered by the VCF file. By default, heterozygous SNPs are represented using IUPAC codes; if a haploidized sequence is desired, random sampling of heterozygous sites may be enabled.
+
+
+paleomix cat
+------------
+
+The 'paleomix cat' command provides a simple wrapper around the commands 'cat', 'gzip', and 'bzip2', calling each as appropriate depending on the files listed on the command-line. This tool is primarily used in order to allow the on-the-fly decompression of input for various programs that do not support both gzip and bzip2 compressed input.
+
+**Usage:**
+
+ usage: paleomix cat [options] files
+
+ positional arguments:
+ files One or more input files; these may be uncompressed,
+ compressed using gzip, or compressed using bzip2.
+
+ optional arguments:
+ -h, --help show this help message and exit
+ --output OUTPUT Write output to this file; by default, output
+ written to STDOUT.
+
+
+**Example:**
+
+.. code-block:: bash
+
+ $ echo "Simple file" > file1.txt
+ $ echo "Gzip'ed file" | gzip > file2.txt.gz
+ $ echo "Bzip2'ed file" | bzip2 > file3.txt.bz2
+ $ paleomix cat file1.txt file2.txt.gz file3.txt.bz2
+ Simple file
+ Gzip'ed file
+ Bzip2'ed file
+
+.. warning:
+
+ The 'paleomix cat' command works by opening the input files sequentially, identifying the compression scheme, and then calling the appropriate command. Therefore this command only works on regular files, but not on (named) pipes.
diff --git a/docs/phylo_pipeline/configuration.rst b/docs/phylo_pipeline/configuration.rst
new file mode 100644
index 0000000..24899e0
--- /dev/null
+++ b/docs/phylo_pipeline/configuration.rst
@@ -0,0 +1,8 @@
+.. highlight:: ini
+.. _phylo_configuration:
+
+
+Configuring the phylogenetic pipeline
+=====================================
+
+TODO
\ No newline at end of file
diff --git a/docs/phylo_pipeline/filestructure.rst b/docs/phylo_pipeline/filestructure.rst
new file mode 100644
index 0000000..87f00fd
--- /dev/null
+++ b/docs/phylo_pipeline/filestructure.rst
@@ -0,0 +1,7 @@
+.. highlight:: Yaml
+.. _phylo_filestructure:
+
+File structure
+==============
+
+TODO
\ No newline at end of file
diff --git a/docs/phylo_pipeline/index.rst b/docs/phylo_pipeline/index.rst
new file mode 100644
index 0000000..75cae21
--- /dev/null
+++ b/docs/phylo_pipeline/index.rst
@@ -0,0 +1,27 @@
+.. _phylo_pipeline:
+
+Phylogenetic Pipeline
+=====================
+
+**Table of Contents:**
+
+.. toctree::
+
+ overview.rst
+ requirements.rst
+ configuration.rst
+ usage.rst
+ makefile.rst
+ filestructure.rst
+
+
+.. warning::
+
+ This section of the documentation is currently undergoing a complete rewrite, and may therefore be incomplete in places.
+
+
+The Phylogenetic Pipeline is a pipeline designed for processing of (one or more) BAMs, in order to carry out genotyping of a set of regions of interest. Following genotyping, multiple sequence alignment may optionally be carried out (this is required if indels were called), and phylogenetic inference may be done on the regions of interest, using a supermatrix approach through ExaML.
+
+Regions of interest, as defined for the Phylogenetic pipeline, are simply any set of regions in a reference sequence, and may span anything from a few short genomic regions, to the complete exome of complex organisms (tens of thousands of genes), and even entire genomes.
+
+The Phylogenetic pipeline is designed for ease of use in conjunction with the BAM pipeline, but can be used on arbitrary BAM files, provided that these follow the expected naming scheme (see the :ref:`phylo_usage` section).
diff --git a/docs/phylo_pipeline/makefile.rst b/docs/phylo_pipeline/makefile.rst
new file mode 100644
index 0000000..d8cd73f
--- /dev/null
+++ b/docs/phylo_pipeline/makefile.rst
@@ -0,0 +1,10 @@
+.. highlight:: Bash
+.. _phylo_makefile:
+
+Makefile description
+====================
+
+TODO
+
+
+TODO: Describe how to use 'MaxDepth: auto' with custom region, by creating new
\ No newline at end of file
diff --git a/docs/phylo_pipeline/overview.rst b/docs/phylo_pipeline/overview.rst
new file mode 100644
index 0000000..0cfa950
--- /dev/null
+++ b/docs/phylo_pipeline/overview.rst
@@ -0,0 +1,21 @@
+Overview of analytical steps
+============================
+
+During a typical analysis, the Phylogenetic pipeline will proceed through the following steps.
+
+
+1. Genotyping
+
+ 1. SNPs are called on the provided regions using SAMTools, and the resulting SNPs are filtered using the 'paleomix vcf_filter' tool.
+
+ 2. FASTA sequences are constructed for the regions of interest, using the filtered SNPs generated above, one FASTA file per set of regions and per sample.
+
+2. Multiple sequence alignment
+
+ 1. Per-sample files generated in step 1 are collected, and used to build unaligned multi-FASTA files, one per region of interest.
+
+ 2. If enabled, multiple-sequence alignment is carried out on these files using MAFFT, to generate aligned multi-FASTA files.
+
+3. Phylogenetic inference
+
+ Following construction of (aligned) multi-FASTA sequences, phylogenetic inference may be carried out using a partitioned maximum likelihood approach via ExaML.
\ No newline at end of file
diff --git a/docs/phylo_pipeline/requirements.rst b/docs/phylo_pipeline/requirements.rst
new file mode 100644
index 0000000..053ad7a
--- /dev/null
+++ b/docs/phylo_pipeline/requirements.rst
@@ -0,0 +1,51 @@
+.. highlight:: Bash
+.. _phylo_requirements:
+
+
+Software requirements
+=====================
+
+Depending on the parts of the Phylogenetic pipeline used, different programs are required. The following lists which programs are required for each pipeline, as well as the minimum version required:
+
+
+Genotyping
+----------
+
+* `SAMTools <http://samtools.sourceforge.net>`_ v0.1.18+ [Li2009b]_
+* `Tabix`_ v0.2.5
+
+Both the 'tabix' and the 'bgzip' executable from the Tabix package must be installed.
+
+
+Multiple Sequence Alignment
+---------------------------
+
+* `MAFFT`_ v7+ [Katoh2013]_
+
+Note that the pipeline requires the algorithm-specific MAFFT commands (e.g. 'mafft-ginsi', 'mafft-fftnsi') to be available. These are automatically created by the 'make install' command.
+
+
+Phylogenetic Inference
+----------------------
+
+* `RAxML`_ v7.3.2+ [Stamatakis2006]_
+* `ExaML`_ v1.0.5+
+
+The pipeline expects a single-threaded binary named 'raxmlHPC' for RAxML. The pipeline expects the ExaML binary to be named 'examl', and the parser binary to be named 'parse-examl'. Compiling and running ExaML requires an MPI implementation (e.g. `OpenMPI`_), even if ExaML is run single-threaded. On Debian and Debian-based distributions, this may be accomplished by installing 'mpi-default-dev' and 'mpi-default-bin'.
+
+Both programs offer a variety of makefiles suited for different server-architectures and use-cases. If in doubt, use the Makefile.SSE3.gcc makefiles, which are compatible with most modern systems::
+
+ $ make -f Makefile.SSE3.gcc
+
+
+Testing the pipeline
+--------------------
+
+An example project is included with the phylogenetic pipeline, and it is recommended to run this project in order to verify that the pipeline and required applications have been correctly installed. See the :ref:`examples` section for a description of how to run this example project.
+
+
+.. _Tabix: http://samtools.sourceforge.net/
+.. _MAFFT: http://mafft.cbrc.jp/alignment/software/
+.. _RAxML: https://github.com/stamatak/standard-RAxML
.. _ExaML: https://github.com/stamatak/ExaML
+.. _OpenMPI: http://www.open-mpi.org/
\ No newline at end of file
diff --git a/docs/phylo_pipeline/usage.rst b/docs/phylo_pipeline/usage.rst
new file mode 100644
index 0000000..6d9a8e5
--- /dev/null
+++ b/docs/phylo_pipeline/usage.rst
@@ -0,0 +1,219 @@
+.. highlight:: Yaml
+.. _phylo_usage:
+
+Pipeline usage
+==============
+
+The 'phylo\_pipeline mkfile' command can be used to create a makefile template, as with the 'bam\_pipeline mkfile' command (see section :ref:`bam_usage`). This makefile is used to specify the samples, regions of interest (to be analysed), and options for the various programs:
+
+.. code-block:: bash
+
+ $ phylo_pipeline mkfile > makefile.yaml
+
+Note that filenames are not specified explicitly with this pipeline, but are instead inferred from the names of samples, prefixes, etc. as described below.
+
+To execute the pipeline, a command corresponding to the step to be invoked is used (see below):
+
+.. code-block:: bash
+
+ $ phylo_pipeline <STEP> [OPTIONS] <MAKEFILE>
+
+
+Samples
+-------
+
+The phylogenetic pipeline expects a number of samples to be specified. Each sample has a name, a gender, and a genotyping method::
+
+ Samples:
+ <GROUP>:
+ SAMPLE_NAME:
+ Gender: ...
+ Genotyping Method: ...
+
+Gender is required, and is used to filter SNPs at homozygous sex chromosomes (e.g. chrX and chrY for male humans). Any names may be used, and can simply be set to e.g. 'NA' in case this feature is not used.
+
+The genotyping method is either "SAMTools" for the default genotyping procedure using samtools mpileup | bcftools view, or "Random Sampling" to sample one random nucleotide in the pileup at each position. This key may be left out to use the default (SAMTools) method.
+
+Groups are optional, and may be used either for the sake of the reader, or to specify a group of samples in lists of samples, e.g. when excluding samples from a subsequent step, when filtering singletons, or when rooting phylogenetic trees (see below).
+
+For a given sample with name S, and a prefix with name P, the pipeline will expect files to be located at ./data/samples/*S*.*P*.bam, or at ./data/samples/*S*.*P*.realigned.bam if the "Realigned" option is enabled (see below).
+
+
+Regions of interest
+-------------------
+
+Analysis is carried out for a set of "Regions of Interest", which is defined as a set of named regions specified using BED files::
+
+    RegionsOfInterest:
+      NAME:
+        Prefix: NAME_OF_PREFIX
+        Realigned: yes/no
+        ProteinCoding: yes/no
+        IncludeIndels: yes/no
+
+The options 'ProteinCoding' and 'IncludeIndels' take the values 'yes' and 'no' (without quotation marks), and determine the behavior when calling indels. If 'IncludeIndels' is set to yes, indels are included in the consensus sequence, and if 'ProteinCoding' is set to yes, only indels that are a multiple of 3bp long are included.
+
+The name and the prefix determines the location of the expected BED file and the FASTA file for the prefix: For a region of interest named R, and a prefix named P, the pipeline will expect the BED file to be located at ./data/regions/P.R.bed. The prefix file is expected to be located at ./data/prefixes/P.fasta
+
+
+Genotyping
+----------
+
+Genotyping is done either by random sampling of positions, or by building a pileup using samtools and calling SNPs / indels using bcftools. The command used for full genotyping is similar to the following command:
+
+.. code-block:: bash
+
+ $ samtools mpileup [OPTIONS] | bcftools view [OPTIONS] -
+
+In addition, SNPs / indels are filtered using the script 'vcf_filter', which is included with the pipeline. This script implements the filters found in "vcfutils.pl varFilter", with some additions.
+
+Options for either method, including for both the "samtools mpileup" and the "bcftools view" commands, are set using the **Genotyping** section of the makefile, and may be set for all regions of interest (default behavior) or for each set of regions of interest::
+
+ Genotyping:
+ Defaults:
+ ...
+
+The 'Defaults' key specifies that the options given here apply to all regions of interest; in addition to this key, the name of each set of regions of interest may be used, to set specific values for one set of regions vs. another set. Thus, assuming regions of interest 'ROI\_a' and 'ROI\_b', options may be set as follows::
+
+ Genotyping:
+ Defaults:
+ ...
+
+ ROI_a:
+ ...
+
+ ROI_b:
+ ...
+
+For each set of regions of interest named ROI, the final settings are derived by first taking the Defaults, and then overwriting values using the value taken from the ROI section (if one such exists). The following shows how to change values in Defaults for a single ROI::
+
+ Genotyping:
+ Defaults:
+ --switch: value_a
+
+ ROI_N:
+ --switch: value_b
+
+In the above, all ROI except "ROI\_N" will use the switch with 'value\_a', while "ROI\_N" will use 'value\_b'. Executing the 'genotyping' step is described below.
+
+Finally, note the "Padding" option; this option specifies a number of bases to include around each interval in a set of regions of interest. The purpose of this padding is to allow filtering of SNPs based on the distance from indels, in the case where the indels are outside the intervals themselves.
+
+
+Multiple sequence alignment
+---------------------------
+
+Multiple sequence alignment (MSA) is currently carried out using MAFFT, if enabled. Note that it is still necessary to run the MSA command (see below), even if the multiple sequence alignment itself is disabled (for example in the case where indels are not called in the genotyping step). This is because the MSA step is responsible for generating both the unaligned multi-FASTA files, and the aligned multi-FASTA files. It is necessary to run the 'genotyping' step prior to running the MSA s [...]
+
+It is possible to select among the various MAFFT algorithms using the "Algorithm" key, and additionally to specify command-line options for the selected algorithm::
+
+ MultipleSequenceAlignment:
+ Defaults:
+ Enabled: yes
+
+ MAFFT:
+ Algorithm: G-INS-i
+ --maxiterate: 1000
+
+Currently supported algorithms are as follows (as described on the `MAFFT website`_):
+
+* mafft - The basic program (mafft)
+* auto - Equivalent to command 'mafft --auto'
+* fft-ns-1 - Equivalent to the command 'fftns --retree 1'
+* fft-ns-2 - Equivalent to the command 'fftns'
+* fft-ns-i - Equivalent to the command 'fftnsi'
+* nw-ns-i - Equivalent to the command 'nwnsi'
+* l-ins-i - Equivalent to the command 'linsi'
+* e-ins-i - Equivalent to the command 'einsi'
+* g-ins-i - Equivalent to the command 'ginsi'
+
+Command line options are specified as key / value pairs, as shown above for the --maxiterate option, in the same manner that options are specified for the genotyping section. Similarly, options may be specified for all regions of interest ("Defaults"), or using the name of a set of regions of interest, in order to set options for only that set of regions.
+
+
+Phylogenetic inference
+----------------------
+
+Maximum likelihood phylogenetic inference is carried out using the ExaML program. A phylogeny consists of a named (subsets of) one or more sets of regions of interest, with individual regions partitioned according to some scheme, and rooted on the midpoint of the tree or one or more taxa::
+
+ PhylogeneticInference:
+ PHYLOGENY_NAME:
+ ExcludeSamples:
+ ...
+
+ RootTreesOn: ...
+
+ PerGeneTrees: yes/no
+
+ RegionsOfInterest:
+ REGIONS_NAME:
+ Partitions: "111"
+ SubsetRegions: SUBSET_NAME
+
+ ExaML:
+ Replicates: 1
+ Bootstraps: 100
+ Model: GAMMA
+
+A phylogeny may exclude any number of samples specified in the Samples region, by listing them under the ExcludeSamples. Furthermore, if groups have been specified for samples (e.g. "<name>"), then these may be used as a short-hand for multiple samples, by using the name of the group including the angle-brackets ("<name>").
+
+Rooting is determined using the RootTreesOn option; if this option is not set, then the resulting trees are rooted on the midpoint of the tree, otherwise they are rooted on the clade containing all the given taxa. If the taxa do not form a monophyletic clade, then rooting is done on the monophyletic clade containing the given taxa.
+
+If PerGeneTrees is set to yes, a tree is generated for every named feature in the regions of interest (e.g. genes), otherwise a super-matrix is created based on all features in all the regions of interest specified for the current phylogeny.
+
+Each phylogeny may include one or more sets of regions of interest, specified under the "RegionsOfInterest", using the same names as those specified under the Project section. Each feature in a set of regions of interest may be partitioned according to position specific scheme. These are specified using a string of numbers (0-9), which is then applied across the selected sequences to determine the model for each position. For example, for the scheme "012" and a given nucleotide sequence, [...]
+
+ AAGTAACTTCACCGTTGTGA
+ 01201201201201201201
+
+Thus, the default partitioning scheme ("111") will use the same model for all positions, and is equivalent to the schemes "1", "11", "1111", etc. Similarly, a per-codon-position scheme may be accomplished using "123" or a similar string. In addition to numbers, the character 'X' may be used to exclude specific positions in an alignment. E.g. to exclude the third position in codons, use a string like "11X". Alternatively, Partitions may be set to 'no' to disable per-feature partitions; in [...]
+
+The options in the ExaML section specify the number of bootstrap trees to generate from the original supermatrix, the number of phylogenetic inferences to carry out on the original supermatrix (replicates), and the model used (c.f. the ExaML documentation).
+
+The name (PHYLOGENY_NAME) is used to determine the location of the resulting files, by default ./results/TITLE/phylogenies/NAME/. If per-gene trees are generated, an additional two folders are used, namely the name of the regions of interest, and the name of the gene / feature.
+
+For each phylogeny, the following files are generated:
+
+**alignments.partitions**:
+
+ List of partitions used when running ExaML; the "reduced" file contains the same list of partitions, after empty columns (no called bases) have been excluded.
+
+**alignments.phy**:
+
+ Super-matrix used in conjunction with the list of partitions when calling ExaML; the "reduced" file contains the same matrix, but with empty columns (no bases called) excluded.
+
+**alignments.reduced.binary**:
+
+ The reduced supermatrix / partitions in the binary format used by ExaML.
+
+
+**bootstraps.newick**:
+
+ List of bootstrap trees in Newick format, rooted as specified in the makefile.
+
+
+**replicates.newick**:
+
+ List of phylogenies inferred from the full super-matrix, rooted as specified in the makefile.
+
+**replicates.support.newick**:
+
+ List of phylogenies inferred from the full super-matrix, with support values calculated using the bootstrap trees, and rooted as specified in the makefile.
+
+
+Executing the pipeline
+----------------------
+
+The phylogenetic pipeline is executed similarly to the BAM pipeline, except that a command is provided for each step ('genotyping', 'msa', and 'phylogeny'):
+
+.. code-block:: bash
+
+ $ phylo_pipeline <COMMAND> [OPTIONS] <MAKEFILE>
+
+Thus, to execute the genotyping step, the following command is used:
+
+.. code-block:: bash
+
+ $ phylo_pipeline genotyping [OPTIONS] <MAKEFILE>
+
+In addition, it is possible to run multiple steps by joining these with the plus-symbol. To run both the 'genotyping' and 'msa' step at the same time, use the following command:
+
+.. code-block:: bash
+
+ $ phylo_pipeline genotyping+msa [OPTIONS] <MAKEFILE>
+
+
+.. _MAFFT website: http://mafft.cbrc.jp/alignment/software/algorithms/algorithms.html
\ No newline at end of file
diff --git a/docs/references.rst b/docs/references.rst
new file mode 100644
index 0000000..cccf9d5
--- /dev/null
+++ b/docs/references.rst
@@ -0,0 +1,28 @@
+==========
+References
+==========
+
+.. [Alexander2009] Alexander *et al*. "**Fast model-based estimation of ancestry in unrelated individuals**". Genome Res. 2009 Sep;19(9):1655-64. doi:10.1101/gr.094052.109
+.. [Chang2015] Chang *et al*. "**Second-generation PLINK: rising to the challenge of larger and richer datasets**". Gigascience. 2015 Feb 25;4:7. doi: 10.1186/s13742-015-0047-8
+.. [Daley2013] Daley and Smith. "**Predicting the molecular complexity of sequencing libraries**". Nat Methods. 2013 Apr;10(4):325-7. doi:10.1038/nmeth.2375
+.. [DerSarkissian2015] Der Sarkissian *et al*. "**Evolutionary Genomics and Conservation of the Endangered Przewalski's Horse**". Curr Biol. 2015 Oct 5;25(19):2577-83. doi:10.1016/j.cub.2015.08.032
+.. [Jonsson2013] Jónsson *et al*. "**mapDamage2.0: fast approximate Bayesian estimates of ancient DNA damage parameters**". Bioinformatics. 2013 Jul 1;29(13):1682-4. doi:10.1093/bioinformatics/btt193
+.. [Jonsson2014] Jónsson *et al*. "**Speciation with gene flow in equids despite extensive chromosomal plasticity**". PNAS. 2014 Dec 30;111(52):18655-60. doi:10.1073/pnas.1412627111
+.. [Katoh2013] Katoh and Standley. "**MAFFT multiple sequence alignment software version 7: improvements in performance and usability**". Mol Biol Evol. 2013 Apr;30(4):772-80. doi:10.1093/molbev/mst010
+.. [Langmead2012] Langmead and Salzberg. "**Fast gapped-read alignment with Bowtie 2**". Nat Methods. 2012 Mar 4;9(4):357-9. doi:10.1038/nmeth.1923
+.. [Li2009a] Li and Durbin. "**Fast and accurate short read alignment with Burrows-Wheeler transform**". Bioinformatics. 2009 Jul 15;25(14):1754-60. doi:10.1093/bioinformatics/btp324
+.. [Li2009b] Li *et al*. "**The Sequence Alignment/Map format and SAMtools**". Bioinformatics. 2009 Aug 15;25(16):2078-9. doi:10.1093/bioinformatics/btp352
+.. [Lindgreen2012] Lindgreen. "**AdapterRemoval: Easy Cleaning of Next Generation Sequencing Reads**", BMC Research Notes. 2012 Jul 5:337.
+.. [McKenna2010] McKenna *et al*. "**The Genome Analysis Toolkit: a MapReduce framework for analyzing next-generation DNA sequencing data**". Genome Res. 2010 Sep;20(9):1297-303. doi:10.1101/gr.107524.110
+.. [Orlando2013] Orlando *et al*. "**Recalibrating Equus evolution using the genome sequence of an early Middle Pleistocene horse**". Nature. 2013 Jul; 499(7456):74-78. doi:10.1038/nature12323.
+.. [Paradis2004] Paradis *et al*. "**APE: Analyses of Phylogenetics and Evolution in R language**". Bioinformatics. 2004 Jan 22;20(2):289-90. doi:10.1093/bioinformatics/btg412
+.. [Patterson2006] Patterson N, Price AL, Reich D. Population structure and eigenanalysis. PLoS Genet. 2006 Dec;2(12):e190. doi:10.1371/journal.pgen.0020190
+.. [Peltzer2016] Peltzer *et al*. "**EAGER: efficient ancient genome reconstruction**". Genome Biology. 2016 Mar 9; 17:60. doi:10.1186/s13059-016-0918-z
+.. [Pickrell2012] Pickrell and Pritchard. "**Inference of population splits and mixtures from genome-wide allele frequency data**". PLoS Genet. 2012;8(11):e1002967. doi:10.1371/journal.pgen.1002967
+.. [Price2006] Price *et al*. "**Principal components analysis corrects for stratification in genome-wide association studies**". Nat Genet. 2006 Aug;38(8):904-9. Epub 2006 Jul 23. doi:10.1038/ng1847
+.. [Quinlan2010] Quinlan and Hall. "**BEDTools: a flexible suite of utilities for comparing genomic features**". Bioinformatics. 2010 Mar 15;26(6):841-2. doi:10.1093/bioinformatics/btq033
+.. [Schubert2012] Schubert *et al*. "**Improving ancient DNA read mapping against modern reference genomes**". BMC Genomics. 2012 May 10;13:178. doi:10.1186/1471-2164-13-178.
+.. [Schubert2014] Schubert *et al*. "**Characterization of ancient and modern genomes by SNP detection and phylogenomic and metagenomic analysis using PALEOMIX**". Nature Protocols. 2014 May;9(5):1056-82. doi:10.1038/nprot.2014.063
+.. [Stamatakis2006] Stamatakis. "**RAxML-VI-HPC: maximum likelihood-based phylogenetic analyses with thousands of taxa and mixed models**". Bioinformatics. 2006 Nov 1;22(21):2688-90.
+.. [Wickham2007] Wickham. "**Reshaping Data with the reshape Package**". Journal of Statistical Software. 2007 21(1).
.. [Wickham2009] Wickham. "**ggplot2: Elegant Graphics for Data Analysis**". Springer-Verlag New York 2009. ISBN:978-0-387-98140-6
\ No newline at end of file
diff --git a/docs/related.rst b/docs/related.rst
new file mode 100644
index 0000000..9bcca31
--- /dev/null
+++ b/docs/related.rst
@@ -0,0 +1,13 @@
+.. _related_tools:
+
+
+Related Tools
+=============
+
+**Pipelines:**
+
+* EAGER - Efficient Ancient GEnome Reconstruction (`website <http://it.inf.uni-tuebingen.de/?page_id=161>`_; [Peltzer2016]_)
+
+ EAGER provides an intuitive and user-friendly way for researchers to address two problems in current ancient genome reconstruction projects; firstly, EAGER allows users to efficiently preprocess, map and analyze ancient genomic data using a standardized general framework for small to larger genome reconstruction projects. Secondly, EAGER provides a user-friendly interface that allows users to run EAGER without needing to fully understand all the underlying technical details.
+
+ *(Description paraphrased from the EAGER website)*
diff --git a/docs/troubleshooting/bam_pipeline.rst b/docs/troubleshooting/bam_pipeline.rst
new file mode 100644
index 0000000..de25595
--- /dev/null
+++ b/docs/troubleshooting/bam_pipeline.rst
@@ -0,0 +1,201 @@
+.. _troubleshooting_bam:
+
+Troubleshooting the BAM Pipeline
+================================
+
+Troubleshooting BAM pipeline makefiles
+--------------------------------------
+
+**Path included multiple times in target**:
+
+ This message is triggered if the same target includes one or more input files more than once::
+
+ Error reading makefiles:
+ MakefileError:
+ Path included multiple times in target:
+ - Record 1: Name: ExampleProject, Sample: Synthetic_Sample_1, Library: ACGATA, Barcode: Lane_1_001
+ - Record 2: Name: ExampleProject, Sample: Synthetic_Sample_1, Library: ACGATA, Barcode: Lane_3_001
+ - Canonical path 1: /home/username/temp/bam_example/000_data/ACGATA_L1_R1_01.fastq.gz
+ - Canonical path 2: /home/username/temp/bam_example/000_data/ACGATA_L1_R2_01.fastq.gz
+
+ This may be caused by using too broad wildcards, or simple mistakes. The message indicates the lane in which the files were included, as well as the "canonical" (i.e. following the resolution of symbolic links, etc.) path to each of the files. To resolve this issue, ensure that each input file is only included once for a given target.
+
+
+**Target name used multiple times**:
+
+ If running multiple makefiles in the same folder, it is important that the names given to targets in each makefile are unique, as the pipeline will otherwise mix files between different projects (see the section :ref:`bam_filestructure` for more information). The PALEOMIX pipeline attempts to detect this, and prevents the pipeline from running in this case::
+
+ Error reading makefiles:
+ MakefileError:
+ Target name 'ExampleProject' used multiple times; output files would be clobbered!
+
+**OutOfMemoryException (PicardTools, GATK, etc.):**
+
+ By default, the BAM pipeline will limit the amount of heap-space used by Java programs to 4GB (on 64-bit systems, JVM defaults are used on 32-bit systems), which may prove insufficient in some instances. This will result in the failing program terminating with a stacktrace, such as the following::
+
+ Exception in thread "main" java.lang.OutOfMemoryError
+ at net.sf.samtools.util.SortingLongCollection.<init>(SortingLongCollection.java:101)
+ at net.sf.picard.sam.MarkDuplicates.generateDuplicateIndexes(MarkDuplicates.java:443)
+ at net.sf.picard.sam.MarkDuplicates.doWork(MarkDuplicates.java:115)
+ at net.sf.picard.cmdline.CommandLineProgram.instanceMain(CommandLineProgram.java:158)
+ at net.sf.picard.sam.MarkDuplicates.main(MarkDuplicates.java:97)
+
+
+ To resolve this issue, increase the maximum amount of heap-space used using the "--jre-option" command-line option; this permits the passing of options to the Java Runtime Environment (JRE). For example, to increase the maximum to 8gb, run the BAM pipeline as follows::
+
+ $ bam_pipeline run --jre-option -Xmx8g [...]
+
+
+Troubleshooting AdapterRemoval
+------------------------------
+
+The AdapterRemoval task will attempt to verify the quality-offset specified in the makefile; if the contents of the file do not match the expected offset (i.e. contain quality scores that fall outside the range expected with that offset; see http://en.wikipedia.org/wiki/FASTQ_format#Encoding), the task will be aborted.
+
+**Incorrect quality offsets specified in makefile**:
+
+ In cases where the sequence data can be determined to contain FASTQ records with a different quality offset than that specified in the makefile, the task will be aborted with a message corresponding to the following::
+
+ <AdapterRM (SE): '000_data/TGCTCA_L1_R1_02.fastq.gz' -> 'ExampleProject/reads/Synthetic_Sample_1/TGCTCA/Lane_1_002/reads.*'>: Error occurred running command:
+ Error(s) running Node:
+ Temporary directory: '/path/to/temp/folder'
+
+ FASTQ file contains quality scores with wrong quality score offset (33); expected reads with quality score offset 64. Ensure that the 'QualityOffset' specified in the makefile corresponds to the input.
+ Filename = 000_data/TGCTCA_L1_R1_02.fastq.gz
+
+ Please verify the format of the input file, and update the makefile to use the correct QualityOffset before starting the pipeline.
+
+
+**Input file contains mixed FASTQ quality scores**:
+
+ In cases where the sequence data can be determined to contain FASTQ records with quality scores corresponding to both of the possible offsets (for example both "!" and "a"), the task will be aborted with a message corresponding to the following example::
+
+ <AdapterRM (SE): '000_data/TGCTCA_L1_R1_02.fastq.gz' -> 'ExampleProject/reads/Synthetic_Sample_1/TGCTCA/Lane_1_002/reads.*'>: Error occurred running command:
+ Error(s) running Node:
+ Temporary directory: '/path/to/temp/folder'
+
+ FASTQ file contains quality scores with both quality offsets (33 and 64); file may be unexpected format or corrupt. Please ensure that this file contains valid FASTQ reads from a single source.
+ Filename = '000_data/TGCTCA_L1_R1_02.fastq.gz'
+
+ This error would suggest that the input-file contains a mix of FASTQ records from multiple sources, e.g. resulting from the concatenation of multiple sets of data. If so, make use of the original data, and ensure that the quality score offset set for each is set correctly.
+
+
+**Input file does not contain quality scores**:
+
+ If the input files do not contain any quality scores (e.g. due to malformed FASTQ records), the task will terminate, as these are required by the AdapterRemoval program. Please ensure that the input files are valid FASTQ files before proceeding.
+
+ Input files in FASTA format / not in FASTQ format:
+
+ If the input file can be determined to be in FASTA format, or otherwise be determined to not be in FASTQ format, the task will terminate with the following message::
+
+ <AdapterRM (SE): '000_data/TGCTCA_L1_R1_02.fastq.gz' -> 'ExampleProject/reads/Synthetic_Sample_1/TGCTCA/Lane_1_002/reads.*'>: Error occurred running command:
+ Error(s) running Node:
+ Temporary directory: '/path/to/temp/folder'
+
+ Input file appears to be in FASTA format (header starts with '>', expected '@'), but only FASTQ files are supported.
+ Filename = '000_data/TGCTCA_L1_R1_02.fastq.gz'
+
+ Note that the pipeline only supports FASTQ files as input for the trimming stage, and that these have to be either uncompressed, gzipped, or bzipped. Other compression schemes are not supported at this point in time.
+
+
+Troubleshooting BWA
+-------------------
+
+The BAM pipeline has primarily been tested with BWA v0.5.x; this is due in part to a number of issues with the Backtrack algorithm in later versions of BWA. For this reason, the use of either BWA v0.5.9-10 or BWA v0.7.9a (or later) is recommended. Currently there is no version of BWA 0.7.x prior to 0.7.9a for which bugs have not been observed (see sub-sections below), excepting BWA v0.7.0 which does however lack several important bug-fixes added to later versions (see the BWA changelog).
+
+**BWA prefix generated using different version of BWA / corrupt index**:
+
+ Between versions 0.5 and 0.6, BWA changed the binary format used to store the index sequence produced using the command "bwa index". Version 0.7 is compatible with indexes generated using v0.6. The pipeline will attempt to detect the case where the current version of BWA does not correspond to the version used to generate the index, and will terminate if that is the case.
+
+ As the two formats both contain files with the same names, the two formats cannot co-exist in the same location. Thus to resolve this issue, either create a new index in a new location, and update the makefile to use that location, or delete the old index files (path/to/prefix.fasta.*), and re-index it by using the command "bwa index path/to/prefix.fasta", or by simply re-starting the pipeline.
+
+ However, because the filenames used by v0.6+ is a subset of the filenames used by v0.5.x, it is possible to accidentally end up with a prefix that appears to be v0.5.x to the pipeline, but in fact contains a mix of v0.5.x and v0.6+ files. This situation, as well as corruption of the index, may result in the following errors:
+
+ 1. [bwt_restore_sa] SA-BWT inconsistency: seq_len is not the same
+
+ 2. [bns_restore_core] fail to open file './rCRS.fasta.nt.ann'
+
+ 3. Segmentation faults when running 'bwa aln'; these are reported as "SIGSEGV" in the file pipe.errors
+
+ If this occurs, removing the old prefix files and generating a new index is advised (see above).
+
+
+**[gzclose] buffer error**:
+
+ On some systems, BWA may terminate with an "[gzclose] buffer error" error when mapping empty files (sometimes produced by AdapterRemoval). This is caused by a bug / regression in some versions of zlib (http://www.zlib.net/), included with some distributions. As it is typically not possible to upgrade zlib without a full system update, BWA may instead be compiled using an up-to-date version of zlib, as shown here for zlib v1.2.8 and BWA v0.5.10::
+
+ $ wget http://downloads.sourceforge.net/project/bio-bwa/bwa-0.5.10.tar.bz2
+ $ tar xvjf bwa-0.5.10.tar.bz2
+ $ cd bwa-0.5.10
+ $ sed -e's#INCLUDES=#INCLUDES=-Izlib-1.2.8/ #' -e's#-lz#zlib-1.2.8/libz.a#' Makefile > Makefile.zlib
+ $ wget http://zlib.net/zlib-1.2.8.tar.gz
+ $ tar xvzf zlib-1.2.8.tar.gz
+ $ cd zlib-1.2.8
+ $ ./configure
+ $ make
+ $ cd ..
+ $ make -f Makefile.zlib
+
+ The resulting "bwa" executable must be placed in the PATH *before* the version of BWA built against the outdated version of zlib.
+
+
+Troubleshooting validation of BAM files
+---------------------------------------
+
+**Both mates are marked as second / first of pair**:
+
+ This error message may occur during validation of the final (realigned) BAM, if the input files specified for different libraries contained duplicate reads (*not* PCR duplicates). In that case, the final BAM will contain multiple copies of the same data, thereby risking a significant bias in downstream analyses.
+
+ The following demonstrates this problem, using a contrived example based on the examples/bam_example project included with the pipeline::
+
+ $ bam_pipeline run 000_makefile.yaml
+ [...]
+ <Validate BAM: 'ExampleProject.rCRS.realigned.bam'>: Error occurred running command:
+ Error(s) running Node:
+ Temporary directory: '/path/to/temp/folder'
+
+ Error(s) running Node:
+ Return-codes: [1]
+ Temporary directory: '/path/to/temp/folder'
+
+ <Command = ['java', '-server', '-Xmx4g',
+ '-Djava.io.tmpdir=/tmp/bam_pipeline/9a5beba9-1b24-4494-836e-62a85eb74bf3',
+ '-Djava.awt.headless=true', '-XX:+UseSerialGC', '-jar',
+ '/home/research/tools/opt/jar_root/ValidateSamFile.jar',
+ 'I=ExampleProject.rCRS.realigned.bam',
+ 'IGNORE=MATE_NOT_FOUND', 'IGNORE=INVALID_QUALITY_FORMAT']
+ Status = Exited with return-code 1
+ STDOUT = '/path/to/temp/folder/rCRS.realigned.validated'
+ STDERR* = '/path/to/temp/folder/pipe_java_20885232.stderr'
+ CWD = '/home/temp/bam_example'>
+
+ Picard's ValidateSamFile prints the error messages to STDOUT, the location of which is indicated above::
+
+ $ cat '/tmp/bam_pipeline/9a5beba9-1b24-4494-836e-62a85eb74bf3/rCRS.realigned.validated'
+ ERROR: Record 684, Read name Seq_101_1324_104_rv_0\2, Both mates are marked as second of pair
+ ERROR: Record 6810, Read name Seq_1171_13884_131_fw_0\2, Both mates are marked as second of pair
+
+ To identify the source of the problems, the problematic reads may be extracted from the BAM file::
+
+ $ samtools view ExampleProject.rCRS.realigned.bam|grep -w "^Seq_101_1324_104_rv_0"
+ Seq_101_1324_104_rv_0\2 131 NC_012920_1 1325 60 100M = 1325 -1 [...]
+ Seq_101_1324_104_rv_0\2 131 NC_012920_1 1325 60 100M = 1325 1 [...]
+ Seq_101_1324_104_rv_0\1 16 NC_012920_1 1327 37 51M2D49M * 0 0 [...]
+ Seq_101_1324_104_rv_0\1 89 NC_012920_1 1327 60 51M2D49M * 0 0 [...]
+
+
+ Note that both mate pairs are duplicated, with slight variations in the flags. The source of the reads may be determined using the "RG" tags (not shown here), which for files produced by the pipeline corresponds to the library names. Once these are known, the corresponding FASTQ files may be examined to determine the source of the duplicate reads. This problem should normally be detected early in the pipeline, as checks for the inclusion of duplicate data have been implemented (see below).
+
+**Read ... found in multiple files**:
+
+ In order to detect the presence of data that has been included multiple times, e.g. due to incorrect merging of data, the pipeline looks for alignments with identical names, sequences and quality scores. If such reads are found, the following error is reported::
+
+ <Detect Input Duplication: 15 files>: Error occurred running command:
+ Read 'Seq_junk_682_0' found in multiple files:
+ - 'ExampleProject/rCRS/Synthetic_Sample_1/ACGATA/Lane_1_002/paired.minQ0.bam'
+ - 'ExampleProject/rCRS/Synthetic_Sample_1/ACGATA/Lane_1_001/paired.minQ0.bam'
+
+ This indicates that the same data files have been included multiple times in the project. Please review the input files used in this project, to ensure that each set of data is included only once.
+
+ The message given indicates which files (and hence which samples/libraries and lanes) were affected, as described in section :ref:`bam_filestructure`. If only a single file is given, this suggests that the reads were also found in that one file.
+
+ This problem may result from the accidental concatenation of files provided to the pipeline, or from multiple copies of the same files being included in the wildcards specified in the makefile. As including the same sequencing reads multiple times are bound to bias downstream analyses (if it does not cause validation failure, see sub-section above), this must be fixed before the pipeline is re-started.
diff --git a/docs/troubleshooting/common.rst b/docs/troubleshooting/common.rst
new file mode 100644
index 0000000..05c1127
--- /dev/null
+++ b/docs/troubleshooting/common.rst
@@ -0,0 +1,114 @@
+.. highlight:: Bash
+.. _troubleshooting_common:
+
+Troubleshooting general problems
+================================
+
+
+If a command fails while the pipeline is running (e.g. mapping, genotyping, validation of BAMs, etc.), the pipeline will print a message to the command-line and write a message to a log-file. The location of the log-file may be specified using the --log-file command-line option, but if --log-file is not specified, a time-stamped log-file is generated in the temporary folder specified using the --temp-root command-line option, and the location of this log-file is printed by the pipeline d [...]
+
+ $ 2014-01-07 09:46:19 Pipeline; 1 failed, 202 done of 203 tasks:
+ Log-file located at '/path/to/temp/folder/bam_pipeline.20140107_094554_00.log'
+ [...]
+
+
+Most error-messages will involve a message in the following form::
+
+ <Validate BAM: 'ExampleProject.rCRS.realigned.bam'>:
+ Error occurred running command:
+ Error(s) running Node:
+ Return-codes: [1]
+ Temporary directory: '/path/to/temp/folder'
+
+ <Command = ['java', '-server', '-Xmx4g',
+ '-Djava.io.tmpdir=/path/to/temp/folder',
+ '-Djava.awt.headless=true', '-XX:+UseSerialGC', '-jar',
+ '/home/username/install/jar_root/ValidateSamFile.jar',
+ 'I=ExampleProject.rCRS.realigned.bam',
+ 'IGNORE=MATE_NOT_FOUND',
+ 'IGNORE=INVALID_QUALITY_FORMAT']
+
+ Status = Exited with return-code 1
+ STDOUT = '/path/to/temp/folder/rCRS.realigned.validated'
+ STDERR* = '/path/to/temp/folder/pipe_java_4454836272.stderr'
+ CWD = '/path/to/project'>
+
+The task that failed was the validation of the BAM 'ExampleProject.rCRS.realigned.bam' using Picard ValidateSamFile, which terminated with return-code 1. For each command involved in a given task ('node'), the command-line (as the list passed to 'Popen'http://docs.python.org/2.7/library/subprocess.html), return code, and the current working directory (CWD) is shown. In addition, STDOUT and STDERR are always either piped to files, or to a different command. In the example given, STDOUT is [...]
+
+To determine the cause of the failure (indicated by the non-zero return-code), examine the output of each command involved in the node. Normally, messages relating to failures may be found in the STDERR file, but in some cases (and in this case) the cause is found in the STDOUT file::
+
+ $ cat /path/to/temp/folder/rCRS.realigned.validated
+ ERROR: Record 87, Read name [...], Both mates are marked as second of pair
+ ERROR: Record 110, Read name [...], Both mates are marked as first of pair
+ [...]
+
+
+This particular error indicates that the same reads have been included multiple times in the makefile (see section :ref:`troubleshooting_bam`). Normally it is necessary to consult the documentation of the specified program in order to determine the cause of the failure.
+
+In addition, the pipeline performs a number of checks during startup, which may result in the following issues being detected:
+
+**Required file does not exist, and is not created by a node**:
+
+ Before start, the BAM and Phylogenetic pipeline checks for the presence of all required files. Should one or more files be missing, and the missing file is NOT created by the pipeline itself, an error similar to the following will be raised::
+
+ $ bam_pipeline run 000_makefile.yaml
+ [...]
+ Errors detected during graph construction (max 20 shown):
+ Required file does not exist, and is not created by a node:
+ Filename: 000_prefix/rCRS.fasta
+ Dependent node(s): [...]
+
+ This typically happens if the Makefile contains typos, or if the required files have been moved since the last time the makefile was executed. To proceed, it is necessary to determine the current location of the files in question, and/or update the makefile.
+
+
+**Required executables are missing**:
+
+ Before starting to execute a makefile, the pipeline will check that the requisite programs are installed, and verify that the installed versions meet the minimum requirements. Should an executable be missing, an error similar to the following will be issued, and the pipeline will not run::
+
+ $ bam_pipeline run 000_makefile.yaml
+ [...]
+ Errors detected during graph construction (max 20 shown):
+ Required executables are missing: bwa
+
+ In that case, please verify that all required programs are installed (see sections TODO) and ensure that these are accessible via the current user's PATH (i.e. can be executed on the command-line using just the executable name).
+
+
+**Version requirement not met**:
+
+ In addition to checking for the presence of required executables (including java JARs), the version of each program is checked. Should the version of the program not be compatible with the pipeline (e.g. because it is too old), the following error is raised::
+
+ $ bam_pipeline run 000_makefile.yaml
+ [...]
+ Version requirement not met for 'Picard CreateSequenceDictionary.jar';
+ please refer to the PALEOMIX documentation for more information.
+
+ Executable: /Users/mischu/bin/bwa
+ Call: bwa
+ Version: v0.5.7.x
+ Required: v0.5.19.x or v0.5.110.x or v0.6.2.x or at least v0.7.9.x
+
+ If so, please refer to the documentation for the pipeline in question, and install/update the program to the version required by the pipeline. Note that the executable MUST be accessible by the PATH variable. If multiple versions of a program is installed, the version required by the pipeline must be first, which may be verified by using the "which" command::
+
+ $ which -a bwa
+ /home/username/bin/bwa
+ /usr/local/bin/bwa
+
+**Java Runtime Environment outdated / UnsupportedClassVersionError**:
+
+ If the version of the Java Runtime Environment (JRE) is too old, the pipeline may fail to run with the following message::
+
+ The version of the Java Runtime Environment on this
+ system is too old; please check the the requirement
+ for the program and upgrade your version of Java.
+
+ See the documentation for more information.
+
+ Alternatively, Java programs may fail with a message similar to the following, as reported in the pipe_*.stderr file (abbreviated)::
+
+ Exception in thread "main" java.lang.UnsupportedClassVersionError: org/broadinstitute/sting/gatk/CommandLineGATK :
+ Unsupported major.minor version 51.0 at [...]
+
+ This problem is typically caused by the GenomeAnalysisTK (GATK), which as of version 2.6 requires Java 1.7 (see `their website`_). To solve this problem, you will need to upgrade your copy of Java.
+
+
+.. _their website: http://www.broadinstitute.org/gatk/guide/article?id=2846
\ No newline at end of file
diff --git a/docs/troubleshooting/index.rst b/docs/troubleshooting/index.rst
new file mode 100644
index 0000000..4b9b06a
--- /dev/null
+++ b/docs/troubleshooting/index.rst
@@ -0,0 +1,15 @@
+.. _troubleshooting:
+
+Troubleshooting
+===============
+
+.. toctree::
+
+ install.rst
+ common.rst
+ bam_pipeline.rst
+ phylo_pipeline.rst
+ zonkey_pipeline.rst
+
+For troubleshooting of individual pipelines, please see the BAM pipeline :ref:`troubleshooting_bam` section, the Phylo pipeline :ref:`troubleshooting_phylo` section, and the Zonkey pipeline :ref:`troubleshooting_zonkey` section.
+
diff --git a/docs/troubleshooting/install.rst b/docs/troubleshooting/install.rst
new file mode 100644
index 0000000..16ebf97
--- /dev/null
+++ b/docs/troubleshooting/install.rst
@@ -0,0 +1,54 @@
+.. highlight:: Bash
+.. _troubleshooting_install:
+
+Troubleshooting the installation
+=================================
+
+**Pysam / Cython installation fails with "Python.h: No such file or directory" or "pyconfig.h: No such file or directory"**:
+
+ Installation of Pysam and Cython requires that Python development files are installed. On Debian based distributions, for example, this may be accomplished by running the following command::
+
+ $ sudo apt-get install python-dev
+
+
+**Pysam installation fails with "zlib.h: No such file or directory"**:
+
+ Installation of Pysam requires that "libz" development files are installed. On Debian based distributions, for example, this may be accomplished by running the following command::
+
+ $ sudo apt-get install libz-dev
+
+
+**Command not found when attempting to run 'paleomix'**:
+
+ By default, the PALEOMIX executables ('paleomix', etc.) are installed in ~/.local/bin. You must ensure that this path is included in your PATH::
+
+ $ export PATH=$PATH:~/.local/bin
+
+ To automatically apply this setting on subsequent logins (assuming that you are using Bash), run the following command::
+
+ $ echo "export PATH=\$PATH:~/.local/bin" >> ~/.bash_profile
+
+
+**PALEOMIX command-line aliases invokes wrong tools**:
+
+ When upgrading an old PALEOMIX installation (prior to v1.2.x) using pip, the existence of old files may result in all command-line aliases ('bam\_pipeline', 'phylo\_pipeline', 'bam\_rmdup\_collapsed', etc.) invoking the same command (typically 'phylo_pipeline')::
+
+ $ bam_pipeline makefile.yaml
+ Phylogeny Pipeline v1.2.1
+
+ [...]
+
+ This can be solved by removing these aliases, and then re-installing PALEOMIX using 'pip', shown here for a system-wide install::
+
+ $ sudo rm -v /usr/local/bin/bam_pipeline /usr/local/bin/conv_gtf_to_bed /usr/local/bin/phylo_pipeline /usr/local/bin/bam_rmdup_collapsed /usr/local/bin/trim_pipeline
+ $ sudo python setup.py install
+
+ Alternatively, this may be resolved by downloading and manually installing PALEOMIX::
+
+ $ wget https://github.com/MikkelSchubert/paleomix/archive/v1.2.4.tar.gz
+ $ tar xvzf v1.2.4.tar.gz
+ $ cd paleomix-1.2.4/
+ # Either for the current user:
+ $ python setup.py install --user
+ # Or, for all users:
+ $ sudo python setup.py install
\ No newline at end of file
diff --git a/docs/troubleshooting/phylo_pipeline.rst b/docs/troubleshooting/phylo_pipeline.rst
new file mode 100644
index 0000000..388f9fb
--- /dev/null
+++ b/docs/troubleshooting/phylo_pipeline.rst
@@ -0,0 +1,8 @@
+.. _troubleshooting_phylo:
+
+Troubleshooting the Phylogenetic Pipeline
+=========================================
+
+TODO
+
+TODO: MaxDepth not found in depth files .. --target must match name used in makefile
\ No newline at end of file
diff --git a/docs/troubleshooting/zonkey_pipeline.rst b/docs/troubleshooting/zonkey_pipeline.rst
new file mode 100644
index 0000000..33e556b
--- /dev/null
+++ b/docs/troubleshooting/zonkey_pipeline.rst
@@ -0,0 +1,6 @@
+.. _troubleshooting_zonkey:
+
+Troubleshooting the Zonkey Pipeline
+===================================
+
+TODO
\ No newline at end of file
diff --git a/docs/yaml.rst b/docs/yaml.rst
new file mode 100644
index 0000000..e9ee157
--- /dev/null
+++ b/docs/yaml.rst
@@ -0,0 +1,28 @@
+.. highlight:: YAML
+.. _yaml_intro:
+
+YAML usage in PALEOMIX
+======================
+
+The format, `YAML`_, is a simple human-readable markup language in which the structure of the data is determined by its indentation, and will look familiar to anyone who has experience with the `Python`_ programming language.
+
+The following showcases basic structure of a YAML document, as used by the pipelines::
+
+ # This is a comment; this line is completely ignored
+ This is a section:
+ This is a subsection:
+ # This subsection contains 4 key / value pairs:
+ First key: "First value"
+ Second key: 2
+ Third key: 3.4
+ # The following key has no value associated!
+ Fourth key:
+
+ This is a section containing a list:
+ - The first item
+ - The second item
+
+
+
+.. _Python: http://www.python.org/
+.. _YAML: http://www.yaml.org
diff --git a/docs/zonkey_pipeline/configuration.rst b/docs/zonkey_pipeline/configuration.rst
new file mode 100644
index 0000000..d37757c
--- /dev/null
+++ b/docs/zonkey_pipeline/configuration.rst
@@ -0,0 +1,47 @@
+.. highlight:: ini
+.. _zonkey_configuration:
+
+Configuring the Zonkey pipeline
+===============================
+
+Unlike the :ref:`bam_pipeline` and the :ref:`phylo_pipeline`, the :ref:`zonkey_pipeline` does not make use of makefiles. However, the pipeline does expose a number options, including the maximum number of threads used, various program parameters, and more. These may be set using the corresponding command-line options (e.g. --max-threads to set the maximum number of threads used during runtime). However, it is also possible to set default values for such options, including on a per-host b [...]
+
+.. code-block:: bash
+
+ $ paleomix zonkey --write-config
+
+
+The resulting file contains a list of options which can be overwritten::
+
+ [Defaults]
+ max_threads = 1
+ log_level = warning
+ progress_ui = progress
+ treemix_k = 0
+ admixture_replicates = 1
+ ui_colors = on
+ downsample_to = 1000000
+
+These values will be used by the pipeline, unless the corresponding option is also supplied on the command-line. I.e. if "max_threads" is set to 4 in the "zonkey.ini" file, but the pipeline is run using "paleomix zonkey run --max-threads 10", then the max threads value is set to 10.
+
+.. note::
+ Options in the configuration file correspond directly to command-line options for the BAM pipeline, with two significant differences: The leading dashes (--) are removed and any remaining dashes are changed to underscores (_); as an example, the command-line option --max-threads becomes max\_threads in the configuration file, as shown above.
+
+It is furthermore possible to set specific options depending on the current host-name. Assuming that the pipeline was run on multiple servers sharing a single home directory, one might set the maximum number of threads on a per-server basis as follows::
+
+ [Defaults]
+ max_threads = 32
+ [BigServer]
+ max_threads = 64
+ [SmallServer]
+ max_threads = 16
+
+
+The names used (here "BigServer" and "SmallServer") should correspond to the hostname, i.e. the value returned by the "hostname" command:
+
+.. code-block:: bash
+
+ $ hostname
+ BigServer
+
+Any value set in the section matching the name of the current host will take precedence over the 'Defaults' section, but can still be overridden by specifying the same option on the command-line, as described above.
diff --git a/docs/zonkey_pipeline/filestructure.rst b/docs/zonkey_pipeline/filestructure.rst
new file mode 100644
index 0000000..209a486
--- /dev/null
+++ b/docs/zonkey_pipeline/filestructure.rst
@@ -0,0 +1,68 @@
+.. highlight:: Bash
+.. _zonkey_filestructure:
+
+File structure
+==============
+
+The following section explains the file structure for results generated by the Zonkey pipeline, based on the results generated when analyzing the example files included with the pipeline (see :ref:`examples_zonkey`).
+
+
+Single sample analysis
+----------------------
+
+The following is based on running case 4a, as described in the :ref:`zonkey_usage` section. More specifically, the example in which the analyses are carried out on a BAM alignment file containing both nuclear and mitochondrial alignments::
+
+ # Case 4a: Analyses both nuclear and mitochondrial genome; results are placed in 'combined.zonkey'
+ $ paleomix zonkey run database.tar combined.bam
+
+As noted in the comment, executing this command places the results in the directory 'combined.zonkey'. For a completed analysis, the results directory is expected to contain a (HTML) report and a directory containing each of the figures generated by the pipeline:
+
+* report.css
+* report.html
+* figures/
+
+The report may be opened with any modern browser. Each figure displayed in the report is also available as a PDF file, accessed by clicking on a given figure in the report, or directly in the figures/ sub-directory.
+
+
+Analysis result files
+^^^^^^^^^^^^^^^^^^^^^
+
+In addition, the following directories are generated by the analytical steps, and contain the various files used by or generated by the programs run as part of the Zonkey pipeline:
+
+* admixture/
+* mitochondria/
+* pca/
+* plink/
+* treemix/
+
+In general, files in these directories are sorted by the prefix 'incl\_ts' and the prefix 'excl\_ts', which indicate that sites containing transitions (A<->G and C<->T) have been included or excluded from the analyses, respectively. For a detailed description of the files generated by each analysis, please refer to the documentation for the respective programs used in said analyses.
+
+Additionally, the results directory is expected to contain a 'temp' directory. This directory may safely be removed following the completion of a Zonkey run, but should be empty unless one or more analytical steps have failed.
+
+
+Multi-sample analysis
+---------------------
+
+When multiple samples are processed at once, as described in case 5 (:ref:`zonkey_usage`), results are written to a single 'results' directory. This directory will contain a summary report for all samples, as well as a sub-directory for each sample listed in the table of samples provided when running the pipeline. Thus, for the samples table shown in case 5::
+
+ $ cat samples.table
+ example1 combined.bam
+ example2 nuclear.bam
+ example3 mitochondrial.bam
+ example4 nuclear.bam mitochondrial.bam
+
+ # Case 5a) Analyse 3 samples; results are placed in 'my_samples.zonkey'
+ $ paleomix zonkey run database.tar my_samples.txt
+
+The results directory is expected to contain the following files and directories:
+
+* summary.html
+* summary.css
+* example1/
+* example2/
+* example3/
+* example4/
+
+The summary report may be opened with any modern browser, and offers a quick overview of all samples processed as part of this analysis. The individual report for each sample may furthermore be accessed by clicking on the headers corresponding to the name of a given sample.
+
+The per-sample directories correspond exactly to the result directories that would have been generated if the sample was processed by itself (see above), except that only a single 'temp' directory located in the root of the results directory is used.
diff --git a/docs/zonkey_pipeline/index.rst b/docs/zonkey_pipeline/index.rst
new file mode 100644
index 0000000..0c55fe7
--- /dev/null
+++ b/docs/zonkey_pipeline/index.rst
@@ -0,0 +1,19 @@
+.. _zonkey_pipeline:
+
+Zonkey Pipeline
+===============
+
+**Table of Contents:**
+
+.. toctree::
+
+ overview.rst
+ requirements.rst
+ configuration.rst
+ usage.rst
+ panel.rst
+ filestructure.rst
+
+The Zonkey Pipeline is an easy-to-use pipeline designed for the analyses of low-coverage, ancient DNA derived from historical equid samples, with the purpose of determining the species of the sample, as well as determining possible hybridization between horses, zebras, and asses (see :ref:`zonkey_usage`).
+
+This is accomplished by comparing one or more samples aligned against the *Equus caballus* 2.0 reference sequence with a reference panel of modern equids, including wild and domesticated equids. The reference panel is further described in the :ref:`zonkey_panel` section.
\ No newline at end of file
diff --git a/docs/zonkey_pipeline/overview.rst b/docs/zonkey_pipeline/overview.rst
new file mode 100644
index 0000000..31d11b9
--- /dev/null
+++ b/docs/zonkey_pipeline/overview.rst
@@ -0,0 +1,55 @@
+Overview of analytical steps
+============================
+
+Briefly, the Zonkey pipeline can run admixture tests on pre-defined species categories (asses, horses, and zebras) to evaluate the ancestry proportions found in the test samples. F1-hybrids are expected to show a balanced mixture of two species ancestries, although this balance can deviate from the 50:50 expectation in case limited genetic information is available. This is accomplished using ADMIXTURE [Alexander2009]_.
+
+The zonkey pipeline additionally builds maximum likelihood phylogenetic trees, using RAxML [Stamatakis2006]_ for mitochondrial sequence data and using TreeMix [Pickrell2012]_ for autosomal data. In the latter case, phylogenetic affinities are reconstructed twice: First considering no migration edges and secondly allowing for one migration edge. This allows for fine-grained testing of admixture between the sample and any of the species represented in the reference panel.
+
+In cases where an admixture signal is found, the location of the sample in the mitochondrial tree allows for the identification of the maternal species contributing to the hybrid being examined. For equids, this is essential to distinguish between the possible hybrid forms, such as distinguishing between mules (|female| horse x |male| donkey F1-hybrid) and hinnies (|male| horse x |female| donkey F1-hybrid).
+
+Analyses are presented in HTML reports, one per sample and one summary report when analyzing multiple samples. Figures are generated in both PNG and PDF format in order to facilitate use in publications (see :ref:`zonkey_filestructure`).
+
+
+Individual analytical steps
+---------------------------
+
+During a typical analyses, the Zonkey pipeline will proceed through the following major analytical steps:
+
+
+1. Analyzing nuclear alignments:
+
+ 1. Input BAMs are indexed using the equivalent of 'samtools index'.
+
+ 2. Nucleotides at sites overlapping SNPs in the reference panel are sampled to produce a pseudo-haploid sequence, one in which transitions are included and one in which transitions are excluded, in order to account for the presence of *post-mortem* deamination causing base substitutions. The resulting tables are processed using PLINK to generate the prerequisite files for further analyses.
+
+ 3. PCA plots are generated using SmartPCA from the EIGENSOFT suite of tools for both panels of SNPs (including and excluding transitions).
+
+ 4. Admixture estimates are carried out using ADMIXTURE, with a partially supervised approach by assigning each sample in the reference panel to one of either two groups (caballine and non-caballine equids) or three groups (asses, horses, and zebras), and processing the SNP panels including and excluding transitions. The input sample is not assigned to a group.
+
+ 5. Migration edges are modeled using TreeMix, assuming either 0 or 1 migration edge; analyses is carried out on both the SNP panel including transitions and on the SNP panel excluding transitions.
+
+ 6. PNG and PDF figures are generated for each analytical step; in addition, the per-chromosome coverage of the nuclear genome is plotted.
+
+
+2. Analyzing mitochondrial alignments:
+
+ 1. Input BAMs are indexed using the equivalent of 'samtools index'.
+
+ 2. The majority nucleotide at each position in the BAM is determined, and the resulting sequence is added to the mitochondrial reference multiple sequence alignment included in the reference panel.
+
+ 3. A maximum likelihood phylogeny is inferred using RAxML, and the resulting tree is drawn, rooted on the midpoint of the phylogeny.
+
+
+3. Generating reports and summaries
+
+ 1. A HTML report is generated for each sample, summarizing the data used and presenting (graphically) the results of each analysis carried out above. All figures are available as PNG and PDF (each figure in the report links to its PDF equivalent).
+
+ 2. If multiple samples were processed, a summary of all samples is generated, which presents the major results in an abbreviated form.
+
+
+.. note::
+
+ While the above shows an ordered list of steps, the pipeline may interleave individual steps during runtime, and may execute multiple steps in parallel when running in multi-threaded mode (see :ref:`zonkey_usage` for how to run the Zonkey pipeline using multiple threads).
+
+.. |male| unicode:: U+02642 .. MALE
+.. |female| unicode:: U+02640 .. FEMALE
diff --git a/docs/zonkey_pipeline/panel.rst b/docs/zonkey_pipeline/panel.rst
new file mode 100644
index 0000000..7b9350c
--- /dev/null
+++ b/docs/zonkey_pipeline/panel.rst
@@ -0,0 +1,235 @@
+.. _zonkey_panel:
+
+Reference Panel
+===============
+
+The :ref:`zonkey_pipeline` operates using a reference panel of SNPs generated from a selection of extant equid species, including the domestic horse (Equus caballus) and the Przewalski’s wild horse (Equus ferus przewalski); within African asses, the domestic donkey (Equus asinus) and the Somali wild ass (Equus africanus); within Asian asses, the onager (Equus hemionus) and the Tibetan kiang (Equus kiang), and; within zebras: the plains zebra (Equus quagga), the mountains zebra (Equus har [...]
+
+The reference panel has been generated using alignments against the Equus caballus reference nuclear genome (equCab2, via `UCSC`_) and the horse mitochondrial genome (NC\_001640.1, via `NCBI`_). The exact samples used to create the latest version of the reference panel are described below.
+
+
+Obtaining the reference panel
+-------------------------------
+
+The latest version of the Zonkey reference panel (dated 2016-11-01) may be downloaded via the following website:
+
+http://geogenetics.ku.dk/publications/zonkey
+
+Once this reference panel has been downloaded, it is strongly recommended that you decompress it using the 'bunzip2' command, since this speeds up several analytical steps (at the cost of about 600 MB of additional disk usage). To decompress the reference panel, simply run 'bunzip2' on the file, as shown here:
+
+.. code-block:: bash
+
+ $ bunzip2 database.tar.bz2
+
+.. warning::
+ Do not untar the reference panel. The Zonkey pipeline currently expects data files to be stored in a tar archive, and will not work if files have been extracted into a folder. This may change in the future.
+
+Once this has been done, the Zonkey pipeline may be used as described in the :ref:`zonkey_usage` section.
+
+
+Samples used in the reference panel
+-----------------------------------
+
+The following samples have been used in the construction of the latest version of the reference panel:
+
+====== =================== ====== =========== =============================
+Group Species Sex Sample Name Publication
+====== =================== ====== =========== =============================
+Horses *E. caballus* Male FM1798 doi:`10.1016/j.cub.2015.08.032 <https://doi.org/10.1016/j.cub.2015.08.032>`_
+. *E. przewalskii* Male SB281 doi:`10.1016/j.cub.2015.08.032 <https://doi.org/10.1016/j.cub.2015.08.032>`_
+Asses *E. a. asinus* Male Willy doi:`10.1038/nature12323 <https://doi.org/10.1038/nature12323>`_
+. *E. kiang* Female KIA doi:`10.1073/pnas.1412627111 <https://doi.org/10.1073/pnas.1412627111>`_
+. *E. h. onager* Male ONA doi:`10.1073/pnas.1412627111 <https://doi.org/10.1073/pnas.1412627111>`_
+. *E. a. somaliensis* Female SOM doi:`10.1073/pnas.1412627111 <https://doi.org/10.1073/pnas.1412627111>`_
+Zebras *E. q. boehmi* Female BOE doi:`10.1073/pnas.1412627111 <https://doi.org/10.1073/pnas.1412627111>`_
+. *E. grevyi* Female GRE doi:`10.1073/pnas.1412627111 <https://doi.org/10.1073/pnas.1412627111>`_
+. *E. z. hartmannae* Female HAR doi:`10.1073/pnas.1412627111 <https://doi.org/10.1073/pnas.1412627111>`_
+====== =================== ====== =========== =============================
+
+
+Constructing a reference panel
+==============================
+
+The following section describes the format used for the reference panel in Zonkey. It is intended for people who are interested in constructing their own reference panels for a set of species.
+
+.. warning::
+ At the time of writing, the number of ancestral groups is hardcoded to 2 and 3 groups; support for any number of ancestral groups is planned. Contact me if this is something you need, and I'll prioritize adding this to the Zonkey pipeline.
+
+
+It is important to note that a reference panel is created relative to a single reference genome. For example, for the equine reference panel, all alignments and positions are listed relative to the EquCab2.0 reference genome.
+
+The reference consists of a number of files, which are described below:
+
+
+settings.yaml
+-------------
+
+The settings file is a simple YAML-markup file, which specifies global options that apply to the reference panel. The current settings file looks as follows:
+
+.. code-block:: yaml
+
+ # Database format; is incremented when the format changes
+ Format: 1
+ # Revision number; is incremented when the database (but not format) changes
+ Revision: 20161101
+ # Arguments passed to plink
+ Plink: "--horse"
+ # Number of chromosomes; required for e.g. PCA analyses
+ NChroms: 31
+ # N bases of padding used for mitochondrial sequences; the last N bases are
+ # expected to be the same as the first N bases, in order to allow alignments
+ # at this region of the genome, and are combined to generate final consensus.
+ MitoPadding: 31
+ # The minimum distance between SNPs, assuming an even distribution of SNPs
+ # across the genome. Used when --treemix-k is set to 'auto', which is the
+ # default behavior. Value from McCue 2012 (doi:10.1371/journal.pgen.1002451).
+ SNPDistance: 150000
+
+
+The *Format* option defines the panel format, and reflects the version of the Zonkey pipeline that supports this panel. It should therefore not be changed unless the format, as described on this page, is changed. The *Revision* reflects the version of a specific reference panel, and should be updated every time data or settings in the reference panel are changed. The equid reference panel simply uses the date at which a given version was created as the revision number.
+
+The *Plink* option lists specific options passed to plink. In the above, this includes just the '--horse' option, which specifies the expected number of chromosomes expected for the horse genome and data aligned against the horse genome.
+
+The *NChroms* option specifies the number of autosomal chromosomes for the reference genome used to construct the reference panel. This is required for running PCA, but will likely be removed in the future (it is redundant due to contigs.txt).
+
+The *MitoPadding* option is used for the mitochondrial reference sequences, and specifies that some number of the bases at the end of the sequences are identical to the first bases in the sequence. Such duplication (or padding) is used to enable alignments spanning the break introduced when representing a circular genome as a FASTA sequence. If no such padding has been used, then this may simply be set to 0.
+
+The *SNPDistance* option is used to calculate the number of SNPs per block when the --treemix-k option is set to 'auto' (the default behavior). This option assumes that SNPs are evenly distributed across the genome, and calculates block size based on the number of SNPs covered for a given sample.
+
+
+contigs.txt
+-----------
+
+The 'contigs.txt' file contains a table describing the chromosomes included in the zonkey analyses:
+
+.. code-block:: text
+
+ ID Size Checksum Ns
+ 1 185838109 NA 2276254
+ 2 120857687 NA 1900145
+ 3 119479920 NA 1375010
+ 4 108569075 NA 1172002
+ 5 99680356 NA 1937819
+ X 124114077 NA 2499591
+
+The *ID* column specifies the name of the chromosome. Note that these names are expected to be either numerical (i.e. 1, 2, 21, 31) or sex chromosomes (X or Y). The *Size* column must correspond to the length of the chromosome in the reference genome. The *Ns* column, on the other hand, allows for the number of uncalled bases in the reference to be specified. This value is subtracted from the chromosome size when calculating the relative coverage for sex determination.
+
+The *Checksum* column should contain the MD5 sum calculated for the reference sequence or 'NA' if not available. If specified, this value is intended to be compared with the MD5 sums listed in the headers of BAM files analyzed by the Zonkey pipeline, to ensure that the correct reference sequence is used.
+
+.. note::
+ This checksum check is currently not supported, but will be added soon.
+
+
+.. note::
+ The mitochondria is not included in this table; only list autosomes to be analyzed.
+
+
+samples.txt
+-----------
+
+The 'samples.txt' table should contain a list of all samples included in the reference panel, and provides various information about these, most important of which is what ancestral groups a given sample belongs to:
+
+.. code-block:: text
+
+ ID Group(3) Group(2) Species Sex SampleID Publication
+ ZBoe Zebra NonCaballine E. q. boehmi Female BOE doi:10.1073/pnas.1412627111
+ AOna Ass NonCaballine E. h. onager Male ONA doi:10.1073/pnas.1412627111
+ HPrz Horse Caballine E. przewalskii Male SB281 doi:10.1016/j.cub.2015.08.032
+
+
+The *ID* column is used as the name of the sample in the text, tables, and figures generated when running the Zonkey pipeline. It is advised to keep this name short and preferably descriptive of the group to which the sample belongs.
+
+The *Group(2)* and *Group(3)* columns specify the ancestral groups to which the sample belongs, when considering either 2 or 3 ancestral groups. Note that Zonkey currently only supports 2 and 3 ancestral groups (see above).
+
+The *Species*, *Sex*, *SampleID*, and *Publication* columns are meant to contain extra information about the samples, used in the reports generated by the Zonkey pipeline, and are not used directly by the pipeline.
+
+
+mitochondria.fasta
+------------------
+
+The 'mitochondria.fasta' file is expected to contain a multi-sequence alignment involving two different sets of sequences. Firstly, it must contain one or more reference sequences against which the input mitochondria alignments have been carried out. In addition, it should contain at least one sequence per species in the reference panel.
+
+Zonkey will compare the reference sequences (whether or not subtracting the amount of padding specified in the 'settings.txt' file) against the contigs in the input BAM in order to identify mitochondrial sequences. The Zonkey pipeline then uses the alignment of the reference sequence identified to place the sample into the multi-sequence alignment.
+
+By default, all sequences in the 'mitochondria.fasta' file are included in the mitochondrial phylogeny. However, reference sequences can be excluded by adding an 'EXCLUDE' label after the sequence name:
+
+.. code-block:: text
+
+ >5835107Eq_mito3 EXCLUDE
+ gttaatgtagcttaataatat-aaagcaaggcactgaaaatgcctagatgagtattctta
+
+Sequences thus marked are not used for the phylogenetic inference itself.
+
+
+simulations.txt
+---------------
+
+The 'simulations.txt' file contains the results of analyzing simulated data sets in order to generate an empirical distribution of deviations from the expected admixture values.
+
+.. code-block:: text
+
+ NReads K Sample1 Sample2 HasTS Percentile Value
+ 1000 2 Caballine NonCaballine FALSE 0.000 7.000000e-06
+ 1000 2 Caballine NonCaballine FALSE 0.001 1.973480e-04
+ 1000 2 Caballine NonCaballine FALSE 0.002 2.683880e-04
+ 1000 2 Caballine NonCaballine FALSE 0.003 3.759840e-04
+ 1000 2 Caballine NonCaballine FALSE 0.004 4.595720e-04
+ 1000 2 Caballine NonCaballine FALSE 0.005 5.518900e-04
+ 1000 2 Caballine NonCaballine FALSE 0.006 6.591180e-04
+
+The *NReads* column specifies the number of sequence alignments used in the simulated sample (e.g. 1000, 10000, 100000, and 1000000). Zonkey will use these simulations for different numbers of reads to establish lower and upper bounds on the empirical p-values, where the lower bound is selected as the NReads <= to the number of reads analyzed, and the upper bound is selected as the NReads >= to the number of reads analyzed, when running Zonkey.
+
+The *K* column lists the number of ancestral groups specified when the sample was analyzed; in the equine reference panel, this is either 2 or 3.
+
+The *Sample1* and *Sample2* columns lists the two ancestral groups from which the synthetic hybrid was produced. The order in which these are listed does not matter.
+
+The *HasTS* column specifies if transitions were included (TRUE) or excluded (FALSE).
+
+The *Percentile* column specifies the percent of simulations with a *Value* less than or equal to the current *Value*.
+
+The *Value* column lists the absolute observed deviation from the expected admixture proportion (i.e. 0.5).
+
+
+There is currently no way to generate this table automatically, but support for doing so is planned. Note also that Zonkey can be run using a hidden option '--admixture-only', which skips all analyses except those required to run ADMIXTURE on the data, thereby making it trivial to run ADMIXTURE exactly as it would be run by Zonkey. For example:
+
+ $ paleomix zonkey run --admixture-only database.tar simulation.bam
+
+
+genotypes.txt
+-------------
+
+The 'genotypes.txt' file contains a table of heterozygous sites relative to the reference sequence used for the reference panel.
+
+.. warning::
+ Columns in the 'genotypes.txt' file are expected to be in the exact order shown below.
+
+
+.. code-block:: text
+
+ Chrom Pos Ref AAsi;AKia;AOna;ASom;HCab;HPrz;ZBoe;ZGre;ZHar
+ 1 1094 A CAACAAAAA
+ 1 1102 G AGGAGGGGG
+ 1 1114 A AAAAAAAGA
+ 1 1126 C CCCCCCCYC
+ 1 1128 C CCCCCCCGC
+ 1 1674 T GGGGTTGGG
+ 1 1675 G GCCGGGGGG
+
+
+The *Chrom* column is expected to contain only those contigs / chromosomes listed in the 'contigs.txt' file; the *Pos* column contains the 1-based positions of the variable sites relative to the reference sequence. The *Ref* column contains the nucleotide observed in the reference sequence for the current position; it is currently not used, and may be removed in future versions of Zonkey. The final column contains the nucleotides observed for every sample named in 'samples.txt', joined b [...]
+
+
+Packaging the files
+-------------------
+
+The reference panel is distributed as a tar archive. For best performance, the files should be laid out so that the genotypes.txt file is the last file in the archive. This may be accomplished with the following command:
+
+.. code-block:: bash
+
+ $ tar cvf database.tar settings.yaml contigs.txt samples.txt mitochondria.fasta simulations.txt examples genotypes.txt
+
+The tar file may be compressed for distribution (bzip2 or gzip), but should be used uncompressed for best performance.
+
+
+.. _NCBI: https://www.ncbi.nlm.nih.gov/nuccore/5835107
+.. _UCSC: https://genome.ucsc.edu/cgi-bin/hgGateway?clade=mammal&org=Horse&db=0
diff --git a/docs/zonkey_pipeline/requirements.rst b/docs/zonkey_pipeline/requirements.rst
new file mode 100644
index 0000000..36cbc32
--- /dev/null
+++ b/docs/zonkey_pipeline/requirements.rst
@@ -0,0 +1,76 @@
+.. highlight:: bash
+.. _zonkey_requirements:
+
+
+Software Requirements
+=====================
+
+The Zonkey pipeline requires PALEOMIX version 1.2.7 or later. In addition to the requirements listed for the PALEOMIX pipeline itself in the :ref:`installation` section, the Zonkey pipeline requires that other pieces of software be installed:
+
+* RScript from the `R`_ package, v3.1+.
+* SmartPCA from the `EIGENSOFT`_ package, v13050+ [Patterson2006]_, [Price2006]_
+* `ADMIXTURE`_ v1.23+ [Alexander2009]_
+* `PLINK`_ v1.7+ [Chang2015]_
+* `RAxML`_ v7.3.2+ [Stamatakis2006]_
+* `SAMTools`_ v0.1.19+ [Li2009b]_
+* `TreeMix`_ v1.12+ [Pickrell2012]_
+
+The following R packages are required in order to carry out the plotting:
+
+* `RColorBrewer`_
+* `ape`_ [Paradis2004]_
+* `ggplot2`_ [Wickham2009]_
+* `ggrepel`_
+* `reshape2`_ [Wickham2007]_
+
+The R packages may be installed using the following commands::
+
+ $ R
+ > install.packages(c('RColorBrewer', 'ape', 'ggrepel', 'ggplot2', 'reshape2'))
+
+
+Installing under OSX
+--------------------
+
+Installing the Zonkey pipeline under OSX poses several difficulties, mainly due to SmartPCA. In the following, it is assumed that the `Brew package manager`_ has been installed, as this greatly simplifies the installation of other, required pieces of software.
+
+Firstly, install software and libraries required to compile SmartPCA::
+
+ $ brew install gcc
+ $ brew install homebrew/dupes/lapack
+ $ brew install homebrew/science/openblas
+
+In each case, note down the values indicated for LDFLAGS, CFLAGS, CPPFLAGS, etc.
+
+Next, download and unpack the `EIGENSOFT`_ software. The following has been tested on EIGENSOFT version 6.1.1 ('EIG6.1.1.tar.gz').
+
+To build SmartPCA it may further be necessary to remove the use of the 'real-time' library::
+
+ $ sed -e's# -lrt##' Makefile > Makefile.no_rt
+
+Once you have done this, you can build SmartPCA using the locally copied libraries::
+
+ $ env CC="/usr/local/opt/gcc/bin/gcc-6" LDFLAGS="-L/usr/local/opt/openblas/lib/" CFLAGS="-flax-vector-conversions -I/usr/local/opt/lapack/include/" make -f Makefile.no_rt
+
+The above worked on my installation, but you may need to correct the variables using the values provided by Brew, which you noted down after running the 'install' command. You may also need to change the location of GCC set in the CC variable.
+
+
+Testing the pipeline
+--------------------
+
+An example project is included with the BAM pipeline, and it is recommended to run this project in order to verify that the pipeline and required applications have been correctly installed. See the :ref:`examples_zonkey` section for a description of how to run this example project.
+
+
+.. _ADMIXTURE: https://www.genetics.ucla.edu/software/admixture/
+.. _EIGENSOFT: http://www.hsph.harvard.edu/alkes-price/software/
+.. _PLINK: https://www.cog-genomics.org/plink2
+.. _R: http://www.r-base.org/
+.. _RAxML: https://github.com/stamatak/standard-RAxML
+.. _RColorBrewer: https://cran.r-project.org/web/packages/RColorBrewer/index.html
+.. _SAMTools: https://samtools.github.io
+.. _TreeMix: http://pritchardlab.stanford.edu/software.html
+.. _ape: https://cran.r-project.org/web/packages/ape/index.html
+.. _ggrepel: https://cran.r-project.org/web/packages/ggrepel/index.html
+.. _ggplot2: https://cran.r-project.org/web/packages/ggplot2/index.html
+.. _reshape2: https://cran.r-project.org/web/packages/reshape2/index.html
+.. _Brew package manager: http://www.brew.sh
diff --git a/docs/zonkey_pipeline/usage.rst b/docs/zonkey_pipeline/usage.rst
new file mode 100644
index 0000000..c54dbcc
--- /dev/null
+++ b/docs/zonkey_pipeline/usage.rst
@@ -0,0 +1,160 @@
+.. highlight:: Yaml
+.. _zonkey_usage:
+
+Pipeline usage
+==============
+
+The Zonkey pipeline is run on the command-line using the command 'paleomix zonkey', which gives access to a handful of commands:
+
+.. code-block:: bash
+
+ $ paleomix zonkey
+ USAGE:
+ paleomix zonkey run <panel> <sample.bam> [<destination>]
+ paleomix zonkey run <panel> <nuclear.bam> <mitochondrial.bam> <destination>
+ paleomix zonkey run <panel> <samples.txt> [<destination>]
+ paleomix zonkey dryrun <panel> [...]
+ paleomix zonkey mito <panel> <destination>
+
+Briefly, it is possible to run the pipeline on a single sample by specifying the location of `BAM alignments`_ against the Equus caballus reference nuclear genome (equCab2, see `UCSC`_), and / or against the horse mitochondrial genome (using either the standard mitochondrial sequence NC\_001640.1, see `NCBI`_, or a mitochondrial genome of one of the samples included in the reference panel, as described below). The individual commands allow for different combinations of alignment strategies:
+
+**paleomix zonkey run <panel> <sample.bam> [<destination>]**
+ Runs the Zonkey pipeline on a single BAM alignment <sample.bam>, which is expected to contain a nuclear and / or a mitochondrial alignment. If <destination> is specified, a directory at that location is created, and the resulting output saved there. If <destination> is not specified, the default location is chosen by replacing the file-extension of the alignment file (typically '.bam') with '.zonkey'.
+
+**paleomix zonkey run <panel> <nuclear.bam> <mitochondrial.bam> <destination>**
+  This command allows for the combined analysis of the nuclear and mitochondrial genomes, in cases where these alignments have been carried out separately. In this case, specifying a <destination> location is mandatory.
+
+**paleomix zonkey run <panel> <samples.txt> [<destination>]**
+  It is possible to run the pipeline on multiple samples at once, by specifying a list of BAM files (here <samples.txt>), which lists a sample name and one or two BAM files per line, with each column separated by tabs. A destination may (optionally) be specified, as when specifying a single BAM file (see above).
+
+**paleomix zonkey dryrun <panel> [...]**
+  The 'dryrun' command is equivalent to the 'run' command, but does not actually carry out the analytical steps; this command is useful to test for problems before executing the pipeline, such as missing or outdated software requirements (see :ref:`zonkey_requirements`).
+
+**paleomix zonkey mito <panel> <destination>**
+  The 'mito' command is included to create a :ref:`bam_pipeline` project template for mapping FASTQ reads against the mitochondrial genomes of the samples included in the Zonkey reference panel (see Prerequisites below for a list of samples).
+
+These possibilities are described in further detail below.
+
+
+Prerequisites
+-------------
+
+All invocations of the Zonkey pipeline takes the path to a 'panel' file as their first argument. This file is the reference panel providing the genetic information necessary for performing species and/or hybrid identification, and currently includes representatives of all extant equid species. The reference panel thereby allows for the identification of first generation hybrids between any living equine species, i.e. within caballines. For a more detailed description of the reference pan [...]
+
+Secondly, the pipeline requires either one or two BAM files per sample, representing alignments against nuclear and / or mitochondrial genomes as described above. The analyses carried out by the Zonkey pipeline depends on the contents of the BAM alignment file provided for a given sample, and are presented below.
+
+
+Single sample analysis
+----------------------
+
+For a single sample, the pipeline may be invoked by providing the path to the reference panel file followed by the path to one or two BAM files belonging to that sample, as well as an (mostly optional) destination directory.
+
+For these examples, we will assume that the reference panel is saved in the file 'database.tar', that the BAM file 'nuclear.bam' contains an alignment against the equCab2 reference genome, that the BAM file 'mitochondrial.bam' contains an alignment against the corresponding mitochondrial reference genome (Genbank Accession Nb. NC_001640.1), and that the BAM file 'combined.bam' contains an alignment against both the nuclear and mitochondrial genomes. If so, the pipeline may be invoked as [...]
+
+.. code-block:: bash
+
+ # Case 1a: Analyse nuclear genome; results are placed in 'nuclear.zonkey'
+ $ paleomix zonkey run database.tar nuclear.bam
+ # Case 1b: Analyse nuclear genome; results are placed in 'my_results'
+ $ paleomix zonkey run database.tar nuclear.bam my_results
+
+    # Case 2a: Analyse mitochondrial genome; results are placed in 'mitochondrial.zonkey'
+ $ paleomix zonkey run database.tar mitochondrial.bam
+ # Case 2b: Analyse mitochondrial genome; results are placed in 'my_results'
+ $ paleomix zonkey run database.tar mitochondrial.bam my_results
+
+    # Case 3: Analyse both nuclear and mitochondrial genome, placing results in 'my_results'
+    $ paleomix zonkey run database.tar nuclear.bam mitochondrial.bam my_results
+
+    # Case 4a: Analyse both nuclear and mitochondrial genome; results are placed in 'combined.zonkey'
+    $ paleomix zonkey run database.tar combined.bam
+    # Case 4b: Analyse both nuclear and mitochondrial genome; results are placed in 'my_results'
+    $ paleomix zonkey run database.tar combined.bam my_results
+
+
+.. note::
+
+    The filenames used here have been chosen purely to illustrate each operation, and do not affect the operation of the pipeline.
+
+As shown above, the pipeline will place the resulting output files in a directory named after the input file by default. This behavior, however, can be overridden by the user by specifying a destination directory (cases 1b, 2b, and 4b). When specifying two input files, however, it is required to manually specify the directory in which to store output files (case 3).
+
+The resulting report may be accessed in the output directory under the name 'report.html', which contains summary statistics and figures for the analyses performed for the sample. The structure of directory containing the output files is described further in the :ref:`zonkey_filestructure` section.
+
+
+Multi-sample analysis
+---------------------
+
+As noted above, it is possible to analyze multiple, different samples in one go. This is accomplished by providing a text file containing a table of samples, with columns separated by tabs. The first column in this table specifies the name of the sample, while the second and third columns specify the location of one or two BAM alignments associated with that sample. The following example shows one such file corresponding to cases 1 - 4 described above.
+
+.. code-block:: bash
+
+ $ cat samples.txt
+ case_1 nuclear.bam
+ case_2 mitochondrial.bam
+ case_3 nuclear.bam mitochondrial.bam
+ case_4 combined.bam
+
+Processing of these samples is then carried out as shown above:
+
+.. code-block:: bash
+
+ # Case 5a) Analyse 3 samples; results are placed in 'samples.zonkey'
+ $ paleomix zonkey run database.tar samples.txt
+ # Case 5b) Analyse 3 samples; results are placed in 'my_results'
+ $ paleomix zonkey run database.tar samples.txt my_results
+
+The resulting directory contains a 'summary.html' file, providing an overview of all samples processed in the analyses, with link to the individual, per-sample, reports, as well as a sub-directory for each sample corresponding to that obtained from running individual analyses on each of the samples. The structure of directory containing the output files is further described in the :ref:`zonkey_filestructure` section.
+
+
+.. note::
+ Note that only upper-case and lower-case letters (a-z, and A-Z), as well as numbers (0-9), and underscores (_) are allowed in sample names.
+
+
+Rooting TreeMix trees
+---------------------
+
+By default, the Zonkey pipeline does not attempt to root TreeMix trees; this is because the out-group specified *must* form a monophyletic clade; if this is not the case (e.g. if the clade containing the two reference horse samples becomes paraphyletic due to the test sample nesting with one of them), TreeMix will fail to run to completion.
+
+Therefore it may be preferable to run the pipeline without specifying an outgroup, and then to specify the outgroup in a second run, once the placement of the sample is known. This is accomplished by using the --treemix-outgroup command-line option, listing the samples forming the out-group as a comma-separated list. For example, assuming that the following TreeMix tree was generated for our sample:
+
+.. image:: ../_static/zonkey/incl_ts_0_tree_unrooted.png
+
+If so, we may wish to root on the caballine specimen (all other command-line arguments omitted for simplicity):
+
+.. code-block:: bash
+
+ $ paleomix zonkey run ... --treemix-outgroup Sample,HPrz,HCab
+
+This yields a tree rooted using this group as the outgroup:
+
+.. image:: ../_static/zonkey/incl_ts_0_tree_rooted.png
+
+
+.. note::
+
+ Rooting of the tree will be handled automatically in future versions of the Zonkey pipeline.
+
+
+Mapping against mitochondrial genomes
+-------------------------------------
+
+In order to identify the species of the sire and dam, respectively, for F1 hybrids, the Zonkey pipeline allows for the construction of a maximum likelihood phylogeny using RAxML [Stamatakis2006]_ based on the mitochondrial genomes of reference panel (see Prerequisites, above) and a consensus sequence derived from the mitochondrial alignment provided for the sample being investigated.
+
+
+The resulting phylogeny is presented rooted on the mid-point:
+
+.. image:: ../_static/zonkey/mito_phylo.png
+
+
+As noted above, this requires that the sample has been mapped against the mitochondrial reference genome NC\_001640.1 (see `NCBI`_), corresponding to the 'MT' mitochondrial genome included with the equCab2 reference sequence (see `UCSC`_). In addition, it is possible to carry out mapping against the mitochondrial genomes of the samples in the Zonkey reference panel, by using the :ref:`bam_pipeline`.
+
+This is accomplished by running the Zonkey 'mito' command, which writes a simple BAM pipeline makefile template to a given directory, along with a directory containing the FASTA sequences of the reference mitochondrial genomes::
+
+ $ paleomix zonkey mito my_mapping/
+
+Please refer to the :ref:`bam_pipeline` documentation if you wish to use the BAM pipeline to perform the mapping itself. Once your data has been mapped against either or all of these mitochondrial genomes, the preferred BAM file (e.g. the alignment with the highest coverage) may be included in the analyses as described above.
+
+
+.. _NCBI: https://www.ncbi.nlm.nih.gov/nuccore/5835107
+.. _UCSC: https://genome.ucsc.edu/cgi-bin/hgGateway?clade=mammal&org=Horse&db=0
+.. _BAM alignments: http://samtools.github.io/hts-specs/SAMv1.pdf
diff --git a/examples b/examples
new file mode 120000
index 0000000..b4a7e0d
--- /dev/null
+++ b/examples
@@ -0,0 +1 @@
+paleomix/resources/examples
\ No newline at end of file
diff --git a/licenses/gpl.txt b/licenses/gpl.txt
new file mode 100644
index 0000000..94a9ed0
--- /dev/null
+++ b/licenses/gpl.txt
@@ -0,0 +1,674 @@
+ GNU GENERAL PUBLIC LICENSE
+ Version 3, 29 June 2007
+
+ Copyright (C) 2007 Free Software Foundation, Inc. <http://fsf.org/>
+ Everyone is permitted to copy and distribute verbatim copies
+ of this license document, but changing it is not allowed.
+
+ Preamble
+
+ The GNU General Public License is a free, copyleft license for
+software and other kinds of works.
+
+ The licenses for most software and other practical works are designed
+to take away your freedom to share and change the works. By contrast,
+the GNU General Public License is intended to guarantee your freedom to
+share and change all versions of a program--to make sure it remains free
+software for all its users. We, the Free Software Foundation, use the
+GNU General Public License for most of our software; it applies also to
+any other work released this way by its authors. You can apply it to
+your programs, too.
+
+ When we speak of free software, we are referring to freedom, not
+price. Our General Public Licenses are designed to make sure that you
+have the freedom to distribute copies of free software (and charge for
+them if you wish), that you receive source code or can get it if you
+want it, that you can change the software or use pieces of it in new
+free programs, and that you know you can do these things.
+
+ To protect your rights, we need to prevent others from denying you
+these rights or asking you to surrender the rights. Therefore, you have
+certain responsibilities if you distribute copies of the software, or if
+you modify it: responsibilities to respect the freedom of others.
+
+ For example, if you distribute copies of such a program, whether
+gratis or for a fee, you must pass on to the recipients the same
+freedoms that you received. You must make sure that they, too, receive
+or can get the source code. And you must show them these terms so they
+know their rights.
+
+ Developers that use the GNU GPL protect your rights with two steps:
+(1) assert copyright on the software, and (2) offer you this License
+giving you legal permission to copy, distribute and/or modify it.
+
+ For the developers' and authors' protection, the GPL clearly explains
+that there is no warranty for this free software. For both users' and
+authors' sake, the GPL requires that modified versions be marked as
+changed, so that their problems will not be attributed erroneously to
+authors of previous versions.
+
+ Some devices are designed to deny users access to install or run
+modified versions of the software inside them, although the manufacturer
+can do so. This is fundamentally incompatible with the aim of
+protecting users' freedom to change the software. The systematic
+pattern of such abuse occurs in the area of products for individuals to
+use, which is precisely where it is most unacceptable. Therefore, we
+have designed this version of the GPL to prohibit the practice for those
+products. If such problems arise substantially in other domains, we
+stand ready to extend this provision to those domains in future versions
+of the GPL, as needed to protect the freedom of users.
+
+ Finally, every program is threatened constantly by software patents.
+States should not allow patents to restrict development and use of
+software on general-purpose computers, but in those that do, we wish to
+avoid the special danger that patents applied to a free program could
+make it effectively proprietary. To prevent this, the GPL assures that
+patents cannot be used to render the program non-free.
+
+ The precise terms and conditions for copying, distribution and
+modification follow.
+
+ TERMS AND CONDITIONS
+
+ 0. Definitions.
+
+ "This License" refers to version 3 of the GNU General Public License.
+
+ "Copyright" also means copyright-like laws that apply to other kinds of
+works, such as semiconductor masks.
+
+ "The Program" refers to any copyrightable work licensed under this
+License. Each licensee is addressed as "you". "Licensees" and
+"recipients" may be individuals or organizations.
+
+ To "modify" a work means to copy from or adapt all or part of the work
+in a fashion requiring copyright permission, other than the making of an
+exact copy. The resulting work is called a "modified version" of the
+earlier work or a work "based on" the earlier work.
+
+ A "covered work" means either the unmodified Program or a work based
+on the Program.
+
+ To "propagate" a work means to do anything with it that, without
+permission, would make you directly or secondarily liable for
+infringement under applicable copyright law, except executing it on a
+computer or modifying a private copy. Propagation includes copying,
+distribution (with or without modification), making available to the
+public, and in some countries other activities as well.
+
+ To "convey" a work means any kind of propagation that enables other
+parties to make or receive copies. Mere interaction with a user through
+a computer network, with no transfer of a copy, is not conveying.
+
+ An interactive user interface displays "Appropriate Legal Notices"
+to the extent that it includes a convenient and prominently visible
+feature that (1) displays an appropriate copyright notice, and (2)
+tells the user that there is no warranty for the work (except to the
+extent that warranties are provided), that licensees may convey the
+work under this License, and how to view a copy of this License. If
+the interface presents a list of user commands or options, such as a
+menu, a prominent item in the list meets this criterion.
+
+ 1. Source Code.
+
+ The "source code" for a work means the preferred form of the work
+for making modifications to it. "Object code" means any non-source
+form of a work.
+
+ A "Standard Interface" means an interface that either is an official
+standard defined by a recognized standards body, or, in the case of
+interfaces specified for a particular programming language, one that
+is widely used among developers working in that language.
+
+ The "System Libraries" of an executable work include anything, other
+than the work as a whole, that (a) is included in the normal form of
+packaging a Major Component, but which is not part of that Major
+Component, and (b) serves only to enable use of the work with that
+Major Component, or to implement a Standard Interface for which an
+implementation is available to the public in source code form. A
+"Major Component", in this context, means a major essential component
+(kernel, window system, and so on) of the specific operating system
+(if any) on which the executable work runs, or a compiler used to
+produce the work, or an object code interpreter used to run it.
+
+ The "Corresponding Source" for a work in object code form means all
+the source code needed to generate, install, and (for an executable
+work) run the object code and to modify the work, including scripts to
+control those activities. However, it does not include the work's
+System Libraries, or general-purpose tools or generally available free
+programs which are used unmodified in performing those activities but
+which are not part of the work. For example, Corresponding Source
+includes interface definition files associated with source files for
+the work, and the source code for shared libraries and dynamically
+linked subprograms that the work is specifically designed to require,
+such as by intimate data communication or control flow between those
+subprograms and other parts of the work.
+
+ The Corresponding Source need not include anything that users
+can regenerate automatically from other parts of the Corresponding
+Source.
+
+ The Corresponding Source for a work in source code form is that
+same work.
+
+ 2. Basic Permissions.
+
+ All rights granted under this License are granted for the term of
+copyright on the Program, and are irrevocable provided the stated
+conditions are met. This License explicitly affirms your unlimited
+permission to run the unmodified Program. The output from running a
+covered work is covered by this License only if the output, given its
+content, constitutes a covered work. This License acknowledges your
+rights of fair use or other equivalent, as provided by copyright law.
+
+ You may make, run and propagate covered works that you do not
+convey, without conditions so long as your license otherwise remains
+in force. You may convey covered works to others for the sole purpose
+of having them make modifications exclusively for you, or provide you
+with facilities for running those works, provided that you comply with
+the terms of this License in conveying all material for which you do
+not control copyright. Those thus making or running the covered works
+for you must do so exclusively on your behalf, under your direction
+and control, on terms that prohibit them from making any copies of
+your copyrighted material outside their relationship with you.
+
+ Conveying under any other circumstances is permitted solely under
+the conditions stated below. Sublicensing is not allowed; section 10
+makes it unnecessary.
+
+ 3. Protecting Users' Legal Rights From Anti-Circumvention Law.
+
+ No covered work shall be deemed part of an effective technological
+measure under any applicable law fulfilling obligations under article
+11 of the WIPO copyright treaty adopted on 20 December 1996, or
+similar laws prohibiting or restricting circumvention of such
+measures.
+
+ When you convey a covered work, you waive any legal power to forbid
+circumvention of technological measures to the extent such circumvention
+is effected by exercising rights under this License with respect to
+the covered work, and you disclaim any intention to limit operation or
+modification of the work as a means of enforcing, against the work's
+users, your or third parties' legal rights to forbid circumvention of
+technological measures.
+
+ 4. Conveying Verbatim Copies.
+
+ You may convey verbatim copies of the Program's source code as you
+receive it, in any medium, provided that you conspicuously and
+appropriately publish on each copy an appropriate copyright notice;
+keep intact all notices stating that this License and any
+non-permissive terms added in accord with section 7 apply to the code;
+keep intact all notices of the absence of any warranty; and give all
+recipients a copy of this License along with the Program.
+
+ You may charge any price or no price for each copy that you convey,
+and you may offer support or warranty protection for a fee.
+
+ 5. Conveying Modified Source Versions.
+
+ You may convey a work based on the Program, or the modifications to
+produce it from the Program, in the form of source code under the
+terms of section 4, provided that you also meet all of these conditions:
+
+ a) The work must carry prominent notices stating that you modified
+ it, and giving a relevant date.
+
+ b) The work must carry prominent notices stating that it is
+ released under this License and any conditions added under section
+ 7. This requirement modifies the requirement in section 4 to
+ "keep intact all notices".
+
+ c) You must license the entire work, as a whole, under this
+ License to anyone who comes into possession of a copy. This
+ License will therefore apply, along with any applicable section 7
+ additional terms, to the whole of the work, and all its parts,
+ regardless of how they are packaged. This License gives no
+ permission to license the work in any other way, but it does not
+ invalidate such permission if you have separately received it.
+
+ d) If the work has interactive user interfaces, each must display
+ Appropriate Legal Notices; however, if the Program has interactive
+ interfaces that do not display Appropriate Legal Notices, your
+ work need not make them do so.
+
+ A compilation of a covered work with other separate and independent
+works, which are not by their nature extensions of the covered work,
+and which are not combined with it such as to form a larger program,
+in or on a volume of a storage or distribution medium, is called an
+"aggregate" if the compilation and its resulting copyright are not
+used to limit the access or legal rights of the compilation's users
+beyond what the individual works permit. Inclusion of a covered work
+in an aggregate does not cause this License to apply to the other
+parts of the aggregate.
+
+ 6. Conveying Non-Source Forms.
+
+ You may convey a covered work in object code form under the terms
+of sections 4 and 5, provided that you also convey the
+machine-readable Corresponding Source under the terms of this License,
+in one of these ways:
+
+ a) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by the
+ Corresponding Source fixed on a durable physical medium
+ customarily used for software interchange.
+
+ b) Convey the object code in, or embodied in, a physical product
+ (including a physical distribution medium), accompanied by a
+ written offer, valid for at least three years and valid for as
+ long as you offer spare parts or customer support for that product
+ model, to give anyone who possesses the object code either (1) a
+ copy of the Corresponding Source for all the software in the
+ product that is covered by this License, on a durable physical
+ medium customarily used for software interchange, for a price no
+ more than your reasonable cost of physically performing this
+ conveying of source, or (2) access to copy the
+ Corresponding Source from a network server at no charge.
+
+ c) Convey individual copies of the object code with a copy of the
+ written offer to provide the Corresponding Source. This
+ alternative is allowed only occasionally and noncommercially, and
+ only if you received the object code with such an offer, in accord
+ with subsection 6b.
+
+ d) Convey the object code by offering access from a designated
+ place (gratis or for a charge), and offer equivalent access to the
+ Corresponding Source in the same way through the same place at no
+ further charge. You need not require recipients to copy the
+ Corresponding Source along with the object code. If the place to
+ copy the object code is a network server, the Corresponding Source
+ may be on a different server (operated by you or a third party)
+ that supports equivalent copying facilities, provided you maintain
+ clear directions next to the object code saying where to find the
+ Corresponding Source. Regardless of what server hosts the
+ Corresponding Source, you remain obligated to ensure that it is
+ available for as long as needed to satisfy these requirements.
+
+ e) Convey the object code using peer-to-peer transmission, provided
+ you inform other peers where the object code and Corresponding
+ Source of the work are being offered to the general public at no
+ charge under subsection 6d.
+
+ A separable portion of the object code, whose source code is excluded
+from the Corresponding Source as a System Library, need not be
+included in conveying the object code work.
+
+ A "User Product" is either (1) a "consumer product", which means any
+tangible personal property which is normally used for personal, family,
+or household purposes, or (2) anything designed or sold for incorporation
+into a dwelling. In determining whether a product is a consumer product,
+doubtful cases shall be resolved in favor of coverage. For a particular
+product received by a particular user, "normally used" refers to a
+typical or common use of that class of product, regardless of the status
+of the particular user or of the way in which the particular user
+actually uses, or expects or is expected to use, the product. A product
+is a consumer product regardless of whether the product has substantial
+commercial, industrial or non-consumer uses, unless such uses represent
+the only significant mode of use of the product.
+
+ "Installation Information" for a User Product means any methods,
+procedures, authorization keys, or other information required to install
+and execute modified versions of a covered work in that User Product from
+a modified version of its Corresponding Source. The information must
+suffice to ensure that the continued functioning of the modified object
+code is in no case prevented or interfered with solely because
+modification has been made.
+
+ If you convey an object code work under this section in, or with, or
+specifically for use in, a User Product, and the conveying occurs as
+part of a transaction in which the right of possession and use of the
+User Product is transferred to the recipient in perpetuity or for a
+fixed term (regardless of how the transaction is characterized), the
+Corresponding Source conveyed under this section must be accompanied
+by the Installation Information. But this requirement does not apply
+if neither you nor any third party retains the ability to install
+modified object code on the User Product (for example, the work has
+been installed in ROM).
+
+ The requirement to provide Installation Information does not include a
+requirement to continue to provide support service, warranty, or updates
+for a work that has been modified or installed by the recipient, or for
+the User Product in which it has been modified or installed. Access to a
+network may be denied when the modification itself materially and
+adversely affects the operation of the network or violates the rules and
+protocols for communication across the network.
+
+ Corresponding Source conveyed, and Installation Information provided,
+in accord with this section must be in a format that is publicly
+documented (and with an implementation available to the public in
+source code form), and must require no special password or key for
+unpacking, reading or copying.
+
+ 7. Additional Terms.
+
+ "Additional permissions" are terms that supplement the terms of this
+License by making exceptions from one or more of its conditions.
+Additional permissions that are applicable to the entire Program shall
+be treated as though they were included in this License, to the extent
+that they are valid under applicable law. If additional permissions
+apply only to part of the Program, that part may be used separately
+under those permissions, but the entire Program remains governed by
+this License without regard to the additional permissions.
+
+ When you convey a copy of a covered work, you may at your option
+remove any additional permissions from that copy, or from any part of
+it. (Additional permissions may be written to require their own
+removal in certain cases when you modify the work.) You may place
+additional permissions on material, added by you to a covered work,
+for which you have or can give appropriate copyright permission.
+
+ Notwithstanding any other provision of this License, for material you
+add to a covered work, you may (if authorized by the copyright holders of
+that material) supplement the terms of this License with terms:
+
+ a) Disclaiming warranty or limiting liability differently from the
+ terms of sections 15 and 16 of this License; or
+
+ b) Requiring preservation of specified reasonable legal notices or
+ author attributions in that material or in the Appropriate Legal
+ Notices displayed by works containing it; or
+
+ c) Prohibiting misrepresentation of the origin of that material, or
+ requiring that modified versions of such material be marked in
+ reasonable ways as different from the original version; or
+
+ d) Limiting the use for publicity purposes of names of licensors or
+ authors of the material; or
+
+ e) Declining to grant rights under trademark law for use of some
+ trade names, trademarks, or service marks; or
+
+ f) Requiring indemnification of licensors and authors of that
+ material by anyone who conveys the material (or modified versions of
+ it) with contractual assumptions of liability to the recipient, for
+ any liability that these contractual assumptions directly impose on
+ those licensors and authors.
+
+ All other non-permissive additional terms are considered "further
+restrictions" within the meaning of section 10. If the Program as you
+received it, or any part of it, contains a notice stating that it is
+governed by this License along with a term that is a further
+restriction, you may remove that term. If a license document contains
+a further restriction but permits relicensing or conveying under this
+License, you may add to a covered work material governed by the terms
+of that license document, provided that the further restriction does
+not survive such relicensing or conveying.
+
+ If you add terms to a covered work in accord with this section, you
+must place, in the relevant source files, a statement of the
+additional terms that apply to those files, or a notice indicating
+where to find the applicable terms.
+
+ Additional terms, permissive or non-permissive, may be stated in the
+form of a separately written license, or stated as exceptions;
+the above requirements apply either way.
+
+ 8. Termination.
+
+ You may not propagate or modify a covered work except as expressly
+provided under this License. Any attempt otherwise to propagate or
+modify it is void, and will automatically terminate your rights under
+this License (including any patent licenses granted under the third
+paragraph of section 11).
+
+ However, if you cease all violation of this License, then your
+license from a particular copyright holder is reinstated (a)
+provisionally, unless and until the copyright holder explicitly and
+finally terminates your license, and (b) permanently, if the copyright
+holder fails to notify you of the violation by some reasonable means
+prior to 60 days after the cessation.
+
+ Moreover, your license from a particular copyright holder is
+reinstated permanently if the copyright holder notifies you of the
+violation by some reasonable means, this is the first time you have
+received notice of violation of this License (for any work) from that
+copyright holder, and you cure the violation prior to 30 days after
+your receipt of the notice.
+
+ Termination of your rights under this section does not terminate the
+licenses of parties who have received copies or rights from you under
+this License. If your rights have been terminated and not permanently
+reinstated, you do not qualify to receive new licenses for the same
+material under section 10.
+
+ 9. Acceptance Not Required for Having Copies.
+
+ You are not required to accept this License in order to receive or
+run a copy of the Program. Ancillary propagation of a covered work
+occurring solely as a consequence of using peer-to-peer transmission
+to receive a copy likewise does not require acceptance. However,
+nothing other than this License grants you permission to propagate or
+modify any covered work. These actions infringe copyright if you do
+not accept this License. Therefore, by modifying or propagating a
+covered work, you indicate your acceptance of this License to do so.
+
+ 10. Automatic Licensing of Downstream Recipients.
+
+ Each time you convey a covered work, the recipient automatically
+receives a license from the original licensors, to run, modify and
+propagate that work, subject to this License. You are not responsible
+for enforcing compliance by third parties with this License.
+
+ An "entity transaction" is a transaction transferring control of an
+organization, or substantially all assets of one, or subdividing an
+organization, or merging organizations. If propagation of a covered
+work results from an entity transaction, each party to that
+transaction who receives a copy of the work also receives whatever
+licenses to the work the party's predecessor in interest had or could
+give under the previous paragraph, plus a right to possession of the
+Corresponding Source of the work from the predecessor in interest, if
+the predecessor has it or can get it with reasonable efforts.
+
+ You may not impose any further restrictions on the exercise of the
+rights granted or affirmed under this License. For example, you may
+not impose a license fee, royalty, or other charge for exercise of
+rights granted under this License, and you may not initiate litigation
+(including a cross-claim or counterclaim in a lawsuit) alleging that
+any patent claim is infringed by making, using, selling, offering for
+sale, or importing the Program or any portion of it.
+
+ 11. Patents.
+
+ A "contributor" is a copyright holder who authorizes use under this
+License of the Program or a work on which the Program is based. The
+work thus licensed is called the contributor's "contributor version".
+
+ A contributor's "essential patent claims" are all patent claims
+owned or controlled by the contributor, whether already acquired or
+hereafter acquired, that would be infringed by some manner, permitted
+by this License, of making, using, or selling its contributor version,
+but do not include claims that would be infringed only as a
+consequence of further modification of the contributor version. For
+purposes of this definition, "control" includes the right to grant
+patent sublicenses in a manner consistent with the requirements of
+this License.
+
+ Each contributor grants you a non-exclusive, worldwide, royalty-free
+patent license under the contributor's essential patent claims, to
+make, use, sell, offer for sale, import and otherwise run, modify and
+propagate the contents of its contributor version.
+
+ In the following three paragraphs, a "patent license" is any express
+agreement or commitment, however denominated, not to enforce a patent
+(such as an express permission to practice a patent or covenant not to
+sue for patent infringement). To "grant" such a patent license to a
+party means to make such an agreement or commitment not to enforce a
+patent against the party.
+
+ If you convey a covered work, knowingly relying on a patent license,
+and the Corresponding Source of the work is not available for anyone
+to copy, free of charge and under the terms of this License, through a
+publicly available network server or other readily accessible means,
+then you must either (1) cause the Corresponding Source to be so
+available, or (2) arrange to deprive yourself of the benefit of the
+patent license for this particular work, or (3) arrange, in a manner
+consistent with the requirements of this License, to extend the patent
+license to downstream recipients. "Knowingly relying" means you have
+actual knowledge that, but for the patent license, your conveying the
+covered work in a country, or your recipient's use of the covered work
+in a country, would infringe one or more identifiable patents in that
+country that you have reason to believe are valid.
+
+ If, pursuant to or in connection with a single transaction or
+arrangement, you convey, or propagate by procuring conveyance of, a
+covered work, and grant a patent license to some of the parties
+receiving the covered work authorizing them to use, propagate, modify
+or convey a specific copy of the covered work, then the patent license
+you grant is automatically extended to all recipients of the covered
+work and works based on it.
+
+ A patent license is "discriminatory" if it does not include within
+the scope of its coverage, prohibits the exercise of, or is
+conditioned on the non-exercise of one or more of the rights that are
+specifically granted under this License. You may not convey a covered
+work if you are a party to an arrangement with a third party that is
+in the business of distributing software, under which you make payment
+to the third party based on the extent of your activity of conveying
+the work, and under which the third party grants, to any of the
+parties who would receive the covered work from you, a discriminatory
+patent license (a) in connection with copies of the covered work
+conveyed by you (or copies made from those copies), or (b) primarily
+for and in connection with specific products or compilations that
+contain the covered work, unless you entered into that arrangement,
+or that patent license was granted, prior to 28 March 2007.
+
+ Nothing in this License shall be construed as excluding or limiting
+any implied license or other defenses to infringement that may
+otherwise be available to you under applicable patent law.
+
+ 12. No Surrender of Others' Freedom.
+
+ If conditions are imposed on you (whether by court order, agreement or
+otherwise) that contradict the conditions of this License, they do not
+excuse you from the conditions of this License. If you cannot convey a
+covered work so as to satisfy simultaneously your obligations under this
+License and any other pertinent obligations, then as a consequence you may
+not convey it at all. For example, if you agree to terms that obligate you
+to collect a royalty for further conveying from those to whom you convey
+the Program, the only way you could satisfy both those terms and this
+License would be to refrain entirely from conveying the Program.
+
+ 13. Use with the GNU Affero General Public License.
+
+ Notwithstanding any other provision of this License, you have
+permission to link or combine any covered work with a work licensed
+under version 3 of the GNU Affero General Public License into a single
+combined work, and to convey the resulting work. The terms of this
+License will continue to apply to the part which is the covered work,
+but the special requirements of the GNU Affero General Public License,
+section 13, concerning interaction through a network will apply to the
+combination as such.
+
+ 14. Revised Versions of this License.
+
+ The Free Software Foundation may publish revised and/or new versions of
+the GNU General Public License from time to time. Such new versions will
+be similar in spirit to the present version, but may differ in detail to
+address new problems or concerns.
+
+ Each version is given a distinguishing version number. If the
+Program specifies that a certain numbered version of the GNU General
+Public License "or any later version" applies to it, you have the
+option of following the terms and conditions either of that numbered
+version or of any later version published by the Free Software
+Foundation. If the Program does not specify a version number of the
+GNU General Public License, you may choose any version ever published
+by the Free Software Foundation.
+
+ If the Program specifies that a proxy can decide which future
+versions of the GNU General Public License can be used, that proxy's
+public statement of acceptance of a version permanently authorizes you
+to choose that version for the Program.
+
+ Later license versions may give you additional or different
+permissions. However, no additional obligations are imposed on any
+author or copyright holder as a result of your choosing to follow a
+later version.
+
+ 15. Disclaimer of Warranty.
+
+ THERE IS NO WARRANTY FOR THE PROGRAM, TO THE EXTENT PERMITTED BY
+APPLICABLE LAW. EXCEPT WHEN OTHERWISE STATED IN WRITING THE COPYRIGHT
+HOLDERS AND/OR OTHER PARTIES PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY
+OF ANY KIND, EITHER EXPRESSED OR IMPLIED, INCLUDING, BUT NOT LIMITED TO,
+THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
+PURPOSE. THE ENTIRE RISK AS TO THE QUALITY AND PERFORMANCE OF THE PROGRAM
+IS WITH YOU. SHOULD THE PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF
+ALL NECESSARY SERVICING, REPAIR OR CORRECTION.
+
+ 16. Limitation of Liability.
+
+ IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING
+WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MODIFIES AND/OR CONVEYS
+THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, INCLUDING ANY
+GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING OUT OF THE
+USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED TO LOSS OF
+DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY YOU OR THIRD
+PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER PROGRAMS),
+EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE POSSIBILITY OF
+SUCH DAMAGES.
+
+ 17. Interpretation of Sections 15 and 16.
+
+ If the disclaimer of warranty and limitation of liability provided
+above cannot be given local legal effect according to their terms,
+reviewing courts shall apply local law that most closely approximates
+an absolute waiver of all civil liability in connection with the
+Program, unless a warranty or assumption of liability accompanies a
+copy of the Program in return for a fee.
+
+ END OF TERMS AND CONDITIONS
+
+ How to Apply These Terms to Your New Programs
+
+ If you develop a new program, and you want it to be of the greatest
+possible use to the public, the best way to achieve this is to make it
+free software which everyone can redistribute and change under these terms.
+
+ To do so, attach the following notices to the program. It is safest
+to attach them to the start of each source file to most effectively
+state the exclusion of warranty; and each file should have at least
+the "copyright" line and a pointer to where the full notice is found.
+
+ <one line to give the program's name and a brief idea of what it does.>
+ Copyright (C) <year> <name of author>
+
+ This program is free software: you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation, either version 3 of the License, or
+ (at your option) any later version.
+
+ This program is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Also add information on how to contact you by electronic and paper mail.
+
+ If the program does terminal interaction, make it output a short
+notice like this when it starts in an interactive mode:
+
+ <program> Copyright (C) <year> <name of author>
+ This program comes with ABSOLUTELY NO WARRANTY; for details type `show w'.
+ This is free software, and you are welcome to redistribute it
+ under certain conditions; type `show c' for details.
+
+The hypothetical commands `show w' and `show c' should show the appropriate
+parts of the General Public License. Of course, your program's commands
+might be different; for a GUI interface, you would use an "about box".
+
+ You should also get your employer (if you work as a programmer) or school,
+if any, to sign a "copyright disclaimer" for the program, if necessary.
+For more information on this, and how to apply and follow the GNU GPL, see
+<http://www.gnu.org/licenses/>.
+
+ The GNU General Public License does not permit incorporating your program
+into proprietary programs. If your program is a subroutine library, you
+may consider it more useful to permit linking proprietary applications with
+the library. If this is what you want to do, use the GNU Lesser General
+Public License instead of this License. But first, please read
+<http://www.gnu.org/philosophy/why-not-lgpl.html>.
diff --git a/licenses/mit.txt b/licenses/mit.txt
new file mode 100644
index 0000000..969d061
--- /dev/null
+++ b/licenses/mit.txt
@@ -0,0 +1,17 @@
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/misc/setup_bam_pipeline_example.makefile.yaml b/misc/setup_bam_pipeline_example.makefile.yaml
new file mode 100644
index 0000000..f60e5d1
--- /dev/null
+++ b/misc/setup_bam_pipeline_example.makefile.yaml
@@ -0,0 +1,35 @@
+# -*- mode: Yaml; -*-
+Options:
+ Platform: Illumina
+ QualityOffset: 33
+ SplitLanesByFilenames: no
+ CompressionFormat: gz
+
+ Aligners:
+ Program: Bowtie2
+
+ Bowtie2:
+ MinQuality: 0
+ --very-sensitive:
+
+ PCRDuplicates: no
+ RescaleQualities: no
+
+ ExcludeReads:
+ - Paired
+
+ Features: []
+
+
+Prefixes:
+ rCRS:
+ Path: 000_prefixes/rCRS.fasta
+
+
+ExampleProject:
+ Synthetic_Sample_1:
+ ACGATA:
+ Lane_2: 000_data/ACGATA_L2_R{Pair}_*.fastq.gz
+
+ GCTCTG:
+ Lane_2: 000_data/GCTCTG_L2_R1_*.fastq.gz
diff --git a/misc/setup_bam_pipeline_example.sh b/misc/setup_bam_pipeline_example.sh
new file mode 100644
index 0000000..567718e
--- /dev/null
+++ b/misc/setup_bam_pipeline_example.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+SP_SEED=${RANDOM}
+
+rm -rv 000_data
+mkdir -p 000_data
+for barcode in ACGATA GCTCTG TGCTCA;
+do
+ python $(dirname $0)/synthesize_reads.py 000_prefixes/rCRS.fasta 000_data/ \
+ --library-barcode=${barcode} \
+ --specimen-seed=${SP_SEED} \
+ --lanes-reads-mu=2500 \
+ --lanes-reads-sigma=500 \
+ --lanes-reads-per-file=1000 \
+ --lanes=2 \
+ --damage
+done
+
+rm -v 000_data/GCTCTG_L*R2*.gz
+rm -v 000_data/TGCTCA_L1_R2*.gz
+
+bam_pipeline run $(dirname $0)/setup_bam_pipeline_example.makefile.yaml --destination .
+
+mkdir -p 000_data/ACGATA_L2/
+mv ExampleProject/reads/Synthetic_Sample_1/ACGATA/Lane_2/reads.singleton.truncated.gz 000_data/ACGATA_L2/reads.singleton.truncated.gz
+mv ExampleProject/reads/Synthetic_Sample_1/ACGATA/Lane_2/reads.collapsed.gz 000_data/ACGATA_L2/reads.collapsed.gz
+mv ExampleProject/reads/Synthetic_Sample_1/ACGATA/Lane_2/reads.collapsed.truncated.gz 000_data/ACGATA_L2/reads.collapsed.truncated.gz
+
+mv ExampleProject/rCRS/Synthetic_Sample_1/GCTCTG/Lane_2/single.minQ0.bam 000_data/GCTCTG_L2.bam
+
+rm -v 000_data/ACGATA_L2_R*.fastq.gz
+rm -v 000_data/GCTCTG_L2_R1_*.fastq.gz
+rm -rv ExampleProject
diff --git a/misc/setup_phylo_pipeline_example.sh b/misc/setup_phylo_pipeline_example.sh
new file mode 100755
index 0000000..d33d329
--- /dev/null
+++ b/misc/setup_phylo_pipeline_example.sh
@@ -0,0 +1,50 @@
+#!/bin/bash
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+rm -rvf alignment/000_reads
+mkdir -p alignment/000_reads
+for PREFIX in `ls alignment/000_prefixes/*.fasta | grep -v rCRS`;
+do
+ SP_SEED=${RANDOM}
+ NAME=$(echo ${PREFIX} | sed -e's#alignment/000_prefixes/##' -e's#\..*##')
+ mkdir -p alignment/000_reads/${NAME/*\//}/
+
+ ./synthesize_reads.py ${PREFIX} alignment/000_reads/${NAME}/ \
+ --specimen-seed=${SP_SEED} \
+ --lanes-reads-mu=50000 \
+ --lanes-reads-sigma=500 \
+ --lanes-reads-per-file=10000 \
+ --reads-len=50 \
+ --lanes=1
+done
+
+# These links would not survive the package installation, so set them up here
+ln -sf ../../alignment/000_prefixes/ phylogeny/data/prefixes
+ln -sf ../../alignment phylogeny/data/samples
+
+# Create link to reference sequence
+mkdir -p phylogeny/data/refseqs
+ln -sf ../../../alignment/000_prefixes/rCRS.fasta phylogeny/data/refseqs/rCRS.rCRS.fasta
diff --git a/misc/skeleton.py b/misc/skeleton.py
new file mode 100644
index 0000000..017a800
--- /dev/null
+++ b/misc/skeleton.py
@@ -0,0 +1,37 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import sys
+import argparse
+
def parse_args(argv):
    """Parse command-line arguments; returns an argparse.Namespace.

    Currently no options are defined -- this is skeleton code."""
    return argparse.ArgumentParser().parse_args(argv)
+
+
def main(argv):
    """Script entry point; a stub that always reports success."""
    return 0


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
diff --git a/misc/synthesize_reads.py b/misc/synthesize_reads.py
new file mode 100755
index 0000000..5a91b9b
--- /dev/null
+++ b/misc/synthesize_reads.py
@@ -0,0 +1,406 @@
+#!/usr/bin/python -3
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import sys
+import math
+import gzip
+import random
+
+from optparse import \
+ OptionParser, \
+ OptionGroup
+
+from paleomix.common.sequences import \
+ reverse_complement
+from paleomix.common.formats.fasta import \
+ FASTA
+from paleomix.common.utilities import \
+ fragment
+from paleomix.common.sampling import \
+ weighted_sampling
+
+
+def _dexp(lambda_value, position):
+ return lambda_value * math.exp(-lambda_value * position)
+
+
+def _rexp(lambda_value, rng):
+ return - math.log(rng.random()) / lambda_value
+
+
def toint(value):
    """Round 'value' to the nearest whole number and return it as an int."""
    rounded = round(value)
    return int(rounded)
+
+
# Adapter appended after the insert on the forward strand (read from 5' ...);
# the '%s' placeholder receives the 6 bp library barcode (see Library, which
# asserts len(barcode) == 6 before formatting this template).
PCR1 = "AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC%sATCTCGTATGCCGTCTTCTGCTTG"
# Adapter appended on the reverse strand (read from 3' ...); per the original
# author's note, the reverse complement of the forward adapter.
PCR2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
+
+
def _get_indel_length(indel_lambda, rng):
    """Draw an indel length (always >= 1) from an exponential distribution."""
    drawn = _rexp(indel_lambda, rng)
    return toint(drawn) + 1
+
+
def _get_weighted_choices(rng, sub_rate, indel_rate):
    """Build a per-nucleotide sampler over the outcomes 'ACGTID'
    (substitution targets, Insertion, Deletion); the identity outcome for
    each reference base absorbs all remaining probability mass."""
    outcomes = "ACGTID"
    samplers = {}
    for reference_nt in "ACGT":
        weights = [sub_rate / 4] * 4 + [indel_rate / 2] * 2
        # The reference base keeps whatever probability is left over
        weights[outcomes.index(reference_nt)] = \
            1 - sum(weights) + sub_rate / 4
        samplers[reference_nt] = weighted_sampling(outcomes, weights, rng)
    return samplers
+
+
def _mutate_sequence(rng, choices, refseq, indel_lambda=0):
    """Apply substitutions and indels to 'refseq'.

    Returns (sequence, positions): the mutated sequence, plus -- for every
    emitted base -- the reference position it derives from. 'choices' are
    the per-base samplers from _get_weighted_choices, yielding A/C/G/T
    (keep or substitute), 'I' (insert) or 'D' (delete).
    """
    position = 0
    sequence, positions = [], []
    while position < len(refseq):
        ref_nt = refseq[position]
        if ref_nt not in "ACGT":
            # Ambiguous/unknown reference base: emit a random nucleotide
            # (no sampler is consumed for such bases)
            read_nt = rng.choice("ACGT")
        else:
            read_nt = choices[ref_nt].next()

        if read_nt == "D":
            # Deletion: skip one or more reference bases, emitting nothing
            for _ in xrange(_get_indel_length(indel_lambda, rng)):
                position += 1
        elif read_nt == "I":
            # Insertion: emit random bases mapped to the current position;
            # 'position' is NOT advanced, so the current reference base is
            # re-drawn on the next iteration
            for _ in xrange(_get_indel_length(indel_lambda, rng)):
                sequence.append(rng.choice("ACGT"))
                positions.append(position)
        else:
            # Match or substitution: emit one base, consume one ref base
            sequence.append(read_nt)
            positions.append(position)
            position += 1
    return "".join(sequence), positions
+
+
class Specimen(object):
    """Represents a specimen, from which samples are derived.

    The single-record reference genome is mutated once (seeded by
    --specimen-seed) to produce this specimen's genotype; 'positions'
    maps each base of the mutated sequence back to a reference coordinate.
    """
    def __init__(self, options, filename):
        # The input FASTA must contain exactly one record
        genome = list(FASTA.from_file(filename))
        assert len(genome) == 1, len(genome)

        self._genome = genome[0].sequence.upper()
        self._sequence = None
        self._positions = None
        # NOTE(review): never assigned anywhere in this class; the
        # 'annotations' property therefore always returns None.
        self._annotations = None

        self._mutate(options)

    def _mutate(self, options):
        # Dedicated seeded RNG, so the same genotype can be regenerated
        # independently of any other randomness in the pipeline
        rng = random.Random(options.specimen_seed)
        choices = _get_weighted_choices(rng, options.specimen_sub_rate,
                                        options.specimen_indel_rate)
        self._sequence, self._positions = \
            _mutate_sequence(rng, choices, self._genome,
                             options.specimen_indel_lambda)

    @property
    def sequence(self):
        # Mutated (genotyped) specimen sequence
        return self._sequence

    @property
    def positions(self):
        # Reference coordinate for each base in 'sequence'
        return self._positions

    @property
    def annotations(self):
        return self._annotations
+
+
class Sample(object):
    """Draws DNA fragments from a specimen, mixing endogenous fragments
    with random 'junk' sequences; the endogenous fraction is drawn once
    from a normal distribution and clamped to [0.01, 1]."""

    def __init__(self, options, specimen):
        self._specimen = specimen
        self._random = random.Random(options.sample_seed)
        self._options = options

        frac_endog = self._random.gauss(options.sample_endog_mu,
                                        options.sample_endog_sigma)
        # Clamp so at least some endogenous fragments are always produced
        self._frac_endog = min(1, max(0.01, frac_endog))
        self._endog_id = 0
        self._contam_id = 0

    def get_fragment(self):
        """Returns a tuple (is_endogenous, name, sequence): either a DNA
        fragment of the sample genome, or a randomly generated DNA sequence
        representing contaminant DNA that is not related to the species."""
        if self._random.random() <= self._frac_endog:
            return self._get_endogenous_sequence()
        return self._get_contaminant_sequence()

    def _get_contaminant_sequence(self):
        # Uniformly random bases, same length distribution as endogenous
        length = self._get_frag_len()
        sequence = [self._random.choice("ACGT") for _ in xrange(length)]

        self._contam_id += 1
        name = "Seq_junk_%i" % (self._contam_id,)
        return (False, name, "".join(sequence))

    def _get_endogenous_sequence(self):
        length = self._get_frag_len()
        max_position = len(self._specimen.sequence) - length
        position = self._random.randint(0, max_position)
        strand = self._random.choice(("fw", "rv"))

        sequence = self._specimen.sequence[position:position + length]
        # Reference coordinate of the fragment's first specimen base
        real_pos = self._specimen.positions[position]
        if strand == "rv":
            sequence = reverse_complement("".join(sequence))

        self._endog_id += 1
        # Read name encodes id, reference position, length and strand
        name = "Seq_%i_%i_%i_%s" % (self._endog_id, real_pos, length, strand)
        return (True, name, sequence)

    def _get_frag_len(self):
        # Normally distributed fragment length, truncated to [min, max]
        length = toint(self._random.gauss(self._options.sample_frag_len_mu,
                                          self._options.sample_frag_len_sigma))

        return max(self._options.sample_frag_len_min,
                   min(self._options.sample_frag_len_max, length))
+
+
class Damage(object):
    """Applies post-mortem damage (C->T towards the 5' end, G->A towards
    the 3' end) to endogenous fragments from a Sample; contaminant
    fragments pass through unchanged."""

    def __init__(self, options, sample):
        self._options = options
        self._sample = sample
        self._random = random.Random(options.damage_seed)
        self._rates = self._calc_damage_rates(options)

    def get_fragment(self):
        """Return (name, sequence); damage is applied only to endogenous
        fragments, and only when the --damage option was given."""
        is_endogenous, name, sequence = self._sample.get_fragment()
        if is_endogenous and self._options.damage:
            sequence = self._damage_sequence(sequence)
        return (name, sequence)

    def _damage_sequence(self, sequence):
        damaged = []
        total = len(sequence)
        for index, base in enumerate(sequence):
            if base == "C":
                # Deamination rate decays with distance from the 5' end
                if self._random.random() < self._rates[index]:
                    base = "T"
            elif base == "G":
                # Mirrored rate, measured from the 3' end instead
                if self._random.random() < self._rates[total - index - 1]:
                    base = "A"
            damaged.append(base)
        return "".join(damaged)

    @classmethod
    def _calc_damage_rates(cls, options):
        # Exponential decay of the damage rate along the read; precomputed
        # for every position up to the maximum possible fragment length
        decay = options.damage_lambda
        return [_dexp(decay, index)
                for index in range(options.sample_frag_len_max)]
+
+
class Library(object):
    """Builds a sequencing library from (damaged) sample fragments:
    attaches adapters carrying a 6 bp barcode, simulates PCR duplication,
    and distributes the resulting reads across a number of lanes."""

    def __init__(self, options, sample):
        self._options = options
        self._sample = sample
        self._cache = []
        self._rng = random.Random(options.library_seed)

        self.barcode = options.library_barcode
        if self.barcode is None:
            # No barcode specified; generate a random 6 bp barcode
            self.barcode = "".join(self._rng.choice("ACGT") for _ in range(6))
        # BUGFIX: the assert message previously evaluated 'options.barcode',
        # which is not a defined option attribute and would have raised
        # AttributeError instead of AssertionError on failure.
        assert len(self.barcode) == 6, self.barcode

        pcr1 = PCR1 % (self.barcode,)
        self.lanes = self._generate_lanes(options, self._rng, sample, pcr1)

    @classmethod
    def _generate_lanes(cls, options, rng, sample, pcr1):
        """Generates 'options.lanes_num' Lane objects, each receiving a
        normally distributed number of reads from a shared pool."""
        lane_counts = []
        for _ in xrange(options.lanes_num):
            # BUGFIX: previously called the module-level 'random.gauss',
            # ignoring the seeded generator and making --library-seed
            # non-reproducible.
            lane_counts.append(toint(rng.gauss(options.lanes_reads_mu,
                                               options.lanes_reads_sigma)))
        reads = cls._generate_reads(options, rng, sample,
                                    sum(lane_counts), pcr1)

        lanes = []
        for count in lane_counts:
            lanes.append(Lane(options, reads[:count]))
            reads = reads[count:]
        return lanes

    @classmethod
    def _generate_reads(cls, options, rng, sample, minimum, pcr1):
        """Generates at least 'minimum' adapter-ligated, PCR-duplicated
        reads as (name, forward, reverse) tuples, in randomized order."""
        reads = []
        while len(reads) < minimum:
            name, sequence = sample.get_fragment()
            cur_forward = sequence + pcr1
            cur_reverse = reverse_complement(sequence) + PCR2
            # Number of PCR copies -- minimum 1
            num_dupes = toint(_rexp(options.library_pcr_lambda, rng)) + 1
            for dupe_id in xrange(num_dupes):
                cur_name = "%s_%s" % (name, dupe_id)
                reads.append((cur_name, cur_forward, cur_reverse))
        # BUGFIX: shuffle with the seeded generator, not module 'random'
        rng.shuffle(reads)
        return reads
+
+
class Lane(object):
    """Applies per-read sequencing errors to library reads and pads or
    trims each mate to the fixed machine read-length.

    NOTE(review): the RNG here is unseeded, so lane-level errors differ
    between runs even when every seed option is set -- confirm whether
    this is intentional.
    """
    def __init__(self, options, reads):
        rng = random.Random()
        choices = _get_weighted_choices(rng, options.reads_sub_rate,
                                        options.reads_indel_rate)

        self._sequences = []
        for (name, forward, reverse) in reads:
            # Sequencing errors (substitutions/indels) on the forward mate
            forward, _ = _mutate_sequence(rng, choices, forward,
                                          options.reads_indel_lambda)

            # Pad with 'A' or trim to the fixed read length
            if len(forward) < options.reads_len:
                forward += "A" * (options.reads_len - len(forward))
            elif len(forward) > options.reads_len:
                forward = forward[:options.reads_len]

            # Same treatment for the reverse mate ('T' padding)
            reverse, _ = _mutate_sequence(rng, choices, reverse,
                                          options.reads_indel_lambda)

            if len(reverse) < options.reads_len:
                reverse += "T" * (options.reads_len - len(reverse))
            elif len(reverse) > options.reads_len:
                reverse = reverse[:options.reads_len]

            self._sequences.append((name, "".join(forward), "".join(reverse)))

    @property
    def sequences(self):
        # List of (name, forward, reverse) reads belonging to this lane
        return self._sequences
+
+
def parse_args(argv):
    """Parse CLI options; returns (options, args) where 'args' must be the
    two positional arguments <genome> and <prefix>, or (None, None) after
    printing usage if the argument count is wrong.

    NOTE: seed options deliberately have no type= and are passed to
    random.Random() as strings; any hashable value is a valid seed.
    """
    parser = OptionParser()

    group = OptionGroup(parser, "Specimen")
    # BUGFIX: help text read "speciment" / "a specific values"
    group.add_option("--specimen-seed", default=None,
                     help="Seed used to initialize the 'specimen', for the "
                          "creation of a random genotype. Set to a specific "
                          "value if runs are to be done for the same "
                          "genotype.")
    group.add_option("--specimen-sub-rate", default=0.005, type=float)
    group.add_option("--specimen-indel-rate", default=0.0005, type=float)
    group.add_option("--specimen-indel-lambda", default=0.9, type=float)
    parser.add_option_group(group)

    group = OptionGroup(parser, "Samples from specimens")
    group.add_option("--sample-seed", default=None)
    group.add_option("--sample-frag-length-mu",
                     dest="sample_frag_len_mu", default=100, type=int)
    group.add_option("--sample-frag-length-sigma",
                     dest="sample_frag_len_sigma", default=30, type=int)
    group.add_option("--sample-frag-length-min",
                     dest="sample_frag_len_min", default=0, type=int)
    group.add_option("--sample-frag-length-max",
                     dest="sample_frag_len_max", default=500, type=int)
    group.add_option("--sample-endogenous_mu",
                     dest="sample_endog_mu", default=0.75, type=float)
    group.add_option("--sample-endogenous_sigma",
                     dest="sample_endog_sigma", default=0.10, type=float)
    parser.add_option_group(group)

    group = OptionGroup(parser, "Post mortem damage of samples")
    group.add_option("--damage", dest="damage",
                     default=False, action="store_true")
    group.add_option("--damage-seed", dest="damage_seed", default=None)
    group.add_option("--damage-lambda", dest="damage_lambda",
                     default=0.25, type=float)
    parser.add_option_group(group)

    group = OptionGroup(parser, "Libraries from samples")
    group.add_option("--library-seed", dest="library_seed", default=None)
    group.add_option("--library-pcr-lambda", dest="library_pcr_lambda",
                     default=3, type=float)
    group.add_option("--library-barcode", dest="library_barcode", default=None)
    parser.add_option_group(group)

    group = OptionGroup(parser, "Lanes from libraries")
    group.add_option("--lanes", dest="lanes_num", default=3, type=int)
    group.add_option("--lanes-reads-mu", dest="lanes_reads_mu",
                     default=10000, type=int)
    group.add_option("--lanes-reads-sigma", dest="lanes_reads_sigma",
                     default=2500, type=int)
    group.add_option("--lanes-reads-per-file", dest="lanes_per_file",
                     default=2500, type=int)
    parser.add_option_group(group)

    group = OptionGroup(parser, "Reads from lanes")
    group.add_option("--reads-sub-rate", dest="reads_sub_rate",
                     default=0.005, type=float)
    group.add_option("--reads-indel-rate", dest="reads_indel_rate",
                     default=0.0005, type=float)
    group.add_option("--reads-indel-lambda",
                     dest="reads_indel_lambda", default=0.9, type=float)
    group.add_option("--reads-length", dest="reads_len", default=100, type=int)
    parser.add_option_group(group)

    options, args = parser.parse_args(argv)
    if len(args) != 2:
        sys.stderr.write("Usage: %s <genome> <prefix>\n" % sys.argv[0])
        return None, None
    return options, args
+
+
def main(argv):
    """Builds the Specimen -> Sample -> Damage -> Library chain from the
    CLI options, then writes each lane as pairs of gzipped FASTQ files.
    """
    options, args = parse_args(argv)
    if not options:
        return 1

    print("Generating %i lane(s) of synthetic reads ...\nDISCLAIMER: For "
          "demonstration of PALEOMIX only; the synthetic data is not "
          "biologically meaningful!" % (options.lanes_num,))

    specimen = Specimen(options, args[0])
    sample = Sample(options, specimen)
    damage = Damage(options, sample)
    library = Library(options, damage)

    for (lnum, lane) in enumerate(library.lanes, start=1):
        # Split each lane into files of at most --lanes-reads-per-file reads
        fragments = fragment(options.lanes_per_file, lane.sequences)
        for (readsnum, reads) in enumerate(fragments, start=1):
            # Filename template; the escaped '%%s' leaves a slot for the
            # mate number (1 / 2) to be filled in below
            templ = "%s%s_L%i_R%%s_%02i.fastq.gz" % (args[1], library.barcode,
                                                     lnum, readsnum)

            print(" Writing %s" % (templ % "{Pair}",))
            with gzip.open(templ % 1, "w") as out_1:
                with gzip.open(templ % 2, "w") as out_2:
                    # Constant dummy qualities: 'I' for mate 1, 'H' for mate 2
                    for (name, seq_1, seq_2) in reads:
                        out_1.write("@%s%s/1\n%s\n" % (library.barcode, name, seq_1))
                        out_1.write("+\n%s\n" % ("I" * len(seq_1),))
                        out_2.write("@%s%s/2\n%s\n" % (library.barcode, name, seq_2))
                        out_2.write("+\n%s\n" % ("H" * len(seq_2),))


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/__init__.py b/paleomix/__init__.py
new file mode 100644
index 0000000..005648e
--- /dev/null
+++ b/paleomix/__init__.py
@@ -0,0 +1,63 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
# Package version, exposed both as a tuple (for programmatic comparisons)
# and as the canonical dotted string used by setuptools.
__version_info__ = (1, 2, 7)
__version__ = '%i.%i.%i' % __version_info__
+
+
def run(command=None):
    """Main entry-point for setuptools: dispatches to paleomix.main.main,
    optionally prefixing sys.argv[1:] with a fixed sub-command name."""
    import sys
    import paleomix.main

    argv = [] if command is None else [command]
    argv.extend(sys.argv[1:])

    return paleomix.main.main(argv)
+
+
# The wrappers below preserve the per-tool console_scripts entry-points of
# older releases; each simply dispatches through 'run' with a fixed command.
def run_bam_pipeline():
    """Legacy entry-point for setuptools"""
    return run("bam_pipeline")


def run_gtf_to_bed():
    """Legacy entry-point for setuptools"""
    return run("gtf_to_bed")


def run_phylo_pipeline():
    """Legacy entry-point for setuptools"""
    return run("phylo_pipeline")


def run_rmdup_collapsed():
    """Legacy entry-point for setuptools"""
    return run("rmdup_collapsed")


def run_trim_pipeline():
    """Legacy entry-point for setuptools"""
    return run("trim_pipeline")
diff --git a/paleomix/atomiccmd/__init__.py b/paleomix/atomiccmd/__init__.py
new file mode 100644
index 0000000..90e5529
--- /dev/null
+++ b/paleomix/atomiccmd/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/paleomix/atomiccmd/builder.py b/paleomix/atomiccmd/builder.py
new file mode 100644
index 0000000..ceceaf3
--- /dev/null
+++ b/paleomix/atomiccmd/builder.py
@@ -0,0 +1,541 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Tools for passing CLI options to AtomicCmds used by Nodes.
+
+The module contains 1 class and 2 decorators, which may be used in conjunction
+to create Node classes for which the call carried out by AtomicCmds may be
+modified by the end user, without explicit support added to the init function
+of the class. The basic outline of such a class is as follows:
+
+
+class ExampleNode(CommandNode):
+ @create_customizable_cli_parameters
+ def customize(self, ...):
+ # Use passed parameters to create AtomicCmdBuilder obj
+ builder = AtomicCmdBuilder(...)
+ builder.set_option(...)
+
+ # Return dictionary of AtomicCmdBuilder objects and any
+ # additional parameters required to run the Node.
+ return {"command" : builder,
+ "example" : ...}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ # Create AtomicCmd object using (potentially tweaked) parameters
+ command = parameters.command.finalize()
+
+ # Do something with a parameter passed to customize
+ description = "<ExampleNode: %s>" % parameters.example
+
+ CommandNode.__init__(command = command,
+ description = description,
+ ...)
+
+This class can then be used in two ways:
+1) Without doing any explicit modifications to the CLI calls:
+>> node = ExampleNode(...)
+
+2) Retrieving and tweaking AtomicCmdBuilder before creating the Node:
+>> params = ExampleNode.customize(...)
+>> params.command.set_option(...)
+>> node = params.build_node()
+
+"""
+import os
+import types
+import inspect
+import subprocess
+import collections
+
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+
+import paleomix.common.versions as versions
+
+
class AtomicCmdBuilderError(RuntimeError):
    # Raised on invalid use of an AtomicCmdBuilder, e.g. modifying options
    # after finalization or overwriting a fixed option.
    pass
+
+
class AtomicCmdBuilder(object):
    """AtomicCmdBuilder is a class used to allow step-wise construction of an
    AtomicCmd object. This allows the user of a Node to modify the behavior
    of the called programs using some CLI parameters, without explicit support
    for these in the Node API. Some limitations are in place, to help catch
    cases where overwriting or adding a flag would break the Node.

    The system call is constructed in the following manner:
        $ <call> <options> <values>

    The components are defined as follows:
      <call>    - The minimal call needed to invoke the current program.
                  Typically this is just the name of the executable, but may
                  be a more complex set of values for nested calls
                  (e.g. java/scripts).
      <option>  - A flag, typically prefixed with one or two dashes, followed
                  by an optional value. The flag may be joined with the value
                  by a separator (e.g. '='), otherwise they are added to the
                  final call as separate values.
      <values>  - One or more values, e.g. paths or similar.

    Options are divided into two classes; singletons and non-singletons:
      Singletons     - May be specified at most once (using 'set_option'),
                       with subsequent calls to 'set_option' overwriting the
                       previous value of the option (if any).
      Non-singletons - May be specified one or more times (with 'add_option'),
                       with each subsequent call added to the list of existing
                       parameters.

    Furthermore, any parameter may be marked as fixed (typically because the
    node depends on that option being set), which prevents any subsequent
    call from modifying this option. By default, all options are fixed.

    Any number of keywords may be set, which are passed to the AtomicCmd
    object created by the AtomicCmdBuilder object (using 'set_kwargs'). The
    rules specified in the AtomicCmd documentation apply to these. If an
    AtomicCmdBuilder object is passed, this will be finalized as well.
    """
    # NOTE: derived from 'object' (new-style class) for consistency with the
    # rest of the codebase and so that the @property descriptors below are
    # guaranteed correct behavior under Python 2.

    def __init__(self, call, **kwargs):
        """See AtomicCmd.__init__ for parameters / keyword arguments."""
        self._call = safe_coerce_to_tuple(call)
        self._options = []   # option dicts, kept in insertion order
        self._values = []    # positional values appended after all options
        self._kwargs = {}    # keyword args forwarded to AtomicCmd
        self._object = None  # the AtomicCmd, set once finalize() is called

        self.set_kwargs(**kwargs)

    def set_option(self, key, value=None, sep=None, fixed=True):
        """Sets or overwrites an option that may be specified at most once.
        If the option has already been set using 'add_option', or was set
        with 'fixed' set to True, an AtomicCmdBuilderError is raised."""
        old_option = self._get_option_for_editing(key, singleton=True)
        new_option = {"Key": key,
                      "Value": value,
                      "Sep": sep,
                      "Fixed": fixed,
                      "Singleton": True}

        if old_option:
            if old_option["Fixed"]:
                # BUGFIX: message previously read "Attemping"
                message = "Attempting to overwrite fixed option: %r" % key
                raise AtomicCmdBuilderError(message)
            old_option.update(new_option)
        else:
            self._options.append(new_option)

    def add_option(self, key, value=None, sep=None, fixed=True):
        """Adds an option that may be specified one or more times. If the
        option has already been set using 'set_option', a AtomicCmdBuilderError
        will be raised.
        """
        # Previous values are not used, but checks are required
        self._get_option_for_editing(key, singleton=False)
        self._options.append({"Key": key,
                              "Value": value,
                              "Sep": sep,
                              "Fixed": fixed,
                              "Singleton": False})

    def pop_option(self, key):
        """Removes the most recently added non-fixed option with this key;
        raises KeyError if absent, AtomicCmdBuilderError if fixed."""
        old_option = self._get_option_for_editing(key, singleton=None)
        if not old_option:
            raise KeyError("Option with key %r does not exist" % key)
        elif old_option["Fixed"]:
            raise AtomicCmdBuilderError("Attempting to pop fixed key %r" % key)
        self._options.remove(old_option)

    def add_value(self, value):
        """Adds a positional value to the call. Usage should be restricted to
        paths and similar values, and set/add_option used for actual options.
        """
        self._values.append(value)

    def set_kwargs(self, **kwargs):
        """Adds keyword arguments for the eventual AtomicCmd; existing keys
        may not be overwritten, and no changes are allowed once finalized."""
        if self._object:
            message = "Parameters have already been finalized"
            raise AtomicCmdBuilderError(message)

        for key in kwargs:
            if key in self._kwargs:
                message = "Attempted to overwrite existing path: %r"
                raise AtomicCmdBuilderError(message % key)
        self._kwargs.update(kwargs)

    def add_multiple_options(self, key, values, sep=None,
                             template="IN_FILE_%02i"):
        """Add multiple options at once, with corresponding kwargs.

        The template determines the key-names used for the arguments,
        using numbers starting from 1 to differentiate between multiple
        values.
        """
        kwargs = {}
        for file_key, value in self._get_new_kwarg_keys(values, template):
            # The option value is a %-style placeholder resolved by AtomicCmd
            self.add_option(key, "%%(%s)s" % (file_key,),
                            sep=sep, fixed=True)
            kwargs[file_key] = value
        self.set_kwargs(**kwargs)
        return kwargs

    def add_multiple_values(self, values, template="IN_FILE_%02i"):
        """Add multiple values at once, with corresponding kwargs.

        The template determines the key-names used for the arguments,
        using numbers starting from 1 to differentiate between multiple
        values.
        """
        kwargs = {}
        for file_key, value in self._get_new_kwarg_keys(values, template):
            self.add_value("%%(%s)s" % (file_key,))
            kwargs[file_key] = value
        self.set_kwargs(**kwargs)
        return kwargs

    @property
    def call(self):
        """Returns the system-call based on the call passed to the constructor,
        and every parameter set or pushed using 'set_option' and 'add_option'.
        """
        command = list(self._call)
        for parameter in self._options:
            if parameter["Value"] is not None:
                if parameter["Sep"] is not None:
                    # Joined form: e.g. "--key=value"
                    command.append("%s%s%s" % (parameter["Key"],
                                               parameter["Sep"],
                                               parameter["Value"]))
                else:
                    # Separate form: "--key", "value"
                    command.append(parameter["Key"])
                    command.append(parameter["Value"])
            else:
                command.append(parameter["Key"])

        command.extend(self._values)
        return command

    @property
    def finalized_call(self):
        """Returns the system-call, as 'call', but with all key-values
        instantiated to the values passed to the AtomicCmdBuilder. This is
        intended for use with direct Popen calls.
        """
        kwargs = self.kwargs
        # NOTE(review): substitutes TEMP_DIR with the literal "%(TEMP_DIR)",
        # which lacks the trailing 's' of a valid %-placeholder -- confirm
        # whether "%(TEMP_DIR)s" was intended here.
        kwargs["TEMP_DIR"] = "%(TEMP_DIR)"
        return [(str(field) % kwargs) for field in self.call]

    @property
    def kwargs(self):
        """Returns a dictionary of keyword arguments as set by 'set_kwargs'.
        If the value of an argument is an AtomicCmdBuilder, then the builder
        is finalized and the resulting value is used."""
        kwargs = {}
        for (key, value) in self._kwargs.iteritems():
            if isinstance(value, AtomicCmdBuilder):
                value = value.finalize()
            kwargs[key] = value
        return kwargs

    def finalize(self):
        """Creates an AtomicCmd object based on the AtomicParam object. Once
        finalized, the AtomicCmdBuilder cannot be modified further."""
        if not self._object:
            self._object = AtomicCmd(self.call, **self.kwargs)

        return self._object

    def _get_option_for_editing(self, key, singleton):
        """Validates 'key' and returns the most recent matching option dict,
        or None; 'singleton' restricts the match to (non-)singletons, with
        None meaning either kind is acceptable."""
        if self._object:
            message = "AtomicCmdBuilder has already been finalized"
            raise AtomicCmdBuilderError(message)
        elif not isinstance(key, types.StringTypes):  # Python 2 str/unicode
            message = "Key must be a string, not %r" \
                % (key.__class__.__name__,)
            raise TypeError(message)
        elif not key:
            raise KeyError("Key cannot be an empty string")

        for option in reversed(self._options):
            if (option["Key"] == key):
                if (singleton is not None) \
                        and (option["Singleton"] != singleton):
                    message = "Mixing singleton and non-singleton options: %r"
                    raise AtomicCmdBuilderError(message % key)
                return option

    def _get_new_kwarg_keys(self, values, template):
        """Yields (key, value) pairs, generating for each value the first
        'template % n' key (n = 1, 2, ...) not already used in kwargs."""
        start = 0
        for value in values:
            start += 1
            key = template % (start,)
            while key in self._kwargs:
                start += 1
                key = template % (start,)
            yield key, value
+
+
class AtomicJavaCmdBuilder(AtomicCmdBuilder):
    """AtomicCmdBuilder for running java JARs.

    The resulting command will run the JAR in head-less mode, in order to ensure
    that the JARs can be run on head-less servers (and to avoid popups on OSX),
    using the process-specific temp-folder, and using at most a single thread
    for garbage collection (to ensure that thread-limits are obeyed).

    """

    def __init__(self, jar, jre_options=(), temp_root="%(TEMP_DIR)s",
                 gc_threads=1, java_version=(1, 6), **kwargs):
        """Parameters:
          jar          -- Path to a JAR file to be executed; is included as an
                          auxiliary file dependency in the final command.
          jre_options  -- List of CLI options to be passed to 'java' command.
          temp_root    -- Temp folder to use for java process; if not set, the
                          process specific temp folder is used.
          gc_threads   -- Number of threads to use during garbage collections.
          java_version -- Minimum JRE version required, as an int tuple.
          ...          -- Key-word args are passed to AtomicCmdBuilder.
        """
        call = ["java", "-server",
                "-Djava.io.tmpdir=%s" % temp_root,
                "-Djava.awt.headless=true"]

        # Validate gc_threads and pick the matching GC configuration
        if not isinstance(gc_threads, (types.IntType, types.LongType)):
            raise TypeError("'gc_threads' must be an integer value, not %r"
                            % gc_threads.__class__.__name__)
        elif gc_threads > 1:
            call.append("-XX:ParallelGCThreads=%i" % gc_threads)
        elif gc_threads == 1:
            call.append("-XX:+UseSerialGC")
        else:
            raise ValueError("'gc_threads' must be a 1 or greater, not %r"
                             % gc_threads)

        jre_options = tuple(jre_options)
        call.extend(jre_options)

        # Only set -Xmx if no user-supplied setting is given
        if not any(opt.startswith("-Xmx") for opt in jre_options):
            # Our experience is that the default -Xmx value tends to cause
            # OutOfMemory exceptions with typical datasets, so require at least
            # 4gb. However, this is not possible on 32bit systems, which cannot
            # handle such datasets in any case (due to e.g. BWA memory usage).
            if AtomicJavaCmdBuilder._IS_JAVA_64_BIT is None:
                # Probe once per process: 'java -d64 -version' exits with 0
                # only on a 64-bit JVM; the result is cached on the class.
                with open("/dev/null", "w") as dev_null:
                    version_call = call + ["-d64", "-version"]
                    try:
                        result = subprocess.call(version_call,
                                                 stdout=dev_null,
                                                 stderr=dev_null,
                                                 preexec_fn=os.setsid,
                                                 close_fds=True)

                        AtomicJavaCmdBuilder._IS_JAVA_64_BIT = (result == 0)
                    except OSError:
                        # We don't care if this fails here, the exec / version
                        # checks will report any problems downstream
                        AtomicJavaCmdBuilder._IS_JAVA_64_BIT = False

            # The default memory-limit tends to be insufficient for
            # whole-genome datasets, so it is increased on 64-bit systems.
            if AtomicJavaCmdBuilder._IS_JAVA_64_BIT:
                call.append("-Xmx4g")

        version = self._get_java_version(java_version)
        call.extend(("-jar", "%(AUX_JAR)s"))
        AtomicCmdBuilder.__init__(self, call,
                                  AUX_JAR=jar,
                                  CHECK_JRE=version,
                                  **kwargs)

    # Cached result of the 64-bit JVM probe above (None = not yet probed)
    _IS_JAVA_64_BIT = None

    @classmethod
    def _get_java_version(cls, version):
        """Returns (and caches in JAVA_VERSIONS) a version-check Requirement
        for the JRE, matching 'java -version' output such as
        'java version "1.7.0_65"' against the given minimum version."""
        version = tuple(map(int, version))
        if version not in JAVA_VERSIONS:
            regexp = r"[\._]".join(r"(\d+)" for _ in version)
            regexp = r'version "%s' % (regexp,)
            jre_call = ["java", "-Djava.awt.headless=true", "-version"]

            JAVA_VERSIONS[version] \
                = versions.Requirement(call=jre_call,
                                       name="JAVA Runtime Environment",
                                       search=regexp,
                                       checks=versions.GE(*version),
                                       priority=10)
        return JAVA_VERSIONS[version]
# Cache of JRE version-check requirements, keyed by minimum version tuple
JAVA_VERSIONS = {}
+
+
+class AtomicMPICmdBuilder(AtomicCmdBuilder):
+ """AtomicCmdBuilder for MPI enabled programs;
+
+ Simplifies specification of number of threads to use, only invoking the
+ 'mpi' command if more than one thread is used; furthermore, the 'mpi'
+ binary is used as a dependency, since MPI enabled programs tend to fail
+ catastrophically if the 'mpi' binary and associated libraries are missing.
+
+ """
+
+ def __init__(self, call, threads = 1, **kwargs):
+ """Takes a command (as for AtomicCmdBuilder) and a thread count;
+ 'call' is only wrapped in 'mpirun -n N' when threads > 1. Raises
+ TypeError for non-integer and ValueError for non-positive counts."""
+ if not isinstance(threads, (types.IntType, types.LongType)):
+ raise TypeError("'threads' must be an integer value, not %r" % threads.__class__.__name__)
+ elif threads < 1:
+ raise ValueError("'threads' must be 1 or greater, not %i" % threads)
+ elif threads == 1:
+ # Single-threaded: run the program directly, but still require
+ # that 'mpirun' exists (see class docstring).
+ AtomicCmdBuilder.__init__(self, call, EXEC_MPI = "mpirun", **kwargs)
+ else:
+ call = safe_coerce_to_tuple(call)
+ mpi_call = ["mpirun", "-n", threads]
+ mpi_call.extend(call)
+
+ AtomicCmdBuilder.__init__(self, mpi_call, EXEC_MAIN = call[0], **kwargs)
+
+
+def use_customizable_cli_parameters(init_func): # pylint: disable=C0103
+ """Decorator for __init__ functions, implementing the customizable Node
+ interface: Allows a node to be implemented either using default behavior:
+ >>> node = SomeNode(value1 = ..., value2 = ...)
+
+ Or using tweaked parameters for calls that support it:
+ >>> parameters = SomeNode.customize(value1 = ..., value2 = ...)
+ >>> parameters["command"].set_options(...)
+ >>> node = SomeNode(parameters)
+
+ To be able to use this interface, the class must implement a
+ function 'customize' that takes the parameters that the constructor
+ would take, while the constructor must take a 'parameters' argument.
+
+ """
+ if init_func.func_name != '__init__':
+ raise ValueError("Function name must be '__init__', not %r"
+ % (init_func.func_name,))
+
+ def do_call(self, parameters = None, **kwargs):
+ # If no pre-built parameter object was supplied, build a default
+ # one from the keyword arguments via the class' 'customize' method.
+ if not parameters:
+ parameters = self.customize(**kwargs)
+
+ return init_func(self, parameters)
+
+ return do_call
+
+
+def create_customizable_cli_parameters(customize_func): # pylint: disable=C0103
+ """Decorator complementing the 'use_customizable_cli_parameters' decorator
+ defined above, which should be used on a function named 'customize'; this
+ function is made a classmethod.
+
+ The modified function returns an object with a member for each keyword
+ parameter, and a 'build_node' function which calls the init function using
+ these parameter values. The initializer function is expected to take a
+ single argument, corresponding to the wrapper object.
+
+ Typically, the returned wrapper will include an AtomicCmdBuilder, which can
+ be modified by the user to directly modify the call carried out by the
+ resulting node.
+
+ class Example:
+ @create_customizable_cli_parameters
+ def customize(cls, first, second, third):
+ # (Typically) builds initial command
+ command = AtomicCmdBuilder(...)
+ return {"command" : command}
+
+ parameters = Example.customize(first = ..., second = ...)
+ print parameters.first
+ print parameters.second
+ # Modify command-builder object
+ parameters.command.set_option(...)
+
+ # Calls __init__ with the parameter object
+ node = parameters.build_node()
+
+ """
+ if customize_func.func_name != 'customize':
+ raise ValueError("Function name must be 'customize', not %r"
+ % (customize_func.func_name,))
+
+ def do_call(cls, *args, **kwargs):
+ # Build dictionary containing all arguments
+ kwargs = inspect.getcallargs(customize_func, cls, *args, **kwargs)
+ # Allow parameters to be updated in the 'customize' function
+ kwargs.update(customize_func(**kwargs))
+
+ return _create_cli_parameters_cls(cls, kwargs)
+
+ return classmethod(do_call)
+
+
+def apply_options(builder, options, pred = lambda s: s.startswith("-")):
+ """Applies a dictionary of options to a builder. By default, only
+ options where the key start with "-" are used (determined by 'pred').
+ The following rules are used when applying options:
+ - If a key is associated with a single value, 'set_option' is used.
+ - If a key is associated with a list of values, 'add_option' is used.
+ - If the key is associated with a boolean value, the option is set
+ if true (without a value) or removed from the call if false. This
+ allows easy setting/unsetting of '--do-something' type options.
+
+ Raises TypeError for non-string keys and for values that are neither
+ addable (str/int/long/float) nor setable (additionally bool/None).
+ """
+ for (key, values) in dict(options).iteritems():
+ if not isinstance(key, types.StringTypes):
+ raise TypeError("Keys must be strings, not %r" % (key.__class__.__name__,))
+ elif pred(key):
+ if isinstance(values, (types.ListType, types.TupleType)):
+ # A list/tuple of values: add one option per value; bools and
+ # None are rejected, since they carry no printable value.
+ for value in values:
+ if not isinstance(value, _ADDABLE_TYPES) or isinstance(value, _SETABLE_ONLY_TYPES):
+ raise TypeError("Unexpected type when adding options: %r" % (value.__class__.__name__,))
+ builder.add_option(key, value)
+ elif not isinstance(values, _SETABLE_TYPES):
+ raise TypeError("Unexpected type when setting option: %r" % (values.__class__.__name__,))
+ elif isinstance(values, (types.BooleanType, types.NoneType)):
+ # NOTE: None behaves like True here -- both set the option as a
+ # value-less flag; only an explicit False removes the option.
+ if values or values is None:
+ builder.set_option(key)
+ else:
+ builder.pop_option(key)
+ else:
+ builder.set_option(key, values)
+
+
+# Cache of (class, frozenset-of-kwarg-names) -> namedtuple type, so the
+# namedtuple class is only created once per unique parameter set.
+_create_cli_parameters_cls_cache = {}
+def _create_cli_parameters_cls(cls, kwargs):
+ """Build (or fetch from cache) a namedtuple-based wrapper with one field
+ per key in 'kwargs', plus a 'build_node' method that instantiates 'cls'
+ with the wrapper itself as the sole argument; returns a filled wrapper."""
+ key = (cls, frozenset(kwargs))
+ clsobj = _create_cli_parameters_cls_cache.get(key)
+ if not clsobj:
+ _create_cli_parameters_cls_cache[key] = clsobj = \
+ collections.namedtuple("CustomCLIParams", " ".join(kwargs))
+
+ class _ParametersWrapper(clsobj): # pylint: disable=W0232
+ def build_node(self):
+ return cls(self)
+
+ return _ParametersWrapper(**kwargs)
+
+
+
+# Value types accepted by apply_options (above): 'addable' types may be used
+# with add_option (they have a printable value); bool/None may only be used
+# with set_option/pop_option as value-less flags.
+_ADDABLE_TYPES = (types.FloatType, types.IntType, types.LongType) + types.StringTypes
+_SETABLE_ONLY_TYPES = (types.BooleanType, types.NoneType)
+_SETABLE_TYPES = _ADDABLE_TYPES + _SETABLE_ONLY_TYPES
diff --git a/paleomix/atomiccmd/command.py b/paleomix/atomiccmd/command.py
new file mode 100644
index 0000000..8148251
--- /dev/null
+++ b/paleomix/atomiccmd/command.py
@@ -0,0 +1,482 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import collections
+import errno
+import os
+import re
+import signal
+import sys
+import types
+import weakref
+
+import paleomix.atomiccmd.pprint as atomicpp
+import paleomix.common.fileutils as fileutils
+import paleomix.common.procs as procs
+import paleomix.common.signals as signals
+
+from paleomix.common.utilities import safe_coerce_to_tuple
+
+_PIPES = (("IN", "IN_STDIN"),
+ ("OUT", "OUT_STDOUT"),
+ ("OUT", "OUT_STDERR"))
+_KEY_RE = re.compile("^(IN|OUT|EXEC|AUX|CHECK|TEMP_IN|TEMP_OUT)_[A-Z0-9_]+")
+_FILE_MAP = {"IN": "input",
+ "OUT": "output",
+ "TEMP_IN": None,
+ "TEMP_OUT": "temporary_fname",
+ "EXEC": "executable",
+ "AUX": "auxiliary",
+ "CHECK": "requirements"}
+
+
+class CmdError(RuntimeError):
+ """Exception raised for errors in the construction, execution, or
+ committing of AtomicCmd (and related command-set) objects."""
+
+ def __init__(self, msg):
+ RuntimeError.__init__(self, msg)
+
+
+class AtomicCmd(object):
+ """Executes a command, only moving resulting files to the destination
+ directory if the command was successful. This helps prevent the
+ accidental use of partial files in downstream analysis, and eases
+ restarting of a pipeline following errors (no cleanup).
+
+ Individual files/paths in the command are specified using keywords (see
+ the documentation for the constructor), allowing the command to be
+ transparently modified to execute in a temporary directory.
+
+ When an AtomicCmd is run(), a signal handler is installed for SIGTERM,
+ which ensures that any running processes are terminated. In the absence
+ of this, AtomicCmds run in terminated subprocesses can result in still
+ running children after the termination of the parents."""
+ PIPE = procs.PIPE
+ DEVNULL = procs.DEVNULL
+
+ def __init__(self, command, set_cwd=False, **kwargs):
+ """Takes a command and a set of files.
+
+ The command is expected to be an iterable starting with the name of an
+ executable, with each item representing one string on the command line.
+ Thus, the command "find /etc -name 'profile*'" might be represented as
+ the list ["find", "/etc", "-name", "profile*"].
+
+ Commands typically consist of an executable, one or more input files,
+ one or more output files, and one or more pipes. In atomic command,
+ such files are not specified directly, but instead are specified using
+ keywords, which allows easy tracking of requirements and other
+ features. Note that only files, and not directories, are supported as
+ input/output!
+
+ Each keyword represents a type of file, as determined by the prefix:
+ IN_ -- Path to input file transformed/analysed by the executable.
+ OUT_ -- Path to output file generated by the executable. During
+ execution of the AtomicCmd, these paths are modified to
+ point to the temporary directory.
+ EXEC_ -- Name of / path to executable. The first item in the
+ command is always one of the executables, even if not
+ specified in this manner.
+ AUX_ -- Auxiliary files required by the executable(s), which are
+ themselves not executable. Examples include scripts,
+ config files, data-bases, and the like.
+ CHECK_ -- A callable, which upon calling does version checking,
+ raising an exception in the case of requirements not being
+ met. This may be used to ensure that prerequisites are met
+ before running the command. The function is not called by
+ AtomicCmd itself.
+
+ EXAMPLE 1: Creating a gzipped tar-archive from two files
+ The command "tar cjf output-file input-file-1 input-file-2" could be
+ represented using the following AtomicCmd:
+ cmd = AtomicCmd(["tar", "cjf", "%(OUT_FILE)s",
+ "%(IN_FILE_1)s", "%(IN_FILE_2)s"],
+ OUT_FILE = "output-file",
+ IN_FILE_1 = "input-file-1",
+ IN_FILE_2 = "input-file-2")
+
+ Note that files that are not directly invoked may be included above,
+ in order to allow the specification of requirements. This could include
+ required data files, or executables indirectly executed by a script.
+
+ If the above is prefixed with "TEMP_", files are read from / written
+ to the temporary folder in which the command is executed. Note that all
+ TEMP_OUT_ files are deleted when commit is called (if they exist), and
+ only filenames (not dirname component) are allowed for TEMP_ values.
+
+ In addition, the following special names may be used with the above:
+ STDIN_ -- Takes a filename, or an AtomicCmd, in which case stdout
+ of that command is piped to the stdin of this instance.
+ STDOUT_ -- Takes a filename, or the special value PIPE to allow
+ another AtomicCmd instance to use the output directly.
+ STDERR_ -- Takes a filename.
+
+ Each pipe can only be used once, with or without the TEMP_ prefix.
+
+ EXAMPLE 2: zcat'ing an archive
+ The command "zcat input-file > output-file" could be represented using
+ the following AtomicCmd:
+ cmd = AtomicCmd(["zcat", "%(IN_FILE)s"],
+ IN_FILE = "input-file",
+ OUT_STDOUT = "output-file")
+
+ If 'set_cwd' is True, the current working directory is set to the
+ temporary directory before the command is executed. Input paths are
+ automatically turned into absolute paths in this case."""
+ self._proc = None
+ self._temp = None
+ self._running = False
+ self._command = map(str, safe_coerce_to_tuple(command))
+ self._set_cwd = set_cwd
+ if not self._command or not self._command[0]:
+ raise ValueError("Empty command in AtomicCmd constructor")
+
+ arguments = self._process_arguments(id(self), self._command, kwargs)
+ self._files = self._build_files_dict(arguments)
+ self._file_sets = self._build_files_map(self._command, arguments)
+
+ # Dry-run, to catch errors early
+ self._generate_call("/tmp")
+
+ def run(self, temp, wrap_errors=True):
+ """Runs the given command, saving files in the specified temp folder.
+ To move files to their final destination, call commit(). Note that in
+ contexts where the *Cmds classes are used, this function may block.
+
+ If 'wrap_errors' is True (the default), any error raised while
+ starting the process is re-raised as a CmdError describing the call.
+ """
+ if self._running:
+ raise CmdError("Calling 'run' on already running command.")
+ self._temp = temp
+ self._running = True
+
+ # kwords for pipes are always built relative to the current directory,
+ # since these are opened before (possibly) CD'ing to the temp
+ # directory.
+ stdin = stdout = stderr = None
+ try:
+ kwords = self._generate_filenames(self._files, root=temp)
+ stdin = self._open_pipe(kwords, "IN_STDIN", "rb")
+ stdout = self._open_pipe(kwords, "OUT_STDOUT", "wb")
+ stderr = self._open_pipe(kwords, "OUT_STDERR", "wb")
+
+ cwd = temp if self._set_cwd else None
+ temp = "" if self._set_cwd else os.path.abspath(temp)
+ call = self._generate_call(temp)
+
+ # Explicitly set to DEVNULL to ensure that STDIN is not left open.
+ if stdin is None:
+ stdin = self.DEVNULL
+
+ # os.setsid places the child in its own session, so that the whole
+ # process group can be signalled via os.killpg (see terminate).
+ self._proc = procs.open_proc(call,
+ stdin=stdin,
+ stdout=stdout,
+ stderr=stderr,
+ cwd=cwd,
+ preexec_fn=os.setsid)
+ except StandardError, error:
+ if not wrap_errors:
+ raise
+
+ message = \
+ "Error running commands:\n" \
+ " Call = %r\n" \
+ " Error = %r"
+ raise CmdError(message % (self._command, error))
+ finally:
+ # Close pipes to allow the command to receive SIGPIPE
+ for handle in (stdin, stdout, stderr):
+ if handle not in (None, self.PIPE, self.DEVNULL):
+ handle.close()
+
+ # Allow subprocesses to be killed in case of a SIGTERM
+ _add_to_killlist(self._proc)
+
+ def ready(self):
+ """Returns true if the command has been run to completion,
+ regardless of whether or not an error occurred."""
+ return self._proc and self._proc.poll() is not None
+
+ def join(self):
+ """Similar to Popen.wait(), but returns the value wrapped in a list,
+ and ensures that any opened handles are closed. Must be called before
+ calling commit."""
+ if not self._proc:
+ return [None]
+
+ self._running = False
+ return_code = self._proc.wait()
+ # Negative return codes indicate death by signal; report the signal
+ # name (a string) rather than the raw number.
+ if return_code < 0:
+ return_code = signals.to_str(-return_code)
+ return [return_code]
+
+ def wait(self):
+ """Equivalent to Subproces.wait. This function should only
+ be used in contexts where a AtomicCmd needs to be combined
+ with Subprocesses, as it does not exist for AtomicSets."""
+ return self.join()[0]
+
+ def terminate(self):
+ """Sends SIGTERM to process if it is still running.
+ Has no effect if the command has already finished."""
+ if self._proc and self._proc.poll() is None:
+ try:
+ # Signal the entire process group (see os.setsid in run).
+ os.killpg(self._proc.pid, signal.SIGTERM)
+ except OSError:
+ pass # Already dead / finished process
+
+ # Properties, returning filenames from self._file_sets
+ def _property_file_sets(key): # pylint: disable=E0213
+ def _get_property_files(self):
+ return self._file_sets[key] # pylint: disable=W0212
+ return property(_get_property_files)
+
+ executables = _property_file_sets("executable")
+ requirements = _property_file_sets("requirements")
+ input_files = _property_file_sets("input")
+ output_files = _property_file_sets("output")
+ auxiliary_files = _property_file_sets("auxiliary")
+ expected_temp_files = _property_file_sets("output_fname")
+ optional_temp_files = _property_file_sets("temporary_fname")
+
+ def commit(self, temp):
+ """Moves OUT_ files from the temporary folder to their final
+ destination and deletes TEMP_OUT_ files; 'temp' must be the same
+ folder passed to run(). Raises CmdError if the command has not
+ completed, join() has not been called, or expected files are
+ missing. On failure, already-moved files are cleaned up again."""
+ if not self.ready():
+ raise CmdError("Attempting to commit before command has completed")
+ elif self._running:
+ raise CmdError("Called 'commit' before calling 'join'")
+ elif not os.path.samefile(self._temp, temp):
+ raise CmdError("Mismatch between previous and current temp folders"
+ ": %r != %s" % (self._temp, temp))
+
+ missing_files = self.expected_temp_files - set(os.listdir(temp))
+ if missing_files:
+ raise CmdError("Expected files not created: %s"
+ % (", ".join(missing_files)))
+
+ temp = os.path.abspath(temp)
+ filenames = self._generate_filenames(self._files, temp)
+ committed_files = set()
+ try:
+ for (key, filename) in filenames.iteritems():
+ if isinstance(filename, types.StringTypes):
+ if key.startswith("OUT_"):
+ fileutils.move_file(filename, self._files[key])
+ committed_files.add(self._files[key])
+ elif key.startswith("TEMP_OUT_"):
+ fileutils.try_remove(filename)
+ except:
+ # Cleanup after failed commit
+ for fpath in committed_files:
+ fileutils.try_remove(fpath)
+ raise
+
+ self._proc = None
+ self._temp = None
+
+ def __str__(self):
+ return atomicpp.pformat(self)
+
+ def _generate_call(self, temp):
+ """Returns the final command-line (a list of strings) with every
+ %(KEY)s placeholder substituted for its path rooted at 'temp';
+ raises CmdError for malformed templates or unknown keys."""
+ kwords = self._generate_filenames(self._files, root=temp)
+
+ try:
+ return [(field % kwords) for field in self._command]
+ except (TypeError, ValueError), error:
+ raise CmdError("Error building Atomic Command:\n"
+ " Call = %s\n Error = %s: %s"
+ % (self._command, error.__class__.__name__, error))
+ except KeyError, error:
+ raise CmdError("Error building Atomic Command:\n"
+ " Call = %s\n Value not specified for path = %s"
+ % (self._command, error))
+
+ @classmethod
+ def _process_arguments(cls, proc_id, command, kwargs):
+ """Groups the keyword arguments by their prefix (IN, OUT, EXEC, AUX,
+ CHECK, TEMP_IN, TEMP_OUT), adds default temp-file destinations for
+ any un-specified STDOUT/STDERR pipes, and validates the result.
+ Raises ValueError for keys not matching a known prefix."""
+ arguments = collections.defaultdict(dict)
+ for (key, value) in kwargs.iteritems():
+ match = _KEY_RE.match(key)
+ if not match:
+ raise ValueError("Invalid keyword argument %r" % (key,))
+
+ # None is ignored, to make use of default arguments easier
+ if value is not None:
+ group, = match.groups()
+ arguments[group][key] = value
+
+ # Pipe stdout/err to files by default
+ executable = os.path.basename(command[0])
+ for pipe in ("STDOUT", "STDERR"):
+ has_out_pipe = ("OUT_" + pipe) in arguments["OUT"]
+ has_temp_out_pipe = ("TEMP_OUT_" + pipe) in arguments["TEMP_OUT"]
+ if not (has_out_pipe or has_temp_out_pipe):
+ # 'proc_id' (id() of the AtomicCmd) makes the name unique
+ filename = "pipe_%s_%i.%s" % (executable, proc_id,
+ pipe.lower())
+ arguments["TEMP_OUT"]["TEMP_OUT_" + pipe] = filename
+
+ cls._validate_arguments(arguments)
+ cls._validate_output_files(arguments)
+ cls._validate_pipes(arguments)
+
+ return arguments
+
+ @classmethod
+ def _validate_arguments(cls, arguments):
+ """Type-checks every grouped argument value; raises TypeError for
+ unsupported value types and ValueError for TEMP_ paths that contain
+ a directory component."""
+ # Output files
+ for group in ("OUT", "TEMP_OUT"):
+ for (key, value) in arguments.get(group, {}).iteritems():
+ if isinstance(value, types.StringTypes):
+ continue
+
+ if key in ("OUT_STDOUT", "TEMP_OUT_STDOUT"):
+ if value not in (cls.PIPE, cls.DEVNULL):
+ raise TypeError("STDOUT must be a string, PIPE "
+ "or DEVNULL, not %r" % (value,))
+ elif key in ("OUT_STDERR", "TEMP_OUT_STDERR"):
+ if value is not cls.DEVNULL:
+ raise TypeError("STDERR must be a string, "
+ "or DEVNULL, not %r" % (value,))
+ else:
+ raise TypeError("%s must be string, not %r" % (key, value))
+
+ # Input files, including executables and auxiliary files
+ for group in ("IN", "TEMP_IN", "EXEC", "AUX"):
+ for (key, value) in arguments.get(group, {}).iteritems():
+ if isinstance(value, types.StringTypes):
+ continue
+
+ if key in ("IN_STDIN", "TEMP_IN_STDIN"):
+ if not isinstance(value, AtomicCmd) \
+ and value is not cls.DEVNULL:
+ raise TypeError("STDIN must be string, AtomicCmd, "
+ "or DEVNULL, not %r" % (value,))
+ else:
+ raise TypeError("%s must be string, not %r" % (key, value))
+
+ for (key, value) in arguments.get("CHECK", {}).iteritems():
+ if not isinstance(value, collections.Callable):
+ raise TypeError("%s must be callable, not %r" % (key, value))
+
+ # TEMP_ files live directly in the temp folder; a dirname component
+ # would escape it.
+ for group in ("TEMP_IN", "TEMP_OUT"):
+ for (key, value) in arguments.get(group, {}).iteritems():
+ is_string = isinstance(value, types.StringTypes)
+ if is_string and os.path.dirname(value):
+ raise ValueError("%s cannot contain dir component: %r"
+ % (key, value))
+
+ return True
+
+ @classmethod
+ def _validate_output_files(cls, arguments):
+ """Raises ValueError if two output keys map to the same basename;
+ such files would overwrite each other in the temp folder."""
+ output_files = collections.defaultdict(list)
+ for group in ("OUT", "TEMP_OUT"):
+ for (key, value) in arguments.get(group, {}).iteritems():
+ if isinstance(value, types.StringTypes):
+ filename = os.path.basename(value)
+ output_files[filename].append(key)
+
+ for (filename, keys) in output_files.iteritems():
+ if len(keys) > 1:
+ raise ValueError("Same output filename (%s) is specified for "
+ "multiple keys: %s"
+ % (filename, ", ".join(sorted(keys))))
+
+ @classmethod
+ def _validate_pipes(cls, arguments):
+ """Raises CmdError if any pipe (STDIN/STDOUT/STDERR) is specified
+ both with and without the TEMP_ prefix."""
+ for (group, pipe) in _PIPES:
+ has_pipe = pipe in arguments[group]
+ has_temp_pipe = ("TEMP_" + pipe) in arguments["TEMP_" + group]
+ if has_pipe and has_temp_pipe:
+ raise CmdError("Pipes may only be specified once")
+
+ @classmethod
+ def _open_pipe(cls, kwords, pipe, mode):
+ """Returns a file handle for the given pipe key (or its TEMP_
+ variant): the stdout of an upstream AtomicCmd for command chaining,
+ the PIPE/DEVNULL sentinel (or None) unchanged, or an opened file."""
+ filename = kwords.get(pipe, kwords.get("TEMP_" + pipe))
+ if filename in (None, cls.PIPE, cls.DEVNULL):
+ return filename
+ elif isinstance(filename, AtomicCmd):
+ # pylint: disable=W0212
+ return filename._proc and filename._proc.stdout
+
+ return open(filename, mode)
+
+ @classmethod
+ def _generate_filenames(cls, files, root):
+ """Returns a key -> path dict with TEMP_/OUT_ files re-rooted in
+ 'root'; when 'root' is empty (set_cwd mode), IN_/AUX_ paths are made
+ absolute instead, so they remain valid after chdir'ing."""
+ filenames = {"TEMP_DIR": root}
+ for (key, filename) in files.iteritems():
+ if isinstance(filename, types.StringTypes):
+ if key.startswith("TEMP_") or key.startswith("OUT_"):
+ filename = os.path.join(root, os.path.basename(filename))
+ elif not root and (key.startswith("IN_") or key.startswith("AUX_")):
+ filename = os.path.abspath(filename)
+ filenames[key] = filename
+
+ return filenames
+
+ @classmethod
+ def _build_files_dict(cls, arguments):
+ """Flattens the grouped arguments into a single key -> value dict."""
+ files = {}
+ for groups in arguments.itervalues():
+ for (key, value) in groups.iteritems():
+ files[key] = value
+
+ return files
+
+ @classmethod
+ def _build_files_map(cls, command, arguments):
+ """Builds the category -> frozenset-of-files map backing the public
+ properties (input_files, output_files, executables, etc.)."""
+ file_sets = dict((key, set()) for key in _FILE_MAP.itervalues())
+
+ file_sets["executable"].add(command[0])
+ for (group, files) in arguments.iteritems():
+ group_set = file_sets[_FILE_MAP[group]]
+
+ for (key, filename) in files.iteritems():
+ is_string = isinstance(filename, types.StringTypes)
+ if is_string or key.startswith("CHECK_"):
+ group_set.add(filename)
+
+ # expected_temp_files are matched against os.listdir(temp) in commit,
+ # so only the basenames are recorded.
+ file_sets["output_fname"] = map(os.path.basename, file_sets["output"])
+
+ return dict(zip(file_sets.iterkeys(),
+ map(frozenset, file_sets.itervalues())))
+
+
+# The following ensures proper cleanup of child processes, for example in the
+# case where multiprocessing.Pool.terminate() is called. _PROCS holds weak
+# references to running Popen objects; None until the first command is run.
+_PROCS = None
+
+
+def _cleanup_children(signum, _frame):
+ """SIGTERM handler: sends SIGTERM to the process group of every still
+ referenced child process, then exits with the (negated) signal number."""
+ for proc_ref in list(_PROCS):
+ proc = proc_ref()
+ if proc:
+ try:
+ os.killpg(proc.pid, signal.SIGTERM)
+ except OSError:
+ # Ignore already closed processes, etc.
+ pass
+ sys.exit(-signum)
+
+
+def _add_to_killlist(proc):
+ """Registers a Popen object for cleanup on SIGTERM; installs the signal
+ handler lazily on first use. Weak references are used so that finished
+ processes are dropped from the set automatically."""
+ global _PROCS
+
+ if _PROCS is None:
+ signal.signal(signal.SIGTERM, _cleanup_children)
+ _PROCS = set()
+
+ _PROCS.add(weakref.ref(proc, _PROCS.remove))
diff --git a/paleomix/atomiccmd/pprint.py b/paleomix/atomiccmd/pprint.py
new file mode 100644
index 0000000..94e624e
--- /dev/null
+++ b/paleomix/atomiccmd/pprint.py
@@ -0,0 +1,198 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# pylint: disable=W0212
+
+from __future__ import print_function
+
+import os
+import sys
+import types
+import subprocess
+
+
+def _is_cls(obj, *cls_names):
+ # Compared by class name rather than isinstance, to avoid importing the
+ # command/sets modules here (which import this module).
+ return obj.__class__.__name__ in cls_names
+
+def _get_pipe_name(files, pipe):
+ # A trailing '*' marks a TEMP_ pipe (path relative to the temp folder)
+ if pipe in files:
+ return pipe.split("_")[-1] + " "
+ return pipe.split("_")[-1] + "*"
+
+def _get_pipe_file(files, pipe):
+ # Returns the value for 'pipe', falling back to its TEMP_ variant
+ pipe_filename = files.get(pipe)
+ if pipe_filename:
+ return pipe_filename
+ return files.get("TEMP_%s" % (pipe,))
+
+def _describe_cls(atomiccmd):
+ """Returns a human-readable label for a command-set object."""
+ if _is_cls(atomiccmd, "ParallelCmds"):
+ return "Parallel commands"
+ elif _is_cls(atomiccmd, "SequentialCmds"):
+ return "Sequential commands"
+ assert False # pragma: no coverage
+
+
+def _collect_stats(atomiccmd, stats):
+ """Recursively assigns a display id to every AtomicCmd reachable from
+ 'atomiccmd' (stats["id"]) and records which command consumes another
+ command's stdout via its stdin (stats["pipe"]); returns 'stats'."""
+ assert atomiccmd not in stats["id"]
+
+ if _is_cls(atomiccmd, "AtomicCmd"):
+ stats["id"][atomiccmd] = len(stats["id"])
+ pipe = _get_pipe_file(atomiccmd._files, "IN_STDIN")
+ if _is_cls(pipe, "AtomicCmd"):
+ stats["pipe"][pipe] = atomiccmd
+ elif _is_cls(atomiccmd, "ParallelCmds", "SequentialCmds"):
+ for subcmd in atomiccmd._commands:
+ _collect_stats(subcmd, stats)
+ else:
+ assert False # pragma: no coverage
+
+ return stats
+
+
+def _build_status(atomiccmd, _stats, indent, lines):
+ """Appends a one-line status (running / exit-code / signal) for a started
+ command; commands that were never run produce no status line."""
+ prefix = " " * indent + "Status = "
+ if atomiccmd._proc:
+ if atomiccmd.ready():
+ return_code = tuple(atomiccmd.join())
+ # join() returns a signal name (string) for signal deaths, and
+ # an integer exit-code otherwise.
+ if isinstance(return_code[0], types.StringTypes):
+ lines.append(prefix + "Terminated with signal %s" % return_code)
+ else:
+ lines.append(prefix + "Exited with return-code %i" % return_code)
+ else:
+ lines.append(prefix + "Running ...")
+
+
+def _build_stdin(atomiccmd, files, stats, indent, lines):
+ """Appends a description of the command's stdin: the id of an upstream
+ command, a filename, or <PIPE> for a raw pipe."""
+ pipe_name = _get_pipe_name(files, "IN_STDIN")
+ pipe = _get_pipe_file(files, "IN_STDIN")
+ prefix = "%s%s = " % (" " * indent, pipe_name)
+ if pipe and pipe in stats["id"]:
+ lines.append("%s<%02i>" % (prefix, stats["id"][pipe],))
+ elif isinstance(pipe, types.StringTypes):
+ if atomiccmd._set_cwd and (pipe_name == "STDIN*"):
+ pipe = os.path.basename(pipe)
+ lines.append("%s'%s'" % (prefix, pipe))
+ elif pipe:
+ lines.append("%s<PIPE>" % (prefix,))
+
+
+def _build_out_pipe(atomiccmd, files, stats, indent, lines, pipe):
+ """Appends a description of an output pipe (OUT_STDOUT / OUT_STDERR):
+ the id of the downstream consumer, a filename, or <PIPE>."""
+ pipe_name = _get_pipe_name(files, pipe)
+ prefix = "%s%s = " % (" " * indent, pipe_name)
+
+ if (atomiccmd in stats["pipe"]) and (pipe == "OUT_STDOUT"):
+ pipe = stats["pipe"].get(atomiccmd)
+ lines.append("%s<%02i>" % (prefix, stats["id"][pipe],))
+ return
+
+ filename = _get_pipe_file(files, pipe)
+ if filename is not subprocess.PIPE:
+ lines.append("%s'%s'" % (prefix, filename))
+ else:
+ lines.append("%s<PIPE>" % (prefix,))
+
+
+def _build_cwd(atomiccmd, indent, lines):
+ """Appends the working directory of the command: the temp folder when
+ set_cwd was used, otherwise the current directory, or the ${TEMP_DIR}
+ placeholder if the command has not yet been run."""
+ prefix = " " * indent + "CWD = "
+ if atomiccmd._temp:
+ if atomiccmd._set_cwd:
+ lines.append("%s'%s'" % (prefix, atomiccmd._temp,))
+ else:
+ lines.append("%s'%s'" % (prefix, os.getcwd()))
+ elif atomiccmd._set_cwd:
+ lines.append("%s'%s'" % (prefix, "${TEMP_DIR}"))
+
+
+def _pformat(atomiccmd, stats, indent, lines, include_prefix = True):
+ """Recursively appends a formatted description of an AtomicCmd or a
+ Parallel/SequentialCmds set to 'lines', using the ids collected by
+ _collect_stats to cross-reference piped commands."""
+ s_prefix = ""
+ if include_prefix:
+ s_prefix = " " * indent + "- "
+ if _is_cls(atomiccmd, "AtomicCmd"):
+ cmd_id = stats["id"][atomiccmd]
+ s_prefix += "<%02i> " % (cmd_id,)
+ s_prefix_len = len(s_prefix)
+
+ if _is_cls(atomiccmd, "AtomicCmd"):
+ temp = "" if atomiccmd._set_cwd else (atomiccmd._temp or "${TEMP_DIR}")
+ files = atomiccmd._generate_filenames(atomiccmd._files, temp)
+
+ c_prefix = s_prefix + "Command = "
+ for line in _pformat_list(atomiccmd._generate_call(temp)).split("\n"):
+ lines.append("%s%s" % (c_prefix, line))
+ # Continuation lines are aligned under the first
+ c_prefix = " " * len(c_prefix)
+
+ if not s_prefix_len:
+ s_prefix_len += 1
+
+ _build_status(atomiccmd, stats, s_prefix_len, lines)
+ _build_stdin(atomiccmd, files, stats, s_prefix_len, lines)
+ _build_out_pipe(atomiccmd, files, stats, s_prefix_len, lines, "OUT_STDOUT")
+ _build_out_pipe(atomiccmd, files, stats, s_prefix_len, lines, "OUT_STDERR")
+ _build_cwd(atomiccmd, s_prefix_len, lines)
+ elif _is_cls(atomiccmd, "ParallelCmds", "SequentialCmds"):
+ lines.append("%s%s:" % (s_prefix, _describe_cls(atomiccmd)))
+ for subcmd in atomiccmd._commands:
+ _pformat(subcmd, stats, s_prefix_len + 2, lines)
+ else:
+ assert False # pragma: no coverage
+
+
+def _pformat_list(lst, width = 80):
+ """Return a printable representation of a list, where line-breaks
+ are inserted between items to minimize the number of lines with a
+ width greater than 'width'. Very long items may cause this maximum
+ to be exceeded."""
+ # 'result' is a list of lines, each a list of repr'd items; the "+ 2"
+ # accounts for the ", " separator between items.
+ result = [[]]
+ current_width = 0
+ for item in map(repr, lst):
+ if current_width + len(item) + 2 > width:
+ if not result[-1]:
+ # Current line is empty; place the over-long item on it
+ result[-1] = [item]
+ current_width = len(item) + 2
+ else:
+ result.append([item])
+ current_width = len(item) + 2
+ else:
+ result[-1].append(item)
+ current_width += len(item) + 2
+
+ return "[%s]" % (",\n ".join(", ".join(line) for line in result))
+
+
+def pformat(atomiccmd):
+ """Returns a human readable description of an Atomic Cmd or Atomic Set
+ of commands. This is currently equivalent to str(cmd_obj). Raises
+ TypeError for any other object type."""
+ if not _is_cls(atomiccmd, "AtomicCmd", "ParallelCmds", "SequentialCmds"):
+ raise TypeError("Invalid type in pformat: %r" % atomiccmd.__class__.__name__)
+
+ lines = []
+ stats = _collect_stats(atomiccmd, {"id" : {}, "pipe" : {}})
+ _pformat(atomiccmd, stats, 0, lines, False)
+ return "<%s>" % "\n".join(lines)
+
+
+def pprint(atomiccmd, out = sys.stdout):
+ """Prints a human readable description of an Atomic Cmd or Atomic Set
+ of commands. This is currently equivalent to print(str(cmd_obj), ...).
+ Output is written to 'out' (default: sys.stdout)."""
+ print(pformat(atomiccmd), file = out)
+
diff --git a/paleomix/atomiccmd/sets.py b/paleomix/atomiccmd/sets.py
new file mode 100644
index 0000000..3844f3a
--- /dev/null
+++ b/paleomix/atomiccmd/sets.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import time
+import collections
+
+import paleomix.atomiccmd.pprint as atomicpp
+
+from paleomix.atomiccmd.command import AtomicCmd, CmdError
+from paleomix.common.utilities import safe_coerce_to_tuple
+from paleomix.common.fileutils import try_remove
+
+
class _CommandSet:
    """Base class for sets of AtomicCmds; subclasses determine whether the
    commands are executed in parallel or sequentially. The set exposes the
    union of the files / executables / requirements of its sub-commands."""

    def __init__(self, commands):
        # Accepts a single command or any sequence of commands
        self._commands = safe_coerce_to_tuple(commands)
        if not self._commands:
            raise CmdError("Empty list passed to command set")

        self._validate_commands()

    def commit(self, temp):
        """Commits the output files of every sub-command from the temporary
        directory 'temp'; if any commit fails, the files committed so far
        are removed again before the exception is re-raised."""
        committed_files = set()
        try:
            for command in self._commands:
                command.commit(temp)
                committed_files.update(command.output_files)
        except:
            # Cleanup after failed commit; bare 'except' so that cleanup
            # runs even for KeyboardInterrupt/SystemExit before re-raising
            for fpath in committed_files:
                try_remove(fpath)
            raise

    def _collect_properties(key):  # pylint: disable=E0213
        # Property factory invoked at class-definition time: 'key' is the
        # attribute name (not 'self'), and the returned property yields the
        # union of that attribute across all sub-commands.
        def _collector(self):
            values = set()
            for command in self._commands:  # pylint: disable=W0212
                values.update(getattr(command, key))
            return values
        return property(_collector)

    # Aggregated (union) views of the corresponding AtomicCmd properties
    input_files = _collect_properties("input_files")
    output_files = _collect_properties("output_files")
    auxiliary_files = _collect_properties("auxiliary_files")
    executables = _collect_properties("executables")
    requirements = _collect_properties("requirements")
    expected_temp_files = _collect_properties("expected_temp_files")
    optional_temp_files = _collect_properties("optional_temp_files")

    @property
    def stdout(self):
        # A set of commands has no single stdout stream to expose
        raise CmdError("%s does not implement property 'stdout'!"
                       % (self.__class__.__name__,))

    def terminate(self):
        """Terminates all sub-commands in the set."""
        for command in self._commands:
            command.terminate()

    def __str__(self):
        return atomicpp.pformat(self)

    def _validate_commands(self):
        # Guard against the same command object appearing twice, and against
        # different sub-commands writing to the same temporary file
        if len(self._commands) != len(set(self._commands)):
            raise ValueError("Same command included multiple times in %s"
                             % (self.__class__.__name__,))

        filenames = collections.defaultdict(int)
        for command in self._commands:
            for filename in command.expected_temp_files:
                filenames[filename] += 1
            for filename in command.optional_temp_files:
                filenames[filename] += 1

        clobbered = [filename for (filename, count) in filenames.items() if (count > 1)]
        if any(clobbered):
            raise CmdError("Commands clobber each others' files: %s" % (", ".join(clobbered),))
+
+
class ParallelCmds(_CommandSet):
    """This class wraps a set of AtomicCmds, running them in parallel.
    This corresponds to a set of piped commands, which only terminate
    when all parts of the pipe have terminated. For example:
    $ dmesg | grep -i segfault | gzip > log.txt.gz

    In case of any one sub-command failing, the remaining commands are
    automatically terminated. This is done to ensure that commands waiting
    on pipes are not left running indefinitely.

    Note that only AtomicCmds and ParallelCmds are allowed as
    sub-commands for this class, since the model requires non-
    blocking commands."""

    def __init__(self, commands):
        # Set to True by 'run'; 'join' only waits once commands are started
        self._joinable = False

        commands = safe_coerce_to_tuple(commands)
        for command in commands:
            if not isinstance(command, (AtomicCmd, ParallelCmds)):
                raise CmdError("ParallelCmds must only contain AtomicCmds or other ParallelCmds!")
        _CommandSet.__init__(self, commands)

    def run(self, temp):
        """Starts all sub-commands, using 'temp' as temporary directory."""
        for command in self._commands:
            command.run(temp)
        self._joinable = True

    def ready(self):
        """Returns true if all sub-commands have finished."""
        return all(cmd.ready() for cmd in self._commands)

    def join(self):
        """Waits for all sub-commands to terminate, returning their combined
        return-codes; if any command fails, the still-running commands are
        terminated rather than waited upon."""
        sleep_time = 0.05
        commands = list(enumerate(self._commands))
        # Note: all entries initially alias the same [None] list; this is
        # safe, as entries are only ever replaced, never mutated in place
        return_codes = [[None]] * len(commands)
        while commands and self._joinable:
            for (index, command) in list(commands):
                if command.ready():
                    return_codes[index] = command.join()
                    commands.remove((index, command))
                    sleep_time = 0.05
                elif any(any(codes) for codes in return_codes):
                    # Another command failed; kill this still-running one
                    command.terminate()
                    return_codes[index] = command.join()
                    commands.remove((index, command))
                    sleep_time = 0.05

            # Exponential back-off while polling, capped at 1 second
            time.sleep(sleep_time)
            sleep_time = min(1, sleep_time * 2)
        return sum(return_codes, [])
+
+
+
+
class SequentialCmds(_CommandSet):
    """This class wraps a set of AtomicCmds, running them sequentially.
    This class therefore corresponds to a set of lines in a bash script,
    each of which invokes a foreground job. For example:
    $ bcftools view snps.bcf | bgzip > snps.vcf.bgz
    $ tabix snps.vcf.bgz

    The list of commands may include any type of command. Note that
    the run function only returns once each sub-command has completed.
    A command is only executed if the previous command in the sequence
    was successfully completed, and as a consequence the return codes
    of a failed SequentialCommand may contain None."""

    def __init__(self, commands):
        # Set to True only once every started sub-command has completed
        self._ready = False

        commands = safe_coerce_to_tuple(commands)
        for command in commands:
            if not isinstance(command, (AtomicCmd, _CommandSet)):
                # Fixed copy-paste error: this message previously referred
                # to ParallelCmds, despite being raised by SequentialCmds
                raise CmdError("SequentialCmds must only contain AtomicCmds or other command sets!")
        _CommandSet.__init__(self, commands)

    def run(self, temp):
        """Runs each sub-command in turn, using 'temp' as the temporary
        directory; execution stops at the first command whose return-codes
        include a non-zero value."""
        self._ready = False
        for command in self._commands:
            command.run(temp)
            if any(command.join()):
                break

        self._ready = True

    def ready(self):
        """Returns true once 'run' has completed (successfully or not)."""
        return self._ready

    def join(self):
        """Returns the combined return-codes of all sub-commands; commands
        never started (due to an earlier failure) may yield None values."""
        return_codes = []
        for command in self._commands:
            return_codes.extend(command.join())

        return return_codes
diff --git a/paleomix/common/__init__.py b/paleomix/common/__init__.py
new file mode 100644
index 0000000..cd42802
--- /dev/null
+++ b/paleomix/common/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/paleomix/common/bamfiles.py b/paleomix/common/bamfiles.py
new file mode 100644
index 0000000..a056be1
--- /dev/null
+++ b/paleomix/common/bamfiles.py
@@ -0,0 +1,138 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import itertools
+
# BAM flags as defined in the BAM specification
BAM_SUPPLEMENTARY_ALIGNMENT = 0x800
BAM_PCR_DUPLICATE = 0x400
BAM_QUALITY_CONTROL_FAILED = 0x200
BAM_SECONDARY_ALIGNMENT = 0x100
BAM_IS_LAST_SEGMENT = 0x80
BAM_IS_FIRST_SEGMENT = 0x40
BAM_NEXT_IS_REVERSED = 0x20
BAM_READ_IS_REVERSED = 0x10
BAM_NEXT_IS_UNMAPPED = 0x8
BAM_READ_IS_UNMAPPED = 0x4
BAM_PROPER_SEGMENTS = 0x2
BAM_SEGMENTED = 0x1

# Default filters when processing reads: skip unmapped reads, and any
# record that is not a primary, passing-QC, non-duplicate alignment
EXCLUDED_FLAGS = (BAM_SUPPLEMENTARY_ALIGNMENT
                  | BAM_PCR_DUPLICATE
                  | BAM_QUALITY_CONTROL_FAILED
                  | BAM_SECONDARY_ALIGNMENT
                  | BAM_READ_IS_UNMAPPED)


class BAMRegionsIter(object):
    """Iterates over a BAM file, yielding a separate iterator for each contig
    in the BAM, or for each region in the list of regions if these are
    specified, which in turn iterates over individual positions. This allows
    for the following pattern when parsing BAM files:

    for region in BAMRegionsIter(handle):
        # Setup per region
        for (position, records) in region:
            # Setup per position
            ...
            # Teardown per position
        # Teardown per region

    The list of regions given to the iterator is expected to be in BED-like
    records (see e.g. paleomix.common.bedtools), with these properties:
      - contig: Name of the contig in the BED file
      - start: 0-based offset for the start of the region
      - end: 1-based offset (i.e. past-the-end) of the region
      - name: The name of the region
    """

    def __init__(self, handle, regions=None, exclude_flags=EXCLUDED_FLAGS):
        """
          - handle: BAM file handle (c.f. module 'pysam')
          - regions: List of BED-like regions (see above)
          - exclude_flags: Records with any of these flags set are skipped
        """
        self._handle = handle
        self._regions = regions
        self._excluded = exclude_flags

    def __iter__(self):
        if self._regions:
            for region in self._regions:
                records = self._handle.fetch(region.contig,
                                             region.start,
                                             region.end)
                records = self._filter(records)

                tid = self._handle.gettid(region.contig)
                yield _BAMRegion(tid, records,
                                 region.name,
                                 region.start,
                                 region.end)
        else:
            def _by_tid(record):
                """Group by reference ID."""
                return record.tid

            # Save a copy, as these are properties generated upon every access!
            names = self._handle.references
            lengths = self._handle.lengths
            records = self._filter(self._handle)
            records = itertools.groupby(records, key=_by_tid)

            for (tid, items) in records:
                if tid >= 0:
                    name = names[tid]
                    length = lengths[tid]
                else:
                    # Unplaced records carry a negative reference ID
                    name = length = None

                yield _BAMRegion(tid, items, name, 0, length)

    def _filter(self, records):
        """Filters records by flags, if 'exclude_flags' is set; the result is
        lazy, so the input sequence is never materialized."""
        if self._excluded:
            # Generator expression instead of itertools.ifilter: behaves
            # identically (lazy filtering), but also works on Python 3,
            # where 'ifilter' no longer exists
            excluded = self._excluded
            return (record for record in records
                    if not record.flag & excluded)
        return records
+
+
+class _BAMRegion(object):
+ """Implements iteration over sites in a BAM file. It is assumed that the
+ BAM file is sorted, and that the input records are from one contig.
+ """
+
+ def __init__(self, tid, records, name, start, end):
+ self._records = records
+ self.tid = tid
+ self.name = name
+ self.start = start
+ self.end = end
+
+ def __iter__(self):
+ def _by_pos(record):
+ """Group by position."""
+ return record.pos
+
+ for group in itertools.groupby(self._records, _by_pos):
+ yield group
diff --git a/paleomix/common/bedtools.py b/paleomix/common/bedtools.py
new file mode 100644
index 0000000..1b89adb
--- /dev/null
+++ b/paleomix/common/bedtools.py
@@ -0,0 +1,234 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import copy
+import types
+
+import paleomix.common.fileutils as fileutils
+import paleomix.common.text as text
+
+
+def _strand_type(value):
+ if value not in ("+", "-"):
+ raise ValueError("Strand must be '+' or '-', not %r" % (value,))
+ return value
+
+
# Parallel tuples describing the six standard BED columns: the default
# value for each column, the property name exposed on BEDRecord, and the
# parser/validator applied to each column when reading or assigning it.
_BED_DEFAULTS = ("", 0, 0, "", 0, "+")
_BED_KEYS = ("contig", "start", "end", "name", "score", "strand")
_BED_TYPES = (str, int, int, str, int, _strand_type)
+
+
class BEDError(RuntimeError):
    """Raised on malformed or invalid BED records / files."""
    pass
+
+
class BEDRecord(object):
    """Class for parsing and representing a BED records.

    The class has the following properties:
      .contig -> str
      .start -> int (0-based)
      .end -> int (1-based)
      .name -> str
      .score -> int
      .strand -> '+' or '-'

    These fields can also be accessed using the square brackets notation, which
    also gives access to any additional values, after the strand column. Fields
    default to 0 or empty string, except the strand which defaults to '+'.
    """

    def __init__(self, line=None, _len=None):
        """Constructs a BED record from a line of text. The length of the
        object matches the number of columns in the input line; in the case
        incompatible values, a BEDError exception is raised.

        The len parameter is unused, and included only for compatibility with
        pysam parser objects, such as 'asBED'. No minimum number of columns are
        required, and it is possible to construct an empty bed record.
        """
        # Parsed values for the standard columns, followed by any extra
        # columns stored verbatim (as strings)
        self._fields = []

        if line:
            line = line.rstrip("\r\n").split('\t')
            # Parse / validate only as many columns as there are known types
            for column, (value, func) in enumerate(zip(line, _BED_TYPES)):
                try:
                    self._fields.append(func(value))
                except ValueError:
                    raise BEDError("Error parsing column %i in BED record "
                                   "(%r); expected type %s, but found %r."
                                   % (column, "\t".join(line),
                                      func.__name__, value,))

            # Columns past the known six are kept as-is (strings)
            if len(line) > len(self._fields):
                self._fields.extend(line[len(self._fields):])

    def __copy__(self):
        """Needed for copy.copy to work correctly as expected."""
        record = BEDRecord()
        record._fields = copy.copy(self._fields)
        return record

    def __len__(self):
        """Returns the number of fields in the record; 0 .. N."""
        return len(self._fields)

    def __str__(self):
        """Returns a string suitable for writing to a .bed file."""
        return "\t".join(str(value) for value in self._fields)

    def __repr__(self):
        """Returns a printable representation of the record."""
        fields = []
        for name, value in zip(_BED_KEYS, self._fields):
            fields.append("%s=%r" % (name, value))

        # Extra (unnamed) columns are shown by value only
        fields.extend(repr(value) for value in self._fields[len(_BED_KEYS):])

        return "BEDRecord(%s)" % (", ".join(fields))

    def __getitem__(self, index):
        return self._fields[index]

    def __setitem__(self, index, value):
        # Growing assignment: pad missing fields up to 'index' with their
        # column defaults, then with "" past the known columns
        if len(self._fields) <= index:
            defaults = _BED_DEFAULTS[len(self._fields):index + 1]
            self._fields.extend(defaults)
            while len(self._fields) <= index:
                self._fields.append("")

        if index < len(_BED_TYPES):
            if type(_BED_TYPES[index]) is type:
                # Plain type (str / int): require an exact instance
                if not isinstance(value, _BED_TYPES[index]):
                    raise ValueError("Expected %s for BED field %i, got %r"
                                     % (_BED_TYPES[index].__name__,
                                        index + 1, value))
            else:
                # Validator function (e.g. _strand_type): coerce / validate
                value = _BED_TYPES[index](value)

        self._fields[index] = value

    def __cmp__(self, other):
        # NOTE(review): __cmp__ and the cmp() builtin are Python 2 only;
        # porting to Python 3 would require rich comparison methods.
        if not isinstance(other, BEDRecord):
            return cmp(self.__class__, other.__class__)

        return cmp(self._fields, other._fields)

    @classmethod
    def _set_properties(cls):
        # Installs a named getter/setter property for each standard column
        for index, name in enumerate(_BED_KEYS):
            setattr(cls, name, cls._new_attr(index))

    @classmethod
    def _new_attr(cls, index):
        """Returns an getter / setter property for the given value."""
        def _get(self):
            return self._fields[index]

        def _set(self, value):
            # Route through __setitem__ to get padding and validation
            self[index] = value

        return property(_get, _set)


# Fill out properties (contig/start/end/name/score/strand) for BEDRecord
BEDRecord._set_properties()
+
+
def read_bed_file(filename, min_columns=3, contigs=None):
    """Parses a (gzip/bzip2 compressed) BED file, and yields a sequence of
    records. Comments and empty lines are skipped. If the number of columns in
    the bed record is less than the specified ('min_columns'), a BEDError is
    raised. If a dictionary of {contig: length} is supplied, and min_columns
    is at least 6, then the coordinates are validated against the known contig
    lengths.
    """
    if min_columns < 3:
        raise ValueError("'min_columns' must be >= 3 in 'read_bed_file'")

    infinite = float("inf")
    handle = None
    try:
        handle = fileutils.open_ro(filename)

        for (line_num, line) in enumerate(handle):
            line = line.strip()
            if not line or line.startswith("#"):
                # Skip blank lines and comments
                continue

            try:
                bed = BEDRecord(line)
            except ValueError as error:
                raise BEDError("Error parsing line %i in regions file:\n"
                               " Path = %r\n Line = %r\n\n%s"
                               % (line_num + 1, filename, line, error))

            if len(bed) < min_columns:
                url = "http://genome.ucsc.edu/FAQ/FAQformat.html#format1"
                name = repr(bed.name) if len(bed) > 3 else "unnamed record"
                # 'line_num' is zero-based; report one-based line numbers,
                # matching the parse error message above (was off by one),
                # and fixed typo 'defination' -> 'definition'
                raise BEDError("Region at line #%i (%s) does not "
                               "contain the expected number of fields; "
                               "the first %i fields are required. C.f. "
                               "definition at\n %s\n\nPath = %r"
                               % (line_num + 1, name, min_columns,
                                  url, filename))

            if contigs is None:
                # No contig table supplied; accept any coordinates
                contig_len = infinite
            else:
                contig_len = contigs.get(bed.contig)

            if contig_len is None:
                raise BEDError("Regions file contains contig not found "
                               "in reference:\n Path = %r\n Contig = "
                               "%r\n\nPlease ensure that all contig "
                               "names match the reference names!"
                               % (filename, bed.contig))
            elif not (0 <= bed.start < bed.end <= contig_len):
                raise BEDError("Regions file contains invalid region:\n"
                               " Path = %r\n Contig = %r\n"
                               " Start = %s\n End = %s\n\n"
                               "Expected 0 <= Start < End <= %i!"
                               % (filename, bed.contig, bed.start,
                                  bed.end, contig_len))

            yield bed
    finally:
        if handle:
            handle.close()
+
+
def sort_bed_by_bamfile(bamfile, regions):
    """Orders a set of BED regions in-place, such that processing matches
    (as far as possible) the layout of the BAM file. This may be
    used to ensure that extraction of regions occurs (close to)
    linearly. Nothing is returned; raises KeyError if a region names a
    contig not present in the BAM header."""
    if not regions:
        return

    # Map each contig name to its position in the BAM header; enumerate
    # replaces the original zip/xrange pairing, which relied on the
    # Python 2-only 'xrange' builtin
    indices = dict((name, index)
                   for (index, name) in enumerate(bamfile.references))

    def _by_bam_layout(region):
        return (indices[region.contig], region.start, region.end)
    regions.sort(key=_by_bam_layout)
diff --git a/paleomix/common/console.py b/paleomix/common/console.py
new file mode 100644
index 0000000..a9655a0
--- /dev/null
+++ b/paleomix/common/console.py
@@ -0,0 +1,99 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import sys
+
+
# Global setting to enable, disable, or force the use of console codes when
# the print_* functions are used. By default (ON), color codes will only be
# used if the destination is a TTY.
COLORS_OFF, COLORS_ON, COLORS_FORCED = 0, 1, 2
+
+
def set_color_output(clr):
    """Set use of colors in print functions; possible values are COLORS_OFF,
    COLORS_ON, and COLORS_FORCED.
    """
    global _COLORS
    if clr in (COLORS_OFF, COLORS_ON, COLORS_FORCED):
        _COLORS = clr
    else:
        raise ValueError("Invalid value in set_color_output; must be one of "
                         "COLORS_OFF, COLORS_ON, or COLORS_FORCED.")
+
+
def print_msg(*vargs, **kwargs):
    """Equivalent to the built-in print; currently applies no color codes."""
    print(*vargs, **kwargs)
+
+
def print_debug(*vargs, **kwargs):
    """Equivalent to print, but prints using shell colorcodes (blue), and
    writes to stderr unless a 'file' keyword argument is given."""
    # 36 is the ANSI SGR foreground color code used for debug output
    _do_print_color(*vargs, colorcode=36, **kwargs)
+
+
def print_info(*vargs, **kwargs):
    """Equivalent to print, but prints using shell colorcodes (green), and
    writes to stderr unless a 'file' keyword argument is given."""
    # 32 is the ANSI SGR foreground color code for green
    _do_print_color(*vargs, colorcode=32, **kwargs)
+
+
def print_err(*vargs, **kwargs):
    """Equivalent to print, but prints using shell colorcodes (red), and
    writes to stderr unless a 'file' keyword argument is given."""
    # 31 is the ANSI SGR foreground color code for red
    _do_print_color(*vargs, colorcode=31, **kwargs)
+
+
def print_warn(*vargs, **kwargs):
    """Equivalent to print, but prints using shell colorcodes (yellow), and
    writes to stderr unless a 'file' keyword argument is given."""
    # 33 is the ANSI SGR foreground color code for yellow
    _do_print_color(*vargs, colorcode=33, **kwargs)
+
+
def print_disabled(*vargs, **kwargs):
    """Equivalent to print, but prints using shell colorcodes (gray), and
    writes to stderr unless a 'file' keyword argument is given."""
    # 30 is the ANSI SGR foreground color code rendered as gray/black
    _do_print_color(*vargs, colorcode=30, **kwargs)
+
+
def _do_print_color(*vargs, **kwargs):
    """Utility function: prints the given arguments using ANSI shell color
    codes; 'colorcode' (required keyword) selects the color, and output
    defaults to stderr."""
    colorcode = kwargs.pop("colorcode")
    destination = kwargs.pop("file", sys.stderr)

    # No colors if output is redirected (e.g. less, file, etc.), unless the
    # use of colors has been explicitly forced
    use_colors = (_COLORS == COLORS_FORCED) \
        or (_COLORS != COLORS_OFF and destination.isatty())

    if use_colors:
        prefix = "\033[00;%im" % (colorcode,)
        suffix = "\033[00m"

        # Newlines terminate the color-code for e.g. 'less', so each line of
        # every argument is wrapped individually
        recolored = []
        for varg in vargs:
            wrapped = [prefix + line + suffix
                       for line in str(varg).split("\n")]
            recolored.append("\n".join(wrapped))
        vargs = recolored

    print(*vargs, file=destination, **kwargs)

    if '\n' in kwargs.get('end', '\n'):
        destination.flush()
+
+_COLORS = COLORS_ON
diff --git a/paleomix/common/fileutils.py b/paleomix/common/fileutils.py
new file mode 100644
index 0000000..aa5d2e6
--- /dev/null
+++ b/paleomix/common/fileutils.py
@@ -0,0 +1,344 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import bz2
+import gzip
+import uuid
+import errno
+import types
+import shutil
+
+from paleomix.common.utilities import safe_coerce_to_tuple, \
+ safe_coerce_to_frozenset
+
+
def add_postfix(filename, postfix):
    """Adds a postfix to a filename, inserting it immediately before the
    final extension (if any): add_postfix('a.txt', '_1') -> 'a_1.txt'."""
    root, extension = os.path.splitext(filename)
    return "%s%s%s" % (root, postfix, extension)
+
+
def swap_ext(filename, ext):
    """Replaces the existing extension of a filename with the specified
    extension, ensuring that the extension is prefixed by a '.'. If no
    extension is specified, other than potentially a dot, the existing
    extension is simply stripped."""
    root = os.path.splitext(filename)[0]
    if ext in ("", "."):
        return root
    elif ext.startswith("."):
        return root + ext
    return root + "." + ext
+
+
def reroot_path(root, filename):
    """Returns the basename of 'filename', joined onto 'root'."""
    basename = os.path.basename(filename)
    return os.path.join(root, basename)
+
+
def create_temp_dir(root):
    """Creates a temporary directory, accessible only by the owner,
    at the specified location. The folder name is randomly generated,
    and only the current user has access."""
    def _generate_path():
        return os.path.join(root, str(uuid.uuid4()))

    path = _generate_path()
    # Mode 0o750 (Python 2.6+/3.x octal syntax; was '0750', which is a
    # syntax error on Python 3): owner rwx, group rx, others nothing
    while not make_dirs(path, mode=0o750):
        # Name already existed (astronomically unlikely); retry with new name
        path = _generate_path()
    return path
+
+
def missing_files(filenames):
    """Given a list of filenames, returns a list of those that
    does not exist. Note that this function does not differentiate
    between files and folders."""
    unique_names = safe_coerce_to_frozenset(filenames)
    return [fname for fname in unique_names
            if not os.path.exists(fname)]
+
+
def modified_after(younger, older):
    """Returns true if any of the files expected to be 'younger' have
    been modified after any of the files expected to be 'older'."""
    newest_younger = max(os.path.getmtime(fname)
                         for fname in safe_coerce_to_frozenset(younger))
    oldest_older = min(os.path.getmtime(fname)
                       for fname in safe_coerce_to_frozenset(older))

    return newest_younger > oldest_older
+
+
def is_executable(filename):
    """Returns true if the specified path is an executable regular file."""
    if not os.path.isfile(filename):
        return False
    return os.access(filename, os.X_OK)
+
+
def which_executable(filename):
    """Returns the path of the first executable in the PATH which
    matches the filename, or None if no match was found. If the
    filename contains a directory component, only that path is
    tested, and None is returned if that file is not an executable."""
    if os.path.dirname(filename):
        # Explicit path given; do not search the PATH
        return filename if is_executable(filename) else None

    search_path = os.environ.get("PATH")
    if search_path:
        for folder in search_path.split(os.pathsep):
            candidate = os.path.join(folder, filename)
            if is_executable(candidate):
                return candidate

    return None
+
+
def executable_exists(filename):
    """Returns true if the filename refers to an executable file,
    either by relative or full path, or if the executable is found
    on the current PATH."""
    exec_path = which_executable(filename)
    if not exec_path:
        # Preserve the falsy value (None) returned by which_executable
        return exec_path
    return is_executable(exec_path)
+
+
def missing_executables(filenames):
    """Returns the subset of 'filenames' not found as executables,
    either by path or via the current PATH."""
    return [filename
            for filename in safe_coerce_to_frozenset(filenames)
            if not executable_exists(filename)]
+
+
def make_dirs(directory, mode=0o777):
    """Wrapper around os.makedirs to make it suitable for using
    in a multithreaded/multiprocessing environment: Unlike the
    regular function, this wrapper does not throw an exception if
    the directory already exists, which may happen if another
    thread/process created the directory during the function call.

    Returns true if a new directory was created, false if it
    already existed. Other errors result in exceptions.

    Note: the default mode is 0o777 (Python 2.6+/3.x octal syntax; was
    '0777', which Python 3 rejects) -- the value is unchanged."""
    if not directory:
        raise ValueError("Empty directory passed to make_dirs()")

    try:
        os.makedirs(directory, mode=mode)
        return True
    except OSError as error:
        # make_dirs may be called by multiple subprocesses at the same time,
        # so only raise if the actual creation of the folder failed
        if error.errno != errno.EEXIST:
            raise
        return False
+
+
def move_file(source, destination):
    """Moves 'source' to 'destination'; a wrapper around shutil.move which
    ensures that the destination directory exists before moving the file."""
    _sh_wrapper(shutil.move, source, destination)
+
+
def copy_file(source, destination):
    """Copies 'source' to 'destination'; a wrapper around shutil.copy which
    ensures that the destination directory exists before copying the file."""
    _sh_wrapper(shutil.copy, source, destination)
+
+
def open_ro(filename):
    """Opens a file for reading, transparently handling
    GZip and BZip2 compressed files. Returns a file handle."""
    # NOTE(review): compares the sniffed header against byte values via
    # text-mode open(); this relies on Python 2 str semantics -- confirm
    # before porting to Python 3, where 'rb' mode would be required.
    handle = open(filename)
    try:
        # Sniff the first two bytes to detect compression by magic number
        header = handle.read(2)

        if header == "\x1f\x8b":
            # GZip magic number; reopen through the gzip module
            handle.close()
            # TODO: Re-use handle (fileobj)
            handle = gzip.open(filename)
        elif header == "BZ":
            # BZip2 magic number; reopen through the bz2 module
            handle.close()
            handle = bz2.BZ2File(filename)
        else:
            # Uncompressed; rewind to undo the sniffing read
            handle.seek(0)

        return handle
    except:
        # Bare except ensures the handle is not leaked on any error
        # (including KeyboardInterrupt) before re-raising
        handle.close()
        raise
+
+
def try_remove(filename):
    """Tries to remove a file. Unlike os.remove, the function does not
    raise an exception if the file does not exist, but does raise
    exceptions on other errors. The return value reflects whether or
    not the file was actually removed."""
    # Shared missing-path handling lives in _try_rm_wrapper
    return _try_rm_wrapper(os.remove, filename)
+
+
def try_rmdir(filename):
    """Tries to remove a directory. Unlike os.rmdir, the function does not raise
    an exception if the file does not exist, but does raise exceptions on other
    errors. The return value reflects whether or not the file was actually
    removed."""
    # Shared missing-path handling lives in _try_rm_wrapper
    return _try_rm_wrapper(os.rmdir, filename)
+
+
def try_rmtree(filename):
    """Tries to remove a dir-tree. Unlike shutil.rmtree, the function does not raise
    an exception if the file does not exist, but does raise exceptions on other
    errors. The return value reflects whether or not the file was actually
    removed."""
    # Shared missing-path handling lives in _try_rm_wrapper
    return _try_rm_wrapper(shutil.rmtree, filename)
+
+
def describe_files(files):
    """Return a text description of a set of files."""
    files = _validate_filenames(files)
    if not files:
        return "No files"
    if len(files) == 1:
        return repr(files[0])

    # Prefer a single glob-like pattern covering all of the filenames
    merged = _get_files_glob(files, max_differences=2)
    if merged:
        return repr(merged)

    # Otherwise summarize, noting a common directory when one exists
    dirnames = set(os.path.dirname(fname) for fname in files)
    if len(dirnames) == 1:
        return "%i files in '%s'" % (len(files), dirnames.pop())
    return "%i files" % (len(files),)
+
+
def describe_paired_files(files_1, files_2):
    """Return a text description of a set of paired filenames; the
    sets must be of the same length, and the description will depend
    on the overall similarity of the filenames / paths. If 'files_2'
    is empty, this function is the equivalent of calling
    'describe_files' with 'files_1' as the argument. In all other
    cases the length of the two sets must be the same."""
    files_1 = _validate_filenames(files_1)
    files_2 = _validate_filenames(files_2)

    if files_1 and not files_2:
        return describe_files(files_1)
    if len(files_1) != len(files_2):
        raise ValueError("Unequal number of files for mate 1 vs mate 2 reads: %i vs %i"
                         % (len(files_1), len(files_2)))

    glob_1 = _get_files_glob(files_1, 3)
    glob_2 = _get_files_glob(files_2, 3)
    if glob_1 and glob_2:
        # Try to merge the two patterns, showing the single position
        # (typically the mate number) where they differ
        combined = _get_files_glob((glob_1, glob_2), 1, show_differences=True)
        if combined:
            return repr(combined)

    dirnames = set(os.path.dirname(fname) for fname in files_1 + files_2)
    if len(dirnames) == 1:
        return "%i pair(s) of files in '%s'" % (len(files_1), dirnames.pop())
    return "%i pair(s) of files" % (len(files_1),)
+
+
+def _get_files_glob(filenames, max_differences = 1, show_differences = False):
+ """Tries to generate a glob-string for a set of filenames, containing
+ at most 'max_differences' different columns. If more differences are
+ found, or if the length of filenames vary, None is returned."""
+ # File lengths must be the same, otherwise we'd have to do MSA
+ if len(set(map(len, filenames))) > 1:
+ return None
+
+ glob_fname, differences = [], 0
+ for chars in zip(*filenames):
+ if "?" in chars:
+ chars = ('?',)
+
+ if len(frozenset(chars)) > 1:
+ if show_differences:
+ chars = ("[%s]" % ("".join(sorted(chars))),)
+ else:
+ chars = ("?",)
+ differences += 1
+ glob_fname.append(chars[0])
+
+ if differences > max_differences:
+ return None
+
+ return "".join(glob_fname)
+
+
def _validate_filenames(filenames):
    """Sanity checks for filenames handled by
    'describe_files' and 'describe_paired_files."""
    filenames = safe_coerce_to_tuple(filenames)
    for filename in filenames:
        # NOTE(review): types.StringTypes is Python 2 only (str + unicode);
        # under Python 3 this would need to become (str,) -- confirm before
        # porting.
        if not isinstance(filename, types.StringTypes):
            raise ValueError("Only string types are allowed for filenames, not %s"
                             % (filename.__class__.__name__,))
    return filenames
+
+
def _sh_wrapper(func, source, destination):
    """Runs a 'shutil' function ('func') which takes a 'source' and
    a 'destination' argument (e.g. copy/move/etc.), but silently
    handles the case where the destination directory does not exist.

    If this is the case, the function will first create the destination
    directory, and then retry the function."""
    try:
        func(source, destination)
    except IOError as error:
        # 'as' form replaces the Python 2-only 'except IOError, error'
        # syntax; valid on Python 2.6+ as well as Python 3
        if (error.errno == errno.ENOENT):
            # Only retry when the *destination* directory was the missing
            # part; a missing source falls through to the re-raise below
            if source and destination and os.path.exists(source):
                dirname = os.path.dirname(destination)
                make_dirs(dirname)
                func(source, destination)
                return
        elif (error.errno == errno.ENOSPC):
            # Not enough space; remove partial file before re-raising
            os.unlink(destination)
        raise
+
+
+def _try_rm_wrapper(func, fpath):
+ """Takes a function (e.g. os.remove / os.rmdir), and attempts to remove a
+ path; returns true if that path was succesfully remove, and false if it did
+ not exist."""
+ try:
+ func(fpath)
+ return True
+ except OSError, error:
+ if error.errno != errno.ENOENT:
+ raise
+ return False
+
diff --git a/paleomix/common/formats/__init__.py b/paleomix/common/formats/__init__.py
new file mode 100644
index 0000000..c0288a4
--- /dev/null
+++ b/paleomix/common/formats/__init__.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# pylint: disable=W0611
+
+from paleomix.common.formats._common import FormatError
diff --git a/paleomix/common/formats/_common.py b/paleomix/common/formats/_common.py
new file mode 100644
index 0000000..e4c89d2
--- /dev/null
+++ b/paleomix/common/formats/_common.py
@@ -0,0 +1,25 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
class FormatError(RuntimeError):
    """Base class for exceptions raised by the file-format modules in
    'paleomix.common.formats' (FASTA, MSA, Newick, etc.)."""
    pass
diff --git a/paleomix/common/formats/_graph.py b/paleomix/common/formats/_graph.py
new file mode 100644
index 0000000..f550223
--- /dev/null
+++ b/paleomix/common/formats/_graph.py
@@ -0,0 +1,295 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""
+
+
+"""
+
+from paleomix.common.utilities import \
+ safe_coerce_to_frozenset, \
+ get_in, \
+ set_in
+
+from paleomix.common.formats import FormatError
+
+
class GraphError(FormatError):
    """Exception raised on errors during construction or manipulation of
    graphs (e.g. negative branch-lengths, mixed branch-length usage)."""
    pass
+
+
+
class _Graph:
    """Internal representation of an unrooted graph, allowing various forms of
    manipulation directly on the graph. To ensure that all manipulations can be
    carried out, it is required that branch-lengths are present for ALL branches,
    or for NO branches.

    Note that neither the root-length, nor node-ordering is preserved.

    State:
      names       -- maps node ids to names; falsy for unnamed (internal) nodes
      connections -- dict of dicts; connections[a][b] is the branch-length
                     between nodes 'a' and 'b' (None if lengths are unused);
                     stored symmetrically, so connections[b][a] is identical
      has_branch_lengths -- tri-state: None until the first branch is added,
                     thereafter True/False depending on that first branch
    """

    def __init__(self):
        self.names = {}
        self.connections = {}
        self.has_branch_lengths = None

    def is_leaf(self, node):
        """Returns true if the node is a leaf, defined as having a single connection."""
        return len(self.connections[node]) == 1

    def get_path_length(self, *nodes):
        """Returns the length of a path through the graph. Calling the function
        with two nodes is the equivalent of getting the branch-length between
        those two nodes. Returns None if the graph has no branch-lengths."""
        if not self.has_branch_lengths:
            return None

        path_length = 0.0
        # Sum the lengths of consecutive branches along the path
        for (node_a, node_b) in zip(nodes, nodes[1:]):
            segment_length = float(self.connections[node_a][node_b])
            path_length += segment_length

        return path_length

    def set_name(self, node_id, name):
        """Assigns a name to a node id; internal nodes may be unnamed."""
        self.names[node_id] = name

    def add_connection(self, node_id_a, node_id_b, blength = None):
        """Connects two nodes with a branch of the given length (or None);
        raises GraphError for negative lengths, or if the graph would end
        up mixing branches with and without lengths."""
        if (blength is not None) and float(blength) < 0:
            raise GraphError("Branch-lengths must be non-negative")
        elif (blength is not None) != self.has_branch_lengths:
            # 'has_branch_lengths' is a tri-state; it remains None until the
            # first branch is added, and all later branches must agree.
            if not self.has_branch_lengths is None:
                raise GraphError("Tree contains branches with and without lengths")
            self.has_branch_lengths = (blength is not None)

        # Connections are stored symmetrically
        set_in(self.connections, (node_id_a, node_id_b), blength)
        set_in(self.connections, (node_id_b, node_id_a), blength)

    def remove_connection(self, node_a, node_b):
        """Removes the branch between two nodes, returning its length."""
        length_a = self.connections[node_a].pop(node_b)
        length_b = self.connections[node_b].pop(node_a)
        assert length_a == length_b, (length_a, length_b)
        return length_a

    def remove_node(self, node):
        """Removes a node, along with every branch connected to it."""
        connections = self.connections.pop(node)
        for node_b in connections:
            self.connections[node_b].pop(node)
        self.names.pop(node)

    def rebuild_tree(self, parent_id, node_id):
        """Rebuilds a tree starting at a node with id
        'node_id' and a parent with id 'parent_id' (or the
        same value as 'node_id' if a root node)."""
        raise NotImplementedError("Subclasses must implement 'rebuild_nodes'.") # pragma: no coverage

    def prune_uninformative_nodes(self):
        """Removes nodes without names, and which are connected
        to two other nodes, extending the branch lengths of the
        two connected nodes. This process is repeated, until no
        further nodes are pruned. A rooted tree will typically
        contain just 1 such node, namely the old root node.

        For example, the tree "(A:5,(B:6):3);" would be reduced to
        the tree "(A:5,B:9);", whereas the trees "(A:5,(B:6)C:3);"
        and "(A:5,(B:6,C:2):3);" would not be pruned.

        For a node to be pruned, both adjacent nodes must have a
        length specified, or both must not have a length specified."""
        while True:
            # Iteration is restarted (via 'break') after each splice, since
            # it is not safe to keep iterating over a modified dictionary.
            for (cur_node, connections) in self.connections.iteritems():
                if not self.names[cur_node] and (len(connections) == 2):
                    conn_a, conn_b = connections

                    # Combined length of the two branches being merged (or
                    # None, if the graph does not use branch-lengths)
                    blength = self.get_path_length(conn_a, cur_node, conn_b)

                    # Splice out the current node
                    self.remove_node(cur_node)
                    self.add_connection(conn_a, conn_b, blength)
                    break
            else:
                # Nothing was pruned this round, terminate
                break

    ################################################################################
    ################################################################################
    ## Functions relating to NEWICK rooting on midpoint

    def reroot_on_midpoint(self):
        """Reroots the graph at the midpoint of the longest path between two
        nodes and returns the rebuilt (rooted) tree; requires branch-lengths."""
        if not self.has_branch_lengths:
            raise GraphError("Cannot reroot on midpoint for tree without branch-lengths")

        longest_path, length = self._find_longest_path()
        root = self._create_root_at(longest_path, length / 2.0)

        return self.rebuild_tree(root, root)

    def _find_longest_path(self):
        """This function determines the longest non-overlapping path possible,
        and returns a list of the sequence of nodes in this path, as well as
        the total length of this path."""
        path_blengths = {}
        path_guides = {}
        def _collect_paths(guide, length, p_node, c_node):
            # Recursively enumerates every simple path extending from the
            # initial node, keyed by the (unordered) set of nodes visited.
            length += self.get_path_length(p_node, c_node)

            guide.append(c_node)
            key = frozenset(guide)
            path_blengths[key] = length
            path_guides[key] = guide

            for other in self.connections[c_node]:
                if other not in key:
                    _collect_paths(list(guide), length, c_node, other)

        # Start a path at every branch, in both directions
        for (p_node, connections) in self.connections.iteritems():
            for c_node in connections:
                _collect_paths([p_node], 0, p_node, c_node)

        # Select the path with the greatest total branch-length
        key, length = max(path_blengths.iteritems(), key = lambda item: item[1])
        return path_guides[key], length

    def _create_root_at(self, path, root_at):
        """Finds the midpoint of a path through a tree, and
        either creates a new node at that point, or selects
        the node already present at that point (if any). The
        mid-point is assumed to be at distance of 'root_at'
        from the starting node.

        E.g. if the path is the longest path, and 'root_at' is
        half the length of this path, then this corresponds to
        rooting at the midpoint.

        The id of the new / selected node is returned. New
        nodes (if created) are always given the id None."""
        for (c_node, n_node) in zip(path, path[1:]):
            branch_length = self.get_path_length(c_node, n_node)

            if (branch_length > root_at):
                # The midpoint lies on this branch; split it in two
                left_len = root_at
                right_len = branch_length - root_at

                self.remove_connection(c_node, n_node)
                self.add_connection(None, c_node, left_len)
                self.add_connection(None, n_node, right_len)

                return None
            elif branch_length == root_at:
                # The midpoint falls exactly on an existing node
                return n_node
            root_at -= branch_length
        assert False # pragma: no coverage

    ################################################################################
    ################################################################################
    ## Functions relating to NEWICK rooting on taxa

    def reroot_on_taxa(self, taxa):
        """Reroots the graph on the branch above the smallest clade containing
        all of the named taxa (outgroup), and returns the rebuilt tree."""
        taxa = safe_coerce_to_frozenset(taxa)
        if not taxa:
            raise ValueError("No taxa in outgroup")

        clades = self._collect_clades()
        root_on = self._collect_nodes_from_names(taxa)
        # Because None is the id of the root atm: # pylint: disable=W1111
        root = self._create_root_with_clade(clades, root_on)

        return self.rebuild_tree(root, root)

    def _collect_nodes_from_names(self, taxa):
        """Maps a set of leaf names to the corresponding node ids; raises
        ValueError if any name is unknown, or if the set covers every leaf."""
        known_taxa = set()
        for (node_id, name) in self.names.iteritems():
            if self.is_leaf(node_id):
                known_taxa.add(name)

        unknown_taxa = taxa - known_taxa
        if unknown_taxa:
            raise ValueError("Cannot root on unknown taxa: %s" % (", ".join(unknown_taxa),))
        elif not (known_taxa - taxa):
            raise ValueError("Cannot root on every taxa in tree")

        return frozenset(key for (key, name) in self.names.iteritems() if name in taxa)

    def _collect_clades(self):
        """Returns a dict of dicts, in which [a][b] is the frozenset of leaf
        nodes reached by following the branch from node 'a' to node 'b'."""
        clades = {}
        for (node_a, connections) in self.connections.iteritems():
            for node_b in connections:
                self._collect_clade_from(clades, node_a, node_b)
        return clades

    def _collect_clade_from(self, cache, p_node, c_node):
        # Recursively collects the leaves reachable from 'c_node' without
        # passing back through 'p_node', memoizing results in 'cache'
        c_clade = get_in(cache, (p_node, c_node), set())
        if not c_clade:
            if self.is_leaf(c_node):
                c_clade.add(c_node)

            for n_node in self.connections[c_node]:
                if n_node != p_node:
                    c_clade.update(self._collect_clade_from(cache, c_node, n_node))
            set_in(cache, (p_node, c_node), frozenset(c_clade))
        return c_clade

    def _create_root_with_clade(self, clades, taxa):
        """Creates a new root node (id None) on the branch above the smallest
        clade containing every node id in 'taxa', splitting the length of that
        branch (if any) evenly between the two new branches."""
        root_key, root_clade, root_length = None, None, None
        for (p_node, connections) in clades.iteritems():
            for (n_node, clade) in connections.iteritems():
                # Prefer the smallest clade that still contains all taxa
                if (root_clade is None) or (len(clade) < len(root_clade)):
                    if taxa.issubset(clade):
                        root_key = (p_node, n_node)
                        root_clade = clade
                        root_length = self.get_path_length(p_node, n_node)

        p_node, n_node = root_key
        if root_length is not None:
            root_length = float(root_length) / 2.0

        self.remove_connection(p_node, n_node)
        self.add_connection(None, p_node, root_length)
        self.add_connection(None, n_node, root_length)

        return None

    ################################################################################
    ################################################################################
    ## Functions relating to calculating bootstrap support
    def get_clade_names(self):
        """Returns the set of clades in the graph, with each clade represented
        as a frozenset of the names of the leaf nodes it contains."""
        result = set()
        for (_, connections) in self._collect_clades().iteritems():
            for (_, clade) in connections.iteritems():
                result.add(frozenset(self.names[node_id] for node_id in clade))
        return result
diff --git a/paleomix/common/formats/fasta.py b/paleomix/common/formats/fasta.py
new file mode 100644
index 0000000..ba8d91f
--- /dev/null
+++ b/paleomix/common/formats/fasta.py
@@ -0,0 +1,150 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import types
+
+import pysam
+
+from paleomix.common.utilities import \
+ fragment, \
+ split_before, \
+ Immutable, \
+ TotallyOrdered
+from paleomix.common.fileutils import open_ro
+from paleomix.common.formats._common import FormatError
+
+
class FASTAError(FormatError):
    """Exception raised on malformed FASTA records or files."""
    pass
+
+
class FASTA(TotallyOrdered, Immutable):
    """Immutable record representing a single FASTA sequence: a name, any
    meta information (the remainder of the header line, or None), and the
    sequence itself."""

    def __init__(self, name, meta, sequence):
        # 'types.StringTypes' covers both 'str' and 'unicode' (Python 2)
        if not (name and isinstance(name, types.StringTypes)):
            raise FASTAError("FASTA name must be a non-empty string")
        elif not (isinstance(meta, types.StringTypes) or (meta is None)):
            raise FASTAError("FASTA meta must be a string, or None")
        elif not isinstance(sequence, types.StringTypes):
            raise FASTAError("FASTA sequence must be a string")

        Immutable.__init__(self,
                           name=name,
                           meta=meta,
                           sequence=sequence)

    def write(self, fileobj=sys.stdout):
        """Prints a FASTA sequence (iterable), wrapping long sequences at 60
        characters."""
        fileobj.write(repr(self))

    @classmethod
    def from_lines(cls, lines):
        """Parses FASTA sequences found in a sequence of lines, yielding a
        FASTA record for each entry. No assumptions are made about the
        line-lengths. Raises FASTAError for unnamed records, or records
        without any sequence lines."""
        lines = (line.rstrip() for line in lines)
        # Each record starts at a line beginning with '>'
        for record in split_before(lines, lambda v: v.startswith(">")):
            name = record[0]
            if (not name.startswith(">")) or (len(name) == 1):
                raise FASTAError("Unnamed FASTA record")
            elif len(record) == 1:
                raise FASTAError("FASTA record does not contain sequence: %s"
                                 % (name[1:],))

            # Split out any meta information
            name_and_meta = name[1:].split(None, 1)
            if len(name_and_meta) < 2:
                name_and_meta.append(None)
            name, meta = name_and_meta

            yield FASTA(name=name,
                        meta=meta,
                        sequence="".join(record[1:]))

    @classmethod
    def from_file(cls, filename):
        """Reads an unindexed FASTA file, returning a sequence of
        tuples containing the name and sequence of each entry in
        the file. The FASTA file may be GZIP/BZ2 compressed."""
        fasta_file = open_ro(filename)
        try:
            for record in FASTA.from_lines(fasta_file):
                yield record
        finally:
            fasta_file.close()

    @classmethod
    def index_and_collect_contigs(cls, filename):
        """Creates an index (.fai; if it does not already exist) for a FASTA
        file using 'pysam', and returns a list of (contig, length) tuples for
        the contigs listed in that file; if the .fai file can not be created,
        or if the FASTA file contains sequences with identical names, then a
        FASTAError is raised.
        """
        fai_filename = filename + ".fai"
        if not os.path.exists(fai_filename):
            if not os.access(os.path.dirname(filename), os.W_OK):
                message = \
                    "FASTA index is missing, but folder is\n" \
                    "not writable, so it cannot be created:\n" \
                    " Filename = %s\n\n" \
                    "Either change permissions on the folder, or move\n" \
                    "the FASTA file to different location." % (filename,)
                raise FASTAError(message)

            # Use pysam to index the file
            pysam.Fastafile(filename).close()

        names = set()
        contigs = []
        with open(fai_filename) as faihandle:
            for line in faihandle:
                # First two columns of a .fai file are contig name and length
                name, length, _ = line.split(None, 2)
                if name in names:
                    raise FASTAError("Reference contains multiple identically "
                                     "named sequences:\n Path = %r\n Name = "
                                     "%r\nPlease ensure that sequences have "
                                     "unique names" % (filename, name))
                names.add(name)
                contigs.append((name, int(length)))

        return contigs

    def __lt__(self, other):
        """Orders FASTA records by their (name, meta, sequence) tuples; the
        remaining comparisons are derived via TotallyOrdered."""
        if not isinstance(other, FASTA):
            return NotImplemented

        return (self.name, self.meta, self.sequence) \
            < (other.name, other.meta, other.sequence)

    def __hash__(self):
        return hash((self.name, self.meta, self.sequence))

    def __repr__(self):
        """Returns string representation of FASTA sequence, using the standard,
        FASTA file format, wrapping long sequences at 60 characters.
        """
        name = self.name
        if self.meta:
            name = "%s %s" % (name, self.meta)
        return ">%s\n%s\n" % (name, "\n".join(fragment(60, self.sequence)))
diff --git a/paleomix/common/formats/fastq.py b/paleomix/common/formats/fastq.py
new file mode 100644
index 0000000..f45f8d3
--- /dev/null
+++ b/paleomix/common/formats/fastq.py
@@ -0,0 +1,76 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+
+# Quality score offsets for Phred (or similar) scores in FASTQ reads (33 or 64)
+OFFSET_33 = 33
+OFFSET_64 = 64
+# Quality score found in both ranges that are unique to each offset,
+# suggesting that the list contains mixed quality offsets, or invalid data.
+OFFSET_BOTH = "BOTH"
+# No quality scores in the expected range of ASCII values (33 .. 105)
+OFFSET_MISSING = "MISSING"
+# Quality scores are in the ASCII range 59 .. 74, which could signify
+# low-quality reads with offset 64, or high-quality reads with offset 33
+OFFSET_AMBIGIOUS = "AMBIGIOUS"
+
+
+def classify_quality_strings(quality_strings):
+ """Takes a sequence of quality strings from FASTQ"""
+ counts = [0] * 256
+ for read in quality_strings:
+ for char in read:
+ counts[ord(char)] += 1
+
+ return _identify_format(counts)
+
+
+def _identify_format(counts):
+ """Given a list representing counts of ASCII characters found in one or
+ more FASTQ quality strins, this function attempts to identify the offset
+ used to encode the quality scores.
+
+ The following constants may be returned:
+ - OFFSET_33: Offset identified as being 33
+ - OFFSET_64: Offset identified as being 64
+ - OFFSET_BOTH: Both offset 33 and 64 found, mixed file? (error)
+ - OFFSET_MISSING: No quality scores found, wrong file? (error)
+ - OFFSET_AMBIGIOUS: Qualities could be either offset. (warning)
+ """
+ # The range of scores that can unambigiously be identified
+ # as belonging to Phred scores with offset 33 or 64. Scores
+ # in between could potentially signify either offset
+ # See e.g. http://en.wikipedia.org/wiki/FASTQ_format#Encoding
+ has_offset_33_scores = any(counts[33:59])
+ has_ambigious_scores = any(counts[59:75])
+ has_offset_64_scores = any(counts[75:105])
+
+ if has_offset_33_scores:
+ if has_offset_64_scores:
+ return OFFSET_BOTH
+ return OFFSET_33
+ elif has_offset_64_scores:
+ return OFFSET_64
+ elif has_ambigious_scores:
+ return OFFSET_AMBIGIOUS
+ return OFFSET_MISSING
diff --git a/paleomix/common/formats/msa.py b/paleomix/common/formats/msa.py
new file mode 100644
index 0000000..f0ba2b7
--- /dev/null
+++ b/paleomix/common/formats/msa.py
@@ -0,0 +1,230 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from itertools import izip
+from collections import defaultdict
+
+from paleomix.common.sequences import split
+from paleomix.common.fileutils import open_ro
+from paleomix.common.formats.fasta import FASTA, FASTAError
+from paleomix.common.sequences import NT_CODES, encode_genotype
+from paleomix.common.utilities import safe_coerce_to_frozenset
+
+
class MSAError(FASTAError):
    """Exception raised on invalid multiple sequence alignments."""
    pass
+
+
class MSA(frozenset):
    """Represents a Multiple Sequence Alignment of FASTA records."""

    def __new__(cls, sequences):
        # MSA subclasses (immutable) frozenset, so records are collected and
        # checked for duplicate names before the set is instantiated
        records, names = [], set()
        for record in sequences:
            if record.name in names:
                raise MSAError("Duplicate name found in FASTA records: %r" % record.name)
            records.append(record)
            names.add(record.name)

        if not records:
            raise MSAError("MSA does not contain any sequences")

        instance = frozenset.__new__(cls, records)
        MSA.validate(instance)
        return instance

    def seqlen(self):
        """Returns the length of the sequences in the MSA."""
        return len(iter(self).next().sequence)

    def exclude(self, names):
        """Builds a new MSA that excludes the named set of records."""
        _, excluded, _ = self._group(names)
        return MSA(excluded)

    def select(self, names):
        """Builds a new MSA that includes only the named set of records."""
        included, _, _ = self._group(names)
        return MSA(included)

    def reduce(self):
        """Returns a new MSA containing only the columns in which at least one
        position is called (i.e. not 'N', 'n', or '-'); returns None if no
        such columns exist."""
        columns = []
        uncalled = frozenset("Nn-")
        for column in izip(*(record.sequence for record in self)):
            if (frozenset(column) - uncalled):
                columns.append(column)

        if not columns:
            return None

        records = []
        # Transpose the retained columns back into per-record sequences;
        # izip pairs each record with its own filtered sequence, as both
        # iterations are over 'self' in the same order
        for (record, sequence) in izip(self, izip(*columns)):
            records.append(FASTA(record.name, record.meta, "".join(sequence)))

        return MSA(records)

    def filter_singletons(self, to_filter, filter_using):
        """Returns a new MSA in which positions in the record named
        'to_filter' that are not supported by any of the records named in
        'filter_using' are replaced; replaced positions are set to the IUPAC
        code of the supported nucleotides (lower-cased), or 'N' when there is
        no overlap at all."""
        included, excluded, to_filter \
            = self._group(filter_using, to_filter)

        sequence = list(to_filter.sequence)
        sequences = [record.sequence.upper() for record in included]
        for (index, nts) in enumerate(zip(*sequences)):
            current_nt = sequence[index].upper()
            if current_nt in "N-":
                # Uncalled positions are left untouched
                continue

            # Collect the nucleotides (expanding IUPAC codes) observed at
            # this position among the records used for filtering
            allowed_nts = set()
            for allowed_nt in nts:
                if allowed_nt not in "N-":
                    allowed_nts.update(NT_CODES[allowed_nt])
            filtered_nts = frozenset(NT_CODES[current_nt]) & allowed_nts

            if not filtered_nts:
                filtered_nts = "N"

            genotype = encode_genotype(filtered_nts)
            if genotype != current_nt:
                # Modified positions are marked by lower-casing
                sequence[index] = genotype.lower()
        new_record = FASTA(to_filter.name,
                           to_filter.meta,
                           "".join(sequence))

        return MSA([new_record] + included + excluded)

    def split(self, split_by = "123"):
        """Splits a MSA and returns a dictionary of keys to MSAs,
        using the keys in the 'split_by' parameter at the top
        level. See also paleomix.common.sequences.split."""
        self.validate(self)
        if not split_by:
            raise TypeError("No partitions to split by specified")

        results = dict((key, set()) for key in split_by)
        for record in self:
            for (key, partition) in split(record.sequence, split_by).iteritems():
                results[key].add(FASTA(record.name, None, partition))

        # Convert each collected set of records into a proper MSA
        for (key, value) in results.items():
            results[key] = MSA(value)

        return results

    @classmethod
    def join(cls, *msas):
        """Merge multiple MSAs into a single MSA, by concatenating sequences in
        the order of the passed MSAs. Sequences are joined by name, and all MSAs
        must therefore contain the same set of sequence names. Meta information
        is not preserved."""
        cls.validate(*msas)

        merged = defaultdict(list)
        for msa in msas:
            for record in msa:
                merged[record.name].append(record.sequence)

        sequences = []
        for (name, sequence) in merged.iteritems():
            sequences.append(FASTA(name, None, "".join(sequence)))
        return MSA(sequences)

    @classmethod
    def from_lines(cls, lines):
        """Parses a MSA from a file/list of lines in FASTA format, and returns
        the resulting MSA. Meta information included after the first space in
        the header of each sequence is preserved:
          >NAME META-INFORMATION
          SEQUENCE
        """
        return MSA(FASTA.from_lines(lines))

    @classmethod
    def from_file(cls, filename):
        """Reads a MSA from the specified filename. The file may
        be uncompressed, gzipped or bzipped. See also 'MSA.from_lines'."""
        fasta_file = open_ro(filename)
        try:
            return MSA.from_lines(fasta_file)
        except MSAError, error:
            # Re-raised with the filename included, for context
            raise MSAError("%s in file %r" % (error, filename))
        finally:
            fasta_file.close()

    def to_file(self, fileobj):
        """Writes the records to 'fileobj' in FASTA format, in sorted order."""
        for fst in sorted(self):
            fileobj.write(str(fst))

    @classmethod
    def validate(cls, *msas):
        """Validates one or more MSAs, requiring:
        1. Identical sets of sequence names across all MSAs.
        2. That all names are non-empty strings.
        3. That all sequences are of the same length (per MSA).
        4. That no empty MSA (no sequences) are specified."""
        if not msas:
            raise TypeError("No MSAs given as arguments")

        seqs_all = msas[0].names()
        seqs_common = set(seqs_all)
        for msa in msas:
            if len(set(len(record.sequence) for record in msa)) != 1:
                raise MSAError("MSA contains sequences of differing lengths")

            seqs_all.update(msa.names())
            seqs_common &= set(msa.names())

        if seqs_all != seqs_common:
            raise MSAError("Some sequences not found in all MSAs: '%s'" \
                % ("', '".join(seqs_all - seqs_common),))

    def __repr__(self):
        def _fasta_to_str(fst):
            return "FASTA(%r, %r, %r)" % \
                (fst.name, fst.meta, fst.sequence)
        return "MSA(%s)" % (", ".join(map(_fasta_to_str, sorted(self))))

    def names(self):
        """Returns the set of names of the records in the MSA."""
        return set(record.name for record in self)

    def _group(self, selection, extra = None):
        # Partitions the records into those named in 'selection', those that
        # are not, and (optionally) the single record named by 'extra'
        selection = safe_coerce_to_frozenset(selection)
        if (extra in selection):
            raise MSAError("Key used for multiple selections: %r" % extra)
        elif not selection:
            raise ValueError("No FASTA names given")

        missing_keys = selection - self.names()
        if missing_keys:
            raise KeyError("Key(s) not found: %r" % (", ".join(map(str, missing_keys))))

        included, excluded, other = [], [], None
        for record in self:
            if record.name in selection:
                included.append(record)
            elif record.name != extra:
                excluded.append(record)
            else:
                other = record

        return included, excluded, other
diff --git a/paleomix/common/formats/newick.py b/paleomix/common/formats/newick.py
new file mode 100644
index 0000000..66e8028
--- /dev/null
+++ b/paleomix/common/formats/newick.py
@@ -0,0 +1,357 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Required due to use of NotImplementedError in setattr:
+# pylint: disable=R0921
+import re
+
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple, \
+ Immutable, \
+ TotallyOrdered
+from paleomix.common.formats._graph import \
+ GraphError, \
+ _Graph
+
+
+class NewickError(GraphError):
+ pass
+
+
+class NewickParseError(NewickError):
+ """Exception raised if errors occur during parsing
+ of Newick strings."""
+ pass
+
+
+class Newick(TotallyOrdered, Immutable):
+ """Immutable object representing a Newick node.
+
+ Nodes are classified as either internal nodes (have children),
+ or leaf nodes (does not have children). A node MUST either have
+ 1 or more child-nodes, or have a name and/or a length. This is to
+ ensure that nodes can be represented in an unambiguous manner
+ using the Newick format.
+
+ No assumptions are made about the type of the 'name' and the 'length'
+ properties when simply parsing the tree, and these are simply converted
+ into strings when the Newick string is generated. However, additional
+ constraints apply when unrooting/rerooting trees (see below). """
+
+ def __init__(self, name = None, length = None, children = None):
+ """See class documentation for constraints."""
+
+ name = name or None
+ length = length or None
+ children = tuple(children or ())
+ nw_hash = hash((name, length, children))
+ Immutable.__init__(self,
+ name = name,
+ length = length,
+ children = children,
+ _hash = nw_hash)
+
+ if not (self.children or self.name or self.length):
+ raise NewickError("Leaf nodes MUST have either a name or a length")
+
+ # Ensure that these values are hashable
+ hash(self.name)
+ hash(self.length)
+
+ weight = 0
+ for child in self.children:
+ if not isinstance(child, Newick):
+ raise TypeError("Child nodes must be Newick nodes")
+ weight += 1
+ object.__setattr__(self, "_weight", weight)
+
+
+ @property
+ def is_leaf(self):
+ """Returns true if the node is a leaf (has no children)."""
+ return not self.children
+
+
+ def get_leaf_nodes(self):
+ """Returns iterable for leaf-nodes accessible from this node."""
+ if not self.is_leaf:
+ for child in self.children:
+ for leaf in child.get_leaf_nodes():
+ yield leaf
+ else:
+ yield self
+
+
+ def get_leaf_names(self):
+ for node in self.get_leaf_nodes():
+ yield node.name
+
+
+ def reroot_on_taxa(self, taxa):
+ """Returns the Newick tree from this node, but rooted on the midpoint
+ of the branch leading to one or more taxa. Note that the taxa are not
+ required to form a clade. If the taxa do not form a monophyletic clade,
+ then the outgroup will include more taxa than those passed to the
+ function."""
+ return _NewickGraph(self).reroot_on_taxa(taxa)
+
+
+ def reroot_on_midpoint(self):
+ """Returns the newick tree from this node, but rooted on the midpoint
+ of the tree. That is to say that a root node is added at the exact
+ midpoint of the longest path in the unrooted tree. If this midpoint
+ lies at an existing internal node, then this node is made the root.
+
+ Note that the sorting of nodes is not preserved, and that any
+ uninformative nodes (lacking name/length, while connecting two
+ other nodes, e.g. the old root) are spliced out.
+
+ All nodes must have a length of zero or greater (no missing values
+ are allowed), but note that rerooting behavior around nodes with
+ length zero may yield unexpected results."""
+ if len(list(self.get_leaf_nodes())) < 2:
+ return self # No meaningful way to reroot such trees
+
+ return _NewickGraph(self).reroot_on_midpoint()
+
+
+ def add_support(self, bootstraps, fmt = "{Support}"):
+ """Adds support values to the current tree, based on a set of trees containing
+ the same taxa. It is assumed that the support trees represent unrooted or
+ arbitrarily rooted trees, and no weight is given to the rooted topology of these
+ trees.
+
+ The main tree should itself be rooted, and the topology and ordering of this
+ tree is preserved, with node-names updated using the formatting string 'fmt'.
+
+ Formatting is carried out using str.format, with these fields:
+ {Support} -- The total number of trees in which a clade is supported.
+ {Percentage} -- The percentage of trees in which a clade is supported (float).
+ {Fraction} -- The fraction of trees in which a clade is supported (float).
+
+ For example, typical percentage support-values can be realized by setting 'fmt'
+ to the value "{Percentage:.0f}" to produce integer values.
+ """
+ clade_counts = {}
+ leaf_names_lst = list(self.get_leaf_names())
+ leaf_names = frozenset(leaf_names_lst)
+ if len(leaf_names) != len(leaf_names_lst):
+ raise NewickError("Cannot add support values to trees with duplicate leaf names")
+
+ bootstraps = safe_coerce_to_tuple(bootstraps)
+ for support_tree in bootstraps:
+ support_tree_names = frozenset(support_tree.get_leaf_names())
+ if leaf_names != support_tree_names:
+ raise NewickError("Support tree does not contain same set of leaf nodes")
+
+ support_graph = _NewickGraph(support_tree)
+ for clade in support_graph.get_clade_names():
+ clade_counts[clade] = clade_counts.get(clade, 0) + 1
+
+ return self._add_support(self, len(bootstraps), clade_counts, fmt)
+
+
+ @classmethod
+ def from_string(cls, string):
+ """Parses a Newick string and returns a representation of the tree.
+ See e.g. http://en.wikipedia.org/wiki/Newick_format
+
+ Note that implicit nodes, such as (), (A,), and the like are not
+ allowed, as they cannot always be represented/parsed in an unambiguous
+ manner. Thus all leaf nodes must have a name and/or a length."""
+ tokens = _tokenize(string)
+ if tokens and tokens[0] == "(":
+ top_node = _parse_tokens(tokens)
+ else:
+ top_node = _parse_child(tokens)
+
+ if tokens != [";"]:
+ raise NewickParseError("Missing terminating semi-colon")
+
+ return top_node
+
+
+ def __lt__(self, other):
+ """See TotallyOrdered"""
+ if not isinstance(other, Newick):
+ return NotImplemented
+
+ # pylint: disable=W0212
+ return (-self._weight, self.name, self.length, self.children) \
+ < (-other._weight, other.name, other.length, other.children)
+
+
+ def __hash__(self):
+ """Hashing function, see 'hash'."""
+ return self._hash
+
+
+ def __repr__(self):
+ """Representation corresponds to the Newick string for the (sub)tree,
+ which can be parsed by 'from_string'."""
+ return "%s;" % (self._to_str(),)
+
+
+ def _to_str(self):
+ fields = []
+ if self.children:
+ fields.append("(")
+ for child in self.children:
+ fields.append(child._to_str()) # pylint: disable=W0212
+ fields.append(",")
+ fields.pop()
+ fields.append(")")
+ if self.name is not None:
+ fields.append(str(self.name))
+ if self.length is not None:
+ fields.append(":")
+ fields.append(str(self.length))
+ return "".join(fields)
+
+
+ def _add_support(self, node, total, clade_counts, fmt):
+ """Recursively annotates a subtree with support values,
+ excepting leaf nodes (where the name is preserved) and
+ the root node (where the name is cleared)."""
+ if node.is_leaf:
+ return node
+
+ clade = frozenset(leaf.name for leaf in node.get_leaf_nodes())
+ support = clade_counts.get(clade, 0)
+ name = fmt.format(Support = support,
+ Percentage = (support * 100.0) / (total or 1),
+ Fraction = (support * 1.0) / (total or 1))
+
+ children = []
+ for child in node.children:
+ children.append(self._add_support(child, total, clade_counts, fmt))
+
+ return Newick(name = (None if (node is self) else name),
+ length = node.length,
+ children = children)
+
+
+
+
+
+################################################################################
+################################################################################
+## Functions related to NEWICK parsing
+
+_TOKENIZER = re.compile("([():,;])")
+_NODE_KEYS = frozenset(("name", "length", "children"))
+
+
+def _tokenize(string):
+ result = []
+ for field in _TOKENIZER.split(string):
+ field = field.strip()
+ if field:
+ result.append(field)
+ return result
+
+
+def _parse_tokens(tokens):
+ assert tokens and tokens[0] == "("
+
+ tokens.pop(0)
+ child, children = None, []
+ while tokens and (tokens[0] not in ");"):
+ if tokens[0] == ",":
+ children.append(child)
+ tokens.pop(0)
+ child = _parse_child(tokens)
+ children.append(child)
+
+ if any(child is None for child in children):
+ raise NewickParseError("Implicit leaf nodes (no name OR length) are not allowed")
+ elif not tokens or (tokens[0] != ")"):
+ raise NewickParseError("Malformed Newick string, contains unbalanced parantheses")
+ tokens.pop(0)
+
+ return _parse_child(tokens, children = children)
+
+
+def _parse_child(tokens, children = None):
+ if tokens and tokens[0] == "(":
+ return _parse_tokens(tokens)
+
+ name, length = None, None
+ while tokens and (tokens[0] not in ",);"):
+ if (tokens[0] == ":"):
+ if length is not None:
+ raise NewickParseError("Node has multiple length values")
+ tokens.pop(0)
+ if tokens[0] in ",);":
+ raise NewickParseError("Missing length value")
+ length = tokens.pop(0).strip()
+ else:
+ name = tokens.pop(0).strip()
+
+ if not (name or length or children):
+ raise NewickParseError("Parsing of implicit nodes not supported")
+
+ return Newick(name = name,
+ length = length,
+ children = children)
+
+
+
+################################################################################
+################################################################################
+## Class related to tree manipulations
+
+class _NewickGraph(_Graph):
+ def __init__(self, node):
+ _Graph.__init__(self)
+ self._collect_names_and_blengths(node)
+ self.prune_uninformative_nodes()
+
+
+ def _collect_names_and_blengths(self, c_node):
+ c_node_id = id(c_node)
+
+ self.set_name(c_node_id, c_node.name)
+ for child in c_node.children:
+ child_id = id(child)
+ self.add_connection(c_node_id, child_id, child.length)
+ self._collect_names_and_blengths(child)
+
+
+ def rebuild_tree(self, parent_id, node_id):
+ """Rebuilds a newick tree starting at a node with id
+ 'node_id' and a parent with id 'parent_id' (or the
+ same value as 'node_id' if a root node)."""
+
+ children = []
+ for child_id in self.connections[node_id]:
+ if child_id != parent_id:
+ children.append(self.rebuild_tree(node_id, child_id))
+ children.sort()
+
+ blength = self.connections.get(parent_id).get(node_id)
+ if isinstance(blength, float):
+ blength = repr(blength)
+
+ return Newick(name = self.names.get(node_id),
+ length = blength,
+ children = children)
diff --git a/paleomix/common/formats/phylip.py b/paleomix/common/formats/phylip.py
new file mode 100644
index 0000000..f27029a
--- /dev/null
+++ b/paleomix/common/formats/phylip.py
@@ -0,0 +1,90 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+from paleomix.common.utilities import grouper
+from paleomix.common.formats.msa import MSA
+
+
+_NUM_BLOCKS = 6
+_BLOCK_SIZE = 10
+_BLOCK_SPACING = 2
+_MAX_NAME_LENGTH = 30
+_NAME_ENDS_AT = 36
+_LINE_SIZE = _NUM_BLOCKS * _BLOCK_SIZE + (_NUM_BLOCKS - 1) * _BLOCK_SPACING
+
+
+def sequential_phy(msa, add_flag = False, max_name_length = _MAX_NAME_LENGTH):
+ MSA.validate(msa)
+ header = "%i %i" % (len(msa), msa.seqlen())
+ if add_flag:
+ header += " S"
+
+ spacing = " " * _BLOCK_SPACING
+ result = [header, ""]
+ for record in sorted(msa):
+ result.append(record.name[:max_name_length])
+
+ blocks = grouper(_BLOCK_SIZE, record.sequence, fillvalue = "")
+ lines = grouper(_NUM_BLOCKS, blocks)
+ for line in lines:
+ result.append(spacing.join("".join(block) for block in line if block))
+
+ return "\n".join(result)
+
+
+
+def interleaved_phy(msa, add_flag = False, max_name_length = _MAX_NAME_LENGTH):
+ MSA.validate(msa)
+ header = "%i %i" % (len(msa), msa.seqlen())
+ if add_flag:
+ header += " I"
+ result = [header, ""]
+
+ padded_len = min(max_name_length, max(len(name) for name in msa.names())) + 2
+ padded_len -= padded_len % -(_BLOCK_SIZE + _BLOCK_SPACING) + _BLOCK_SPACING
+
+ streams = []
+ spacing = " " * _BLOCK_SPACING
+ for record in sorted(msa):
+ name = record.name[:max_name_length]
+ padding = (padded_len - len(name)) * " "
+
+ lines = []
+ line = [name, padding]
+ for block in grouper(_BLOCK_SIZE, record.sequence, fillvalue = ""):
+ block = "".join(block)
+ if sum(len(segment) for segment in line) >= _LINE_SIZE:
+ lines.append("".join(line))
+ line = [block]
+ else:
+ line.extend((spacing, block))
+
+ lines.append("".join(line))
+ streams.append(lines)
+
+ for rows in zip(*streams):
+ result.extend(row for row in rows)
+ result.append("")
+ result.pop()
+
+ return "\n".join(result)
diff --git a/paleomix/common/makefile.py b/paleomix/common/makefile.py
new file mode 100644
index 0000000..5b66dec
--- /dev/null
+++ b/paleomix/common/makefile.py
@@ -0,0 +1,901 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Generalized methods for parsing/validating "makefiles" in YAML format.
+
+The following example will use a imagined makefile for 'vcf_filter', which
+takes a set of input files, each of which is assigned an output file, and
+each of which may have a set of filters (in addition to a set of defaults):
+
+Example makefile in YAML format:
+-------------------------------------------------------------------------------
+| Defaults:
+| --min-mapq-bias: 1e-4
+| --min-end-distance-bias: 1e-4
+|
+| VCF_Files:
+| "path/to/file1.vcf":
+| Output_File: "path/to/output1.vcf"
+| Options:
+| --min-strand-bias: 1e-4
+| --min-baseq-bias: 1e-4
+| "path/to/file2.vcf":
+| Output_File: "path/to/output2.vcf"
+-------------------------------------------------------------------------------
+
+Such a makefile can be parsed into a dictionary using YAML, but to help us
+ensure that the makefile fits the expected layout (as above), we need to
+specify the structure of the makefile.
+
+Firstly, note that the options are specified twice, so we will make a re-usable
+specification for those. In this case, this can accomplished like so:
+-------------------------------------------------------------------------------
+| _SPECIFICATION_OF_OPTIONS = {
+| StringStartsWith("--") : Or(IsInt, IsFloat),
+| }
+-------------------------------------------------------------------------------
+
+or as so:
+-------------------------------------------------------------------------------
+| _SPECIFICATION_OF_OPTIONS = IsDictOf(StringStartsWith("--"),
+| Or(IsInt, IsFloat))
+-------------------------------------------------------------------------------
+
+In either case, we require that the options be a dictionary with string keys
+that start with "--", and that the values are either floats or integers. In
+this case the two methods are equivalent, but normally the first method would
+be preferred for more complex structures, while the second method is required
+if different sub-structures are possible. For example, to require EITHER a
+list of integers, or a dict of strings -> integers would have to be specified
+as so:
+-------------------------------------------------------------------------------
+| Or(IsListOf(IsInt), IsDictOf(IsStr, IsInt))
+-------------------------------------------------------------------------------
+
+Note that specification objects that do not take any parameters (IsInt, etc.)
+do not need to be instantiated. Thus one can use both 'IsInt' or 'IsInt()',
+whereas 'IsListOf', 'IsDictOf', etc. needs to be instantiated. This is purely
+ for convenience.
+
+
+Having specified the expected structure of the options, we can specify the
+remaining structure of the makefile:
+-------------------------------------------------------------------------------
+| _MAKEFILE_SPECIFICATION = {
+ | "Defaults" : _SPECIFICATION_OF_OPTIONS,
+|
+| "VCF_Files" : {
+| Str : {
+| "Output_File" : IsStr,
+| "Options" : _SPECIFICATION_OF_OPTIONS,
+| }
+| }
+| }
+-------------------------------------------------------------------------------
+
+Finally, we can specify default values. Defaults can be specified for almost
+all specification objects (excepting specifications for keys in dictionaries,
+sub-specification for logical operators, and a couple of others). Let's suppose
+that we always want a min/max depth set, even if the user did not include them
+in the defaults:
+-------------------------------------------------------------------------------
+| _SPECIFICATION_OF_OPTIONS = {
+| StringStartsWith("--") : Or(IsInt, IsFloat),
+| "--min-depth" : IsInt(default = 8),
+| "--max-depth" : IsInt(default = 100),
+| }
+-------------------------------------------------------------------------------
+
+These values would then be set, unless they were already set. Note that named
+keys are given precedence above specification objects, when validating
+key/value pairs. In other words, given this specification, the key
+"--min-depth" is ALWAYS valid (even if it would fail StringStartsWith("--"),
+and the value is ONLY checked against IsInt(default = 8).
+
+Bringing all this together, we could then parse the a file containing the YAML
+code shown above as follows:
+-------------------------------------------------------------------------------
+| makefile = read_makefile("/path/to/makefile.yaml",
+| _MAKEFILE_SPECIFICATION)
+-------------------------------------------------------------------------------
+
+which would yield the following dictionary:
+-------------------------------------------------------------------------------
+| {'Makefile': {'Defaults': {'--max-depth': 100,
+| '--min-depth': 8,
+| '--min-end-distance-bias': 0.001,
+| '--min-mapq-bias': 0.001},
+| 'VCF_Files': {'path/to/file1.vcf':
+| {'Options': {'--max-depth': 100,
+| '--min-baseq-bias': 0.005,
+| '--min-depth': 8,
+| '--min-strand-bias': 0.005},
+| 'Output_File': 'path/to/output1.vcf'},
+| 'path/to/file2.vcf':
+ | {'Output_File': 'path/to/output2.vcf'}}},
+| 'Statistics': {'Filename': 'makefile.yaml',
+| 'Hash': 'c0138fd4ffcbbc0dff2c82c6e7595ec38b01f532',
+| 'MTime': '2013-08-13 10:22:46.000000 '}}
+-------------------------------------------------------------------------------
+
+Note the actual contents of the makefile is found in the sub-dictionary
+"Makefile", while the sub-dictionary "Statistics" contains various information
+about the file itself.
+
+Unfortunately, the defaults are being applied to BOTH "Options" sub-trees,
+which makes it impossible to tell which values are supposed to be over-ridden
+for the files. To prevent this from happening, we can specify that defaults
+should NOT be applied, by using the WithoutDefaults wrapper object:
+-------------------------------------------------------------------------------
+| _MAKEFILE_SPECIFICATION = {
+ | "Defaults" : _SPECIFICATION_OF_OPTIONS,
+|
+| "VCF_Files" : {
+| Str : {
+| "Output_File" : IsStr,
+| "Options" : WithoutDefaults(_SPECIFICATION_OF_OPTIONS),
+| }
+| }
+| }
+-------------------------------------------------------------------------------
+
+Which yields the following structure following processing:
+-------------------------------------------------------------------------------
+| {'Makefile': {'Defaults': {'--max-depth': 100,
+| '--min-depth': 8,
+| '--min-end-distance-bias': 0.001,
+| '--min-mapq-bias': 0.001},
+| 'VCF_Files': {'path/to/file1.vcf':
+| {'Options': {'--min-baseq-bias': 0.005,
+| '--min-strand-bias': 0.005},
+| 'Output_File': 'path/to/output1.vcf'},
+| 'path/to/file2.vcf':
+| {'Output_File': 'path/to/output2.vcf'}}},
+| 'Statistics': {'Filename': 'makefile.yaml',
+| 'Hash': 'c0138fd4ffcbbc0dff2c82c6e7595ec38b01f532',
+| 'MTime': '2013-08-13 10:22:46.000000 '}}
+-------------------------------------------------------------------------------
+
+ If the file contents do not match the expected structure, a MakefileError is
+raised which describes the problem. For example, suppose that an "Output_File"
+ value has accidentally been left blank ('IsStr' requires a NON-EMPTY string):
+-------------------------------------------------------------------------------
+| Makefile requirement not met at ...:
+| Expected value(s): a non-empty string
+| Observed value(s): ''
+-------------------------------------------------------------------------------
+"""
+import os
+import copy
+import types
+import hashlib
+import datetime
+import operator
+
+import paleomix.yaml
+from paleomix.common.utilities import group_by_pred
+
+
+class MakefileError(RuntimeError):
+ """Raised if a makefile is unreadable, or does not meet specifications."""
+
+
+def read_makefile(filename, specification):
+ """Reads and parses a makefile using the given specification.
+
+ Returns a dictionary of the form
+ {
+ "Makefile": <parsed makefile>,
+ "Statistics": {
+ "Filename": <filename>,
+ "Hash": <SHA1 hash of makefile>,
+ "MTime": <Modification time of makefile>,
+ }
+ }
+ """
+ try:
+ with open(filename) as makefile:
+ string = makefile.read()
+ data = paleomix.yaml.safe_load(string)
+ except paleomix.yaml.error.YAMLError, error:
+ raise MakefileError(error)
+
+ mtime = os.path.getmtime(os.path.realpath(filename))
+ mtime_str = datetime.datetime.fromtimestamp(mtime).strftime("%F %T")
+ return {"Makefile": process_makefile(data, specification),
+ "Statistics": {"Filename": filename,
+ "Hash": hashlib.sha1(string).hexdigest(),
+ "MTime": mtime_str}}
+
+
+def process_makefile(data, specification, path=("root",), apply_defaults=True):
+ """Validates a makefile and applies defaults to missing keys.
+
+ Note that that default values are deep-copied before being set.
+ """
+ if isinstance(specification, WithoutDefaults):
+ specification = specification.specification
+ data = process_makefile(data, specification, path,
+ apply_defaults=False)
+ elif isinstance(specification, PreProcessMakefile):
+ data, specification = specification(path, data)
+ data = process_makefile(data, specification, path, apply_defaults)
+ elif _is_spec(specification):
+ _instantiate_spec(specification)(path, data)
+ elif isinstance(data, (dict, types.NoneType)) \
+ and isinstance(specification, dict):
+ # A limitation of YAML is that empty subtrees are equal to None;
+ # this check ensures that empty subtrees to be handled properly
+ if data is None:
+ data = {}
+
+ _process_default_values(data, specification, path, apply_defaults)
+
+ for cur_key in data:
+ ref_key = _get_matching_spec_or_value(cur_key,
+ specification,
+ path + (cur_key,))
+ data[cur_key] = process_makefile(data[cur_key],
+ specification[ref_key],
+ path + (cur_key,),
+ apply_defaults)
+ elif isinstance(data, (list, types.NoneType)) \
+ and isinstance(specification, list):
+ if not all(_is_spec(spec) for spec in specification):
+ raise TypeError("Lists contains non-specification objects (%r): %r"
+ % (_path_to_str(path), specification))
+ elif data is None: # See comment above
+ data = []
+
+ specification = IsListOf(*specification)
+ _instantiate_spec(specification)(path, data)
+ elif not isinstance(specification, (dict, list)):
+ raise TypeError("Unexpected type in makefile specification at %r: %r!"
+ % (_path_to_str(path), specification))
+ else:
+ raise MakefileError("Inconsistency between makefile specification and "
+ "current makefile at %s:\n Expected %s, "
+ "found %s %r!" % (_path_to_str(path),
+ type(specification).__name__,
+ type(data).__name__,
+ data))
+
+ return data
+
+
+###############################################################################
+###############################################################################
+# Unique 'value' used to specify that a MakefileSpec lacks a default value.
+DEFAULT_NOT_SET = object()
+# Unique 'value' used to specify that the user MUST supply a value
+REQUIRED_VALUE = object()
+
+
+class WithoutDefaults(object):
+ """Wrapper object, that tells 'process_makefile' not to apply
+ default values for the wrapped specification. See module docs
+ for example usage.
+ """
+
+ def __init__(self, specification):
+ self.specification = specification
+
+
+class PreProcessMakefile(object):
+ """Allows pre-processing of a part of a makefile prior to validation; when
+ encountered, the object is called with the current value, and is expected
+ to return a tuple containing (value, specification), which are then used
+ subsequently. This allows transformation of fields for backwards
+ compatibility.
+ """
+
+ def __call__(self, path, value):
+ """Must return (value, specification) tuple."""
+ raise NotImplementedError
+
+
+class MakefileSpec(object):
+ """Base-class for specifications, from which ALL specification
+ objects are expected to derive. Sub-classes must implement the
+ 'meets_spec' function, which must return True or False depending
+ on whether or not the given value meets the specification.
+ """
+
+ def __init__(self, description, default=DEFAULT_NOT_SET):
+ """description -- A string describing the specification.
+ default -- A default value, or DEFAULT_NOT_SET if not used. If a
+ value is set, it is copied before being applied."""
+
+ self.description = description
+ self.default = default
+ if (default not in (DEFAULT_NOT_SET, REQUIRED_VALUE)) \
+ and not self.meets_spec(default):
+ raise ValueError(("Default value does not meet requirements:\n"
+ " Expected value(s): %s\n"
+ " Observed value(s): %r\n")
+ % (description, default))
+
+ def __call__(self, path, value):
+ if not self.meets_spec(value):
+ raise MakefileError(("Makefile requirement not met at %r:\n"
+ " Expected value(s): %s\n"
+ " Observed value(s): %r\n"
+ " Observed type: %s")
+ % (_path_to_str(path), self.description,
+ value, type(value).__name__))
+
+ def meets_spec(self, _value):
+ """Return True if value meets the specification, False otherwise."""
+ raise NotImplementedError
+
+
+###############################################################################
+###############################################################################
+# Tests for basic types
+
+class IsInt(MakefileSpec):
+ """Require that the value is either an Int or a Long."""
+
+ def __init__(self, description="an integer", default=DEFAULT_NOT_SET):
+ MakefileSpec.__init__(self, description, default)
+
+ def meets_spec(self, value):
+ return isinstance(value, (types.IntType, types.LongType)) \
+ and not isinstance(value, types.BooleanType)
+
+
+class IsUnsignedInt(IsInt):
+ """Require that the value is either an Int or a Long, and >= 0."""
+
+ def __init__(self, description="an unsigned integer",
+ default=DEFAULT_NOT_SET):
+ IsInt.__init__(self, description, default)
+
+ def meets_spec(self, value):
+ return IsInt.meets_spec(self, value) & (value >= 0)
+
+
+class IsFloat(MakefileSpec):
+ """Require that the value is a float (does not cover integer types)."""
+
+ def __init__(self, description="a float", default=DEFAULT_NOT_SET):
+ MakefileSpec.__init__(self, description, default)
+
+ def meets_spec(self, value):
+ return isinstance(value, types.FloatType)
+
+
+class IsBoolean(MakefileSpec):
+ """Require that the value is a boolean (True/False)."""
+
+ def __init__(self, description="a boolean", default=DEFAULT_NOT_SET):
+ MakefileSpec.__init__(self, description, default)
+
+ def meets_spec(self, value):
+ return isinstance(value, types.BooleanType)
+
+
+class IsStr(MakefileSpec):
+ """Require that the value is a non-empty string."""
+
+ def __init__(self, description="a non-empty string",
+ default=DEFAULT_NOT_SET):
+ MakefileSpec.__init__(self, description, default)
+
+ def meets_spec(self, value):
+ return isinstance(value, types.StringTypes) and value
+
+
+class IsNone(MakefileSpec):
+ """Require that the value is None, typically signifying that
+ the value was not set in the makefile."""
+
+ def __init__(self, description="None or not set", default=DEFAULT_NOT_SET):
+ if default is not DEFAULT_NOT_SET:
+ raise NotImplementedError("IsNone does not support default values")
+ MakefileSpec.__init__(self, description, default)
+
+ def meets_spec(self, value):
+ return value is None
+
+
+class ValueMissing(MakefileSpec):
+ """Used to signify empty substructures in the makefile specification."""
+
+ def __init__(self, description="no values"):
+ MakefileSpec.__init__(self, description, DEFAULT_NOT_SET)
+
+ def meets_spec(self, _value):
+ return False
+
+
+###############################################################################
+###############################################################################
+# BinaryOperators
+
class _BinaryOperator(MakefileSpec):
    """Base class for binary operations.

    Takes an operator function assumed to accept the parameters
    (lvalue, rvalue), the rvalue to call that function with, and a
    description template of the form 'operator {rvalue}', from which a
    human readable description of the specification is built.

    If 'list_kword' is specified, the rvalue is assumed to be a sequence,
    and _list_values is used to render it in human readable form;
    otherwise repr() is used.
    """

    def __init__(self, description, default, opfunc, rvalue, key=None,
                 list_kword=None):
        self._operator = opfunc
        self._keyfunc = key
        self._rvalue = rvalue

        if list_kword is None:
            rvalue_repr = repr(rvalue)
        else:
            rvalue_repr = _list_values(rvalue, list_kword)

        MakefileSpec.__init__(self, description.format(rvalue=rvalue_repr),
                              default)

    def meets_spec(self, value):
        # Optionally transform the value before comparison (key-function).
        if self._keyfunc is not None:
            value = self._keyfunc(value)
        return self._operator(value, self._rvalue)
+
+
def _create_binary_operator(operator_func, description, list_kword=None):
    """Returns a specification class wrapping 'operator_func', which is
    assumed to be a function taking two arguments (lvalue, rvalue) and
    returning a boolean value.
    """

    class _BinaryOperatorImpl(_BinaryOperator):
        """Implements a binary operator specification."""

        def __init__(self, rvalue, key=None, description=description,
                     default=DEFAULT_NOT_SET):
            _BinaryOperator.__init__(self, description, default,
                                     operator_func, rvalue, key, list_kword)

    return _BinaryOperatorImpl
+
+
def _create_set_operator(operator_func, description):
    """Returns a specification class for set-based operations; values in
    the makefile are expected to be either lists or strings (the latter
    for case sensitive operations), and are coerced to frozensets before
    'operator_func' is applied.
    """

    def _operator(lvalue, rvalue):
        """Operator function for set based operations."""
        if isinstance(lvalue, (types.ListType,) + types.StringTypes):
            return bool(operator_func(frozenset(lvalue), rvalue))
        return False

    return _create_binary_operator(_operator, "%s {rvalue}" % (description,),
                                   "or")
+
+
# Relational specifications built from the standard comparison operators;
# each class takes the rvalue to compare against (e.g. ValueLT(10)).
ValueLT = _create_binary_operator(operator.lt, "value < {rvalue}")
ValueLE = _create_binary_operator(operator.le, "value <= {rvalue}")
ValueEQ = _create_binary_operator(operator.eq, "value = {rvalue}")
ValueGE = _create_binary_operator(operator.ge, "value >= {rvalue}")
ValueGT = _create_binary_operator(operator.gt, "value > {rvalue}")
# Case-sensitive membership test; see StringIn for a case-insensitive one.
ValueIn = _create_binary_operator(lambda lvalue, rvalue: lvalue in rvalue,
                                  "value in {rvalue}", "or")
# Case-sensitive set operations on lists / strings of values.
ValuesIntersect = _create_set_operator(frozenset.intersection, "contains")
ValuesSubsetOf = _create_set_operator(frozenset.issubset, "subset of")
+
+
+###############################################################################
+###############################################################################
+# Logical operators
+
class _MultipleSpecs(MakefileSpec):  # pylint: disable=W0223
    """Base-class for logical operators combining one or more specs.

    Instantiates every sub-spec, verifies that none of them carry a
    default value (defaults may only be set on the logical operator
    itself), and builds a combined description of the form
    '<prefix>(spec1)<join_by>(spec2)...<postfix>'.
    """

    def __init__(self, specs, kwargs, name, prefix="", postfix="",
                 join_by=" ", fmt="%s"):
        self._specs = [_instantiate_spec(spec) for spec in specs]
        if not self._specs:
            raise ValueError("No specification given to %r" % (name.title(),))

        for spec in self._specs:
            if spec.default is not DEFAULT_NOT_SET:
                raise ValueError("Default values cannot be set in specs "
                                 "given to logical operators")

        descriptions = [(fmt % (spec.description,)) for spec in self._specs]
        combined = "%s%s%s" % (prefix, join_by.join(descriptions), postfix)
        MakefileSpec.__init__(self, combined,
                              kwargs.get("default", DEFAULT_NOT_SET))
+
+
class And(_MultipleSpecs):
    """Requires that a value meets every one of the given specifications.

    A default value may be set for the 'And' spec itself, but not for the
    specifications passed to it.
    """

    def __init__(self, *specs, **kwargs):
        _MultipleSpecs.__init__(self, specs, kwargs, "And",
                                join_by=" and ", fmt="(%s)")

    def meets_spec(self, value):
        for spec in self._specs:
            if not spec.meets_spec(value):
                return False
        return True
+
+
class Or(_MultipleSpecs):
    """Requires that a value meets at least one of the given
    specifications.

    A default value may be set for the 'Or' spec itself, but not for the
    specifications passed to it.
    """

    def __init__(self, *specs, **kwargs):
        _MultipleSpecs.__init__(self, specs, kwargs, "Or",
                                join_by=" or ", fmt="(%s)")

    def meets_spec(self, value):
        for spec in self._specs:
            if spec.meets_spec(value):
                return True
        return False
+
+
class Xor(_MultipleSpecs):
    """Requires that a value meets ONE and ONLY ONE of two specifications.

    A default value may be set for the 'Xor' spec itself, but not for the
    specifications passed to it.
    """

    def __init__(self, *specs, **kwargs):
        if len(specs) != 2:
            raise ValueError("'Xor' takes exactly 2 specifications, not %i"
                             % (len(specs),))

        _MultipleSpecs.__init__(self, specs, kwargs, "Xor",
                                join_by=" xor ", fmt="(%s)")

    def meets_spec(self, value):
        # Exactly two specs are guaranteed by __init__.
        first, second = self._specs
        return operator.xor(first.meets_spec(value),
                            second.meets_spec(value))
+
+
class Not(_MultipleSpecs):
    """Requires that a value does NOT meet the given specification.

    A default value may be set for the 'Not' spec itself, but not for the
    specification passed to it.
    """

    def __init__(self, spec, **kwargs):
        _MultipleSpecs.__init__(self, [spec], kwargs, "Not",
                                prefix="not ", fmt="(%s)")

    def meets_spec(self, value):
        (spec,) = self._specs
        return not spec.meets_spec(value)
+
+
+###############################################################################
+###############################################################################
+# String operators
+#
+# In addition to providing string-specific operators (is uppercase, ends/starts
+# with), "in" and set operators are provided which do case-insensitive
+# comparisons. For case-sensitive operations, use the Value* specifications.
+
+
class StringIn(_BinaryOperator):
    """Require that values are found in a set of values. For strings, the
    comparison is done in a case-insensitive manner. For case-sensitive
    comparisons, see 'ValueIn'.
    """

    def __init__(self, rvalues, key=None,
                 description="one of {rvalue}, case-insensitive",
                 default=DEFAULT_NOT_SET):
        description = description.format(rvalue=_list_values(rvalues, "or"))
        rvalues = frozenset(map(_safe_coerce_to_lowercase, rvalues))

        # Fixed: 'key' was previously accepted but silently discarded;
        # it is now forwarded, matching the behavior of _StrSetOperator.
        _BinaryOperator.__init__(self, description, default,
                                 self._string_in_operator, rvalues, key)

    @classmethod
    def _string_in_operator(cls, lvalue, rvalues):
        """Implements case-insensitive 'in' operator."""
        return _safe_coerce_to_lowercase(lvalue) in rvalues
+
+
class _StrSetOperator(_BinaryOperator):
    """Base class for set operations involving case-insensitive strings."""

    def __init__(self, description, default, opfunc, rvalues, key=None):
        # Pre-lowercase the expected values once, so that meets_spec only
        # needs to lowercase the observed values.
        rvalues = frozenset(map(_safe_coerce_to_lowercase, rvalues))
        _BinaryOperator.__init__(self, description, default, opfunc, rvalues,
                                 key)

    def meets_spec(self, value):
        # Only lists and strings can be treated as sets of values.
        if not isinstance(value, (types.ListType,) + types.StringTypes):
            return False

        # Lowercase observed values to match the pre-lowercased rvalues.
        lvalues = frozenset(map(_safe_coerce_to_lowercase, value))
        return _BinaryOperator.meets_spec(self, lvalues)
+
+
class StringsIntersect(_StrSetOperator):
    """Require that a set of values overlap with a pre-defined set of values
    (as set in the constructor). For strings, values are compared in a case-
    insensitive manner. For case-sensitive comparisons, see 'ValuesIntersect'.
    """

    def __init__(self, rvalue, key=None,
                 description="contains {rvalue}, case-insensitive",
                 default=DEFAULT_NOT_SET):
        # Fixed typo in the user-visible description ("insentive").
        description = description.format(rvalue=_list_values(rvalue, "and/or"))

        _StrSetOperator.__init__(self, description, default,
                                 self._string_intersection_operator,
                                 frozenset(rvalue), key)

    @classmethod
    def _string_intersection_operator(cls, lvalue, rvalues):
        """Implements case-insensitive 'intersect' operator."""
        return bool(frozenset(lvalue).intersection(rvalues))
+
+
class StringsSubsetOf(_StrSetOperator):
    """Require that a set of values are a subset of a pre-defined set of values
    (as set in the constructor). For strings, values are compared in a case-
    insensitive manner. For case-sensitive comparisons, see 'ValuesSubsetOf'.

    Note that empty sets are always considered to be a subset of the
    pre-defined set.
    """

    def __init__(self, rvalue, key=None,
                 description="subset of {rvalue}, case-insensitive",
                 default=DEFAULT_NOT_SET):
        # Fixed typo in the user-visible description ("insentive").
        description = description.format(rvalue=_list_values(rvalue, "and"))

        _StrSetOperator.__init__(self, description, default,
                                 self._operator_func, frozenset(rvalue), key)

    @classmethod
    def _operator_func(cls, lvalue, rvalue):
        """Operator implementation."""
        return bool(frozenset(lvalue).issubset(rvalue))
+
+
class StringIsUppercase(IsStr):
    """Require that the value is an uppercase, non-empty string."""

    def __init__(self, default=DEFAULT_NOT_SET):
        IsStr.__init__(self, "an uppercase non-empty string", default)

    def meets_spec(self, value):
        # Call the base implementation directly instead of constructing a
        # throw-away IsStr instance on every check; this is consistent
        # with StringStartsWith / StringEndsWith below.
        return IsStr.meets_spec(self, value) and value.isupper()
+
+
class StringStartsWith(IsStr):
    """Require that the value is a non-empty string with a given prefix."""

    def __init__(self, prefix, default=DEFAULT_NOT_SET):
        assert prefix and isinstance(prefix, types.StringTypes)
        self._prefix = prefix
        IsStr.__init__(self, "a string with the prefix %r" % (prefix,),
                       default)

    def meets_spec(self, value):
        is_string = IsStr.meets_spec(self, value)
        return is_string and value.startswith(self._prefix)
+
+
class StringEndsWith(IsStr):
    """Require that the value is a non-empty string with a given postfix."""

    def __init__(self, postfix, default=DEFAULT_NOT_SET):
        assert postfix and isinstance(postfix, types.StringTypes)
        self._postfix = postfix
        IsStr.__init__(self, "a string with the postfix %r" % (postfix,),
                       default)

    def meets_spec(self, value):
        is_string = IsStr.meets_spec(self, value)
        return is_string and value.endswith(self._postfix)
+
+
+###############################################################################
+###############################################################################
+# Tests for collections
+
class IsListOf(_MultipleSpecs):
    """Require that the value is a list, every item of which matches at
    least one of the provided specifications. If no default value (i.e. a
    non-empty list) is required, then the shorthand syntax
      [IsType1, IsType2, ...]
    is preferred over the equivalent
      IsListOf(IsType1, IsType2, ...)
    """

    def __init__(self, *specs, **kwargs):
        _MultipleSpecs.__init__(self, specs, kwargs, "IsListOf",
                                prefix="[", postfix=", ...]",
                                join_by=" or ", fmt="(%s)")

    def meets_spec(self, value):
        if not isinstance(value, types.ListType):
            return False

        for item in value:
            if not any(spec.meets_spec(item) for spec in self._specs):
                return False
        return True
+
+
class IsDictOf(MakefileSpec):
    """Require that the value is a dict, the keys/values of which match the
    specifications provided for keys/values. If no default value (i.e. a
    dictionary) is required, then the shorthand syntax
      {IsType1: IsType2}
    is preferred over the equivalent
      IsDictOf(IsType1, IsType2)
    as the shorthand additionally allows multiple type-pairs to be given.
    """

    def __init__(self, key_spec, value_spec, default=DEFAULT_NOT_SET):
        self._key_spec = _instantiate_spec(key_spec)
        self._value_spec = _instantiate_spec(value_spec)
        # Defaults may only be set on the IsDictOf spec itself.
        if self._key_spec.default is not DEFAULT_NOT_SET:
            raise ValueError("Default values cannot be set in key-specs")
        elif self._value_spec.default is not DEFAULT_NOT_SET:
            raise ValueError("Default values cannot be set in value-specs")

        description = "{(%s) : (%s)}" \
            % (self._key_spec.description, self._value_spec.description)
        MakefileSpec.__init__(self, description, default)

    def meets_spec(self, value):
        if not isinstance(value, types.DictType):
            return False

        return all(self._key_spec.meets_spec(dict_key)
                   and self._value_spec.meets_spec(dict_value)
                   for (dict_key, dict_value) in value.iteritems())
+
+
+###############################################################################
+###############################################################################
+# Helper functions
+
def _is_spec(spec):
    """Returns true if 'spec' is a specification instance or class."""
    if isinstance(spec, MakefileSpec):
        return True
    return isinstance(spec, types.TypeType) and issubclass(spec, MakefileSpec)
+
+
def _instantiate_spec(spec):
    """Takes a specification instance or class, and returns an instance."""
    if isinstance(spec, MakefileSpec):
        return spec

    if isinstance(spec, types.TypeType) and issubclass(spec, MakefileSpec):
        return spec()

    raise TypeError("Specifications must derive from 'MakefileSpec'")
+
+
def _safe_coerce_to_lowercase(value):
    """Returns strings as lowercase, and any other types of value unchanged."""
    if not isinstance(value, types.StringTypes):
        return value
    return value.lower()
+
+
+def _list_values(values, sep):
+ """Returns list of values as '[values[0], values[1], ..., sep values[-1]]':
+
+ $ _list_values([1, 2, 3], "and")
+ "[1, 2, and 3]"
+ """
+ values = map(repr, values)
+ if len(values) > 2:
+ values = (", ".join(values[:-1]) + ",", values[-1])
+ if len(values) == 2:
+ values = (" ".join((values[0], sep, values[1])),)
+
+ return values[0]
+
+
def _get_summary_spec(specs_or_keys):
    """Returns a specification object that may be used to describe a set of
    requirements; used when a key or value does not match the possible
    specs, to describe the set of allowed values.
    """
    specs, keys = group_by_pred(_is_spec, specs_or_keys)
    if keys:
        key_spec = ValueIn(keys, description="key in {rvalue}")
        if specs:
            return Or(key_spec, *specs)
        return key_spec

    if specs:
        return Or(*specs)
    return ValueMissing()
+
+
def _get_matching_spec_or_value(value, specs, path):
    """Returns the specification object or value that matches the observed
    value; specs may be a list of specification objects and/or constant values
    allowed by the makefile. If no matching specification or value is found,
    an MakefileError is raised.
    """
    # Constant values take precedence over specification objects.
    if value in specs:
        return value

    for spec in specs:
        if _is_spec(spec) and _instantiate_spec(spec).meets_spec(value):
            return spec

    # No matching key or spec; create combined spec to raise error message.
    # Calling a spec with a non-matching value raises, so the line below
    # never returns and the assert is unreachable.
    _get_summary_spec(specs)(path, value)
    assert False  # pragma: no coverage
+
+
def _process_default_values(data, specification, path, apply_defaults):
    """Checks a subtree against a specification, verifies that required values
    have been set, and (optionally) sets values for keys where defaults have
    been specified.

    Raises MakefileError if a key marked REQUIRED_VALUE is missing from
    'data'.
    """

    for cur_key in specification:
        if (not _is_spec(cur_key)) and (cur_key not in data):
            default_value = specification[cur_key]
            default_value_from_spec = False

            # Pre-processors may wrap the actual (spec) value.
            while isinstance(default_value, PreProcessMakefile):
                data, default_value = default_value(path, data)

            if _is_spec(default_value):
                default_value = _instantiate_spec(default_value)
                if default_value.default is DEFAULT_NOT_SET:
                    continue
                elif default_value.default is REQUIRED_VALUE:
                    # Fixed typo in user-visible message ("supplified").
                    raise MakefileError("A value MUST be supplied for %r"
                                        % (_path_to_str(path + (cur_key,)),))
                default_value = default_value.default
                default_value_from_spec = True

            if apply_defaults \
                    and not isinstance(default_value, (PreProcessMakefile,
                                                       WithoutDefaults)):
                if isinstance(default_value, dict):
                    # Setting of values in the dict will be accomplished
                    # in subsequent calls to _process_default_values
                    default_value = {}
                elif isinstance(default_value, list):
                    # Lists of specs defaults to empty lists
                    if not default_value_from_spec:
                        default_value = []

                # Prevent clobbering of values when re-using sub-specs
                data[cur_key] = copy.deepcopy(default_value)
+
+
+def _path_to_str(path):
+ """Converts a path (tuple of strings) to a printable string."""
+ return ":".join(str(field) for field in path)
+
+
# Specification for command-line parameters in makefiles: either a list of
# strings / numbers, or a single string / number / None.
CLI_PARAMETERS = Or(IsListOf(IsStr, IsInt, IsFloat),
                    Or(IsStr, IsInt, IsFloat, IsNone))
diff --git a/paleomix/common/procs.py b/paleomix/common/procs.py
new file mode 100644
index 0000000..93c66fc
--- /dev/null
+++ b/paleomix/common/procs.py
@@ -0,0 +1,108 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""
+Tools used for working with subprocesses.
+"""
+import os
+import sys
+import time
+
+from subprocess import *
+
+
# Unique sentinel; may be passed as stdin/stdout/stderr to 'open_proc' to
# request redirection to/from /dev/null.
DEVNULL = object()
+
+
def open_proc(call, *args, **kwargs):
    """Wrapper around subprocess.Popen, which records the system call as a
    tuple assigned to the .call property of the Popen object. In addition, the
    'close_fds' option defaults to True, similar to Python 3+, and the DEVNULL
    value may be passed for 'stdin', 'stdout', and 'stderr' to pipe to / from
    /dev/null, with stdin defaulting to DEVNULL (set to None or another value
    to override).
    """
    # Reduce the chance of unexpected behavior
    kwargs.setdefault("close_fds", True)
    # Unless specifically requested
    kwargs.setdefault("stdin", DEVNULL)

    null_fd = None
    try:
        # Replace DEVNULL sentinels with a real descriptor for /dev/null,
        # shared between the redirected streams.
        for stream in ("stdin", "stderr", "stdout"):
            if kwargs.get(stream) is DEVNULL:
                if null_fd is None:
                    null_fd = os.open(os.devnull, os.O_RDWR)
                kwargs[stream] = null_fd

        process = Popen(call, *args, **kwargs)
        process.call = tuple(call)

        return process
    finally:
        # The child holds its own copy of the descriptor, so the parent's
        # copy can (and must) be closed again.
        if null_fd is not None:
            os.close(null_fd)
+
+
def join_procs(procs, out=sys.stderr):
    """Joins a set of Popen processes. If a process fails, the remaining
    processes are terminated. The function returns a list of return-codes,
    containing the result of each call. Status messages are written to STDERR
    by default.

    Every process must have a '.call' attribute (see 'open_proc').
    """
    sleep_time = 0.05
    commands = list(enumerate(procs))
    assert all(hasattr(cmd, "call") for (_, cmd) in commands)

    return_codes = [None] * len(commands)
    # Fixed typo in user-visible message ("Joinining").
    out.write("Joining subprocesses:\n")
    while commands:
        for (index, command) in list(commands):
            if command.poll() is not None:
                return_codes[index] = command.wait()
                commands.remove((index, command))
                # Reset the exponential back-off; something happened.
                sleep_time = 0.05

                out.write(" - Command finished: %s\n"
                          " - Return-code: %s\n"
                          % (" ".join(command.call),
                             return_codes[index]))
                out.flush()
            elif any(return_codes):
                # A process has failed (non-zero return-code); terminate
                # the commands that are still running.
                out.write(" - Terminating command: %s\n"
                          % (" ".join(command.call),))
                out.flush()

                command.terminate()
                return_codes[index] = command.wait()
                commands.remove((index, command))
                sleep_time = 0.05

        time.sleep(sleep_time)
        sleep_time = min(1, sleep_time * 2)

    if any(return_codes):
        # Fixed typo in user-visible message ("occured").
        out.write("Errors occurred during processing!\n")
        out.flush()

    return return_codes
diff --git a/paleomix/common/rtools.py b/paleomix/common/rtools.py
new file mode 100644
index 0000000..67c2661
--- /dev/null
+++ b/paleomix/common/rtools.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import paleomix.resources
+
+import paleomix.common.versions as versions
+
+from paleomix.resources import rscript
+
+
def requirement(module, checks=versions.Any(), cache={}):
    """Returns a (cached) Requirement object for the given R module, using
    'Rscript' to run the bundled 'requires.r' script.

    The mutable default 'cache' is deliberate: it memoizes Requirement
    objects across calls for the same (module, checks) key.
    """
    key = (module, checks)
    result = cache.get(key)
    if result is None:
        filename = rscript("common", "requires.r")
        # Raw string to avoid invalid '\d' escape sequences; the version
        # is expected after the fixed marker "d0fd3ea6: ".
        result = versions.Requirement(call=("Rscript", filename, module),
                                      search=r"d0fd3ea6: (\d+)\.(\d+)(?:\.(\d+))?",
                                      checks=checks,
                                      name="R module: {}".format(module))

        cache[key] = result

    return result
diff --git a/paleomix/common/sampling.py b/paleomix/common/sampling.py
new file mode 100644
index 0000000..fb0435d
--- /dev/null
+++ b/paleomix/common/sampling.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import types
+import bisect
+import random
+
+
def weighted_sampling(choices, weights, rng=random):
    """Generator yielding items from 'choices' picked at random, with a
    probability proportional to the corresponding (strictly positive)
    weight.

    'rng' must provide a 'random()' function returning a float in [0, 1);
    it defaults to the 'random' module.

    Raises ValueError if the sequences are empty, differ in length, or if
    any weight is not > 0.
    """
    choices = list(choices)
    weights = list(weights)
    if not weights or (len(weights) != len(choices)):
        raise ValueError("Choices and probabilities must be non-empty lists "
                         "of identical length, not lengths %i and %i"
                         % (len(choices), len(weights)))

    # Build cumulative weights; a uniform draw in [0, total) is then
    # mapped onto a choice by bisection.
    total = 0
    totals = []
    for (index, weight) in enumerate(weights, start=1):
        if weight <= 0:
            # Fixed typo in user-visible message ("Probablities").
            raise ValueError("Probabilities must be > 0, not %r for weight %i"
                             % (weight, index))
        total += weight
        totals.append(total)

    while True:
        rand = rng.random() * total
        yield choices[bisect.bisect_right(totals, rand)]
+
+
def reservoir_sampling(items, downsample_to, rng=random):
    """Returns a list of at most 'downsample_to' items selected uniformly
    at random from 'items', using a single pass (reservoir sampling /
    "Algorithm R").
    """
    # Python 2: accept both 'int' and 'long' sample sizes.
    if not isinstance(downsample_to, (types.IntType, types.LongType)):
        raise TypeError("Unexpected type for 'downsample_to': %r"
                        % (type(downsample_to),))
    elif downsample_to < 0:
        raise ValueError("Negative value for 'downsample_to': %i"
                         % (downsample_to,))

    reservoir = []
    for (index, item) in enumerate(items):
        if index >= downsample_to:
            # Reservoir is full; keep the new item with probability
            # downsample_to / (index + 1). Note that 'index' is
            # deliberately reused for the randomly chosen slot.
            index = rng.randint(0, index)
            if index < downsample_to:
                reservoir[index] = item
        else:
            reservoir.append(item)
    return reservoir
diff --git a/paleomix/common/sequences.py b/paleomix/common/sequences.py
new file mode 100644
index 0000000..989a7cb
--- /dev/null
+++ b/paleomix/common/sequences.py
@@ -0,0 +1,205 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Various functions relating to DNA sequence manipulation."""
+
+import itertools
+
+
# Pairs of complementary bases and ambiguous bases (IUPAC codes); each
# two-character string pairs a code with its complement. Codes that are
# their own complement (N, S, W, X) are paired with themselves.
_COMPL = [ "AT", "CG",
           "NN", "RY",
           "KM", "SS",
           "WW", "BV",
           "DH", "XX" ]
# 256-entry translation table for str.translate; any character not listed
# above maps to "N".
_COMPL_TABLE = ["N"] * 256
for (_a, _b) in _COMPL :
    # Complement both upper/lower-case bases
    for _func in (str.upper, str.lower):
        _COMPL_TABLE[ord(_func(_a))] = _func(_b)
        _COMPL_TABLE[ord(_func(_b))] = _func(_a)
_COMPL_TABLE = "".join(_COMPL_TABLE)
+
+
+
# Table of nt codes (IUPAC codes) used to encode (ambiguous) bases:
# Nomenclature for incompletely specified bases in nucleic acid sequences.
# Recommendations 1984. J Biol Chem. 1986 Jan 5;261(1):13-7.
# PubMed PMID: 2416744.
NT_CODES = [
    ["A", "A"],
    ["C", "C"],
    ["G", "G"],
    ["T", "T"],
    ["N", "N"],
    ["R", "AG"],
    ["Y", "CT"],
    ["K", "GT"],
    ["M", "AC"],
    ["S", "CG"],
    ["W", "AT"],
    ["B", "CGT"],
    ["D", "AGT"],
    ["H", "ACT"],
    ["V", "ACG"],
    ["N", "ACGT"]]

# Reverse lookup: frozenset of nucleotides -> IUPAC code. Each entry is
# also registered with a "," added, so that comma-separated genotype
# strings (e.g. "A,C") can be looked up directly (see encode_genotype).
_NT_CODES_TABLE = {}
for (_abr, _nts) in NT_CODES:
    _NT_CODES_TABLE[frozenset(_nts)] = _abr
    _NT_CODES_TABLE[frozenset(_nts + ",")] = _abr

# Forward lookup: IUPAC code -> nucleotides. Note that "N" occurs twice
# in the list above; the later ["N", "ACGT"] entry wins here.
NT_CODES = dict(NT_CODES)
+
+
# Codon -> amino-acid table for the standard genetic code; "+" holds the
# forward-strand table, while "-" is populated further below with the
# reverse-complemented codons.
CODONS = {
    "+": {
        "TTT": "Phe", "TCT": "Ser", "TAT": "Tyr", "TGT": "Cys",
        "TTC": "Phe", "TCC": "Ser", "TAC": "Tyr", "TGC": "Cys",
        "TTA": "Leu", "TCA": "Ser", "TAA": "Stop", "TGA": "Stop",
        "TTG": "Leu", "TCG": "Ser", "TAG": "Stop", "TGG": "Trp",

        "CTT": "Leu", "CCT": "Pro", "CAT": "His", "CGT": "Arg",
        "CTC": "Leu", "CCC": "Pro", "CAC": "His", "CGC": "Arg",
        "CTA": "Leu", "CCA": "Pro", "CAA": "Gln", "CGA": "Arg",
        "CTG": "Leu", "CCG": "Pro", "CAG": "Gln", "CGG": "Arg",

        "ATT": "Ile", "ACT": "Thr", "AAT": "Asn", "AGT": "Ser",
        "ATC": "Ile", "ACC": "Thr", "AAC": "Asn", "AGC": "Ser",
        "ATA": "Ile", "ACA": "Thr", "AAA": "Lys", "AGA": "Arg",
        "ATG": "Met", "ACG": "Thr", "AAG": "Lys", "AGG": "Arg",

        "GTT": "Val", "GCT": "Ala", "GAT": "Asp", "GGT": "Gly",
        "GTC": "Val", "GCC": "Ala", "GAC": "Asp", "GGC": "Gly",
        "GTA": "Val", "GCA": "Ala", "GAA": "Glu", "GGA": "Gly",
        "GTG": "Val", "GCG": "Ala", "GAG": "Glu", "GGG": "Gly"},
    "-": {}}
+
+
def complement(sequence):
    """Returns the complement of a DNA sequence (string). IUPAC ambiguity
    codes are complemented as well, case is preserved, and characters with
    no entry in the complement table are replaced with 'N'."""
    return sequence.translate(_COMPL_TABLE)
+
+
def reverse_complement(sequence):
    """Returns the reverse complement of a DNA sequence; see 'complement'
    for the handling of IUPAC codes and unknown characters."""
    return complement(sequence)[::-1]
+
+
# Populate the reverse-strand table by reverse-complementing each codon;
# only unambiguous codons (A/C/G/T) are expected on the forward strand.
for _codon, _aa in CODONS["+"].iteritems():
    assert not set(_codon) - set("ACGT")
    CODONS["-"][reverse_complement(_codon)] = _aa

# Sanity check: both strands must cover all 64 codons.
assert len(CODONS["+"]) == 64
assert len(CODONS["-"]) == 64
+
+
def encode_genotype(nucleotides):
    """Parses a string representing a set of nucleotides observed at a loci,
    and returns the corresponding IUPAC code. Commas are allowed, but are
    simply ignored if found in the string. Does not handle lower-case
    nucleotides, due to lack of clear criteria for mixed case input.
    See e.g. http://www.ebi.ac.uk/2can/tutorials/aa.html"""
    observed = frozenset(nucleotides)
    if observed not in _NT_CODES_TABLE:
        raise ValueError("Invalid input for 'encode_genotype': %s" % (repr(nucleotides), ))

    return _NT_CODES_TABLE[observed]
+
+
+
def count_nts(sequence):
    """Given a nucleotide sequence (str), this function returns
    the number of each type of nucleotide representable using
    IUPAC codes. The sequence must not contain non-IUPAC
    nucleotides, or other annotation. IUPAC nucleotides are
    handled in a case-insensitive manner."""
    counts = {}
    # Case-insensitive: count against the uppercased sequence.
    sequence = sequence.upper()
    for nucleotide in NT_CODES:
        count = sequence.count(nucleotide)
        if count:
            counts[nucleotide] = count

    # Any character not counted above is not a valid IUPAC code.
    if len(sequence) != sum(counts.itervalues()):
        raise ValueError("Sequence contains non-(IUPAC-)nucleotides: %s" % \
                         ", ".join(set(sequence) - set(counts)))

    return counts
+
+
def count_gc_diploid(sequence):
    """Given a sequence, this function returns a tuple containing the
    the total number of bases that were G/C, as well as the total number
    of bases. The sequence is assumed to represent a diploid genome, with
    the total number of bases being twice the sequence length, and
    hence IUPAC codes representing one of or both of G/C are treated as
    reflecting both strands. Thus R counts for 1, while S counts for 2.

    The sequence must only contain valid IUPAC codes, and no other
    form of annotation. Both uppercase/lowercase G/Cs are counted.
    Ambiguous sites (n/N) are not counted, neither in the number of G/C,
    nor in the total number of bases."""
    total_nts = total_gc = 0
    counts = count_nts(sequence)
    for (code, count) in counts.iteritems():
        value = 0
        if code == "N":
            # Fully ambiguous sites are excluded entirely.
            continue
        elif code in "CGcg":
            # Unambiguous G/C: counts on both strands.
            value = 2
        else:
            code_represents = NT_CODES[code]
            # Codes covering more than two bases cannot be interpreted as
            # a diploid site ('N' was already handled above).
            if (len(code_represents) > 2) and (code != 'N'):
                raise ValueError("calculate_gcp assumes diploid genome, nt code for tri-valued SNP observed: " + code)

            if 'G' in code_represents:
                value += 1
            if 'C' in code_represents:
                value += 1

        total_nts += count * 2
        total_gc += count * value

    return (total_gc, total_nts)
+
+
def split(sequence, split_by = "123"):
    """Splits a sequence by position, as specified by the 'split_by' parameter. By
    default, the function will split by codon position, and return a dictionary
    containing the keys '1', '2' and '3'.

    The 'split_by' parameter may contain any non-zero number of values, which must
    however be hashable. If a value is specified multiple times, then those positions
    are interleaved (e.g. split_by = "112" returns the first two positions in a codon
    as one sequence, as well as the last positions as one sequence."""
    if not split_by:
        raise TypeError("No partitions to split by specified")

    results = dict((key, []) for key in split_by)
    # NOTE(review): 'chain' of a single iterable is a no-op here; 'cycle'
    # alone would suffice.
    keys = itertools.chain(itertools.cycle(split_by))
    for (key, nucleotide) in itertools.izip(keys, sequence):
        results[key].append(nucleotide)

    # Collapse the per-position lists into strings.
    for key in results:
        results[key] = "".join(results[key])

    return results
diff --git a/paleomix/common/signals.py b/paleomix/common/signals.py
new file mode 100644
index 0000000..950f03b
--- /dev/null
+++ b/paleomix/common/signals.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import signal
+import types
+
+
def from_str(value):
    """Returns the signal number corresponding to a signal name such as
    'SIGTERM'; raises TypeError if 'value' is not a string, and KeyError
    if the name is unknown."""
    if isinstance(value, types.StringTypes):
        return FROM_STRING[value]

    raise TypeError("'from_str' takes strings, not %r" % (value.__class__.__name__,))
+
+
def to_str(value):
    """Returns the canonical name (e.g. 'SIGTERM') of the given signal
    number; raises TypeError if 'value' is not an integer, and KeyError
    if the number does not correspond to a known signal."""
    if not isinstance(value, (types.IntType, types.LongType)):
        # Fixed copy/paste error: the message previously claimed that
        # "'from_str' takes strings", which is wrong on both counts.
        raise TypeError("'to_str' takes integers, not %r" % (value.__class__.__name__,))
    return FROM_SIGNAL[value]
+
+
+def _get_signals():
+ signals = {}
+ for key in dir(signal):
+ if key.startswith("SIG") and not key.startswith("SIG_"):
+ signals[getattr(signal, key)] = key
+
+ # The following signals may have synonyms, so specify which to use.
+ # For example, SIGIOT is a synonym of SIGABTR on OSX.
+ for key in ("SIGABRT", "SIGCHLD", "SIGIO"):
+ value = getattr(signal, key)
+ signals[value] = key
+
+ return signals
+
# Maps signal numbers to canonical signal names (e.g. 15 -> "SIGTERM").
FROM_SIGNAL = _get_signals()
# Inverse mapping; canonical signal names to signal numbers.
FROM_STRING = dict(zip(FROM_SIGNAL.values(), FROM_SIGNAL.keys()))
diff --git a/paleomix/common/system.py b/paleomix/common/system.py
new file mode 100644
index 0000000..8bd695f
--- /dev/null
+++ b/paleomix/common/system.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import resource
+
+
def set_procname(name=os.path.basename(sys.argv[0])):
    """Attempts to set the current process-name to the given name.

    NOTE: the default value is evaluated once, at import time, and thus
    reflects sys.argv[0] as it was when this module was first loaded.
    """
    # Imported lazily, so this module stays usable when the third-party
    # 'setproctitle' package is absent; calling this function without it
    # installed raises ImportError.
    import setproctitle

    setproctitle.setproctitle(name)
+
+
def get_max_open_files():
    """Returns the maximum number of open files per process
    (soft limit) or None if this could not be determined.
    """
    # Not every platform exposes the same constant; RLIMIT_OFILE is the
    # older (BSD) spelling of RLIMIT_NOFILE.
    for attr_name in ("RLIMIT_NOFILE", "RLIMIT_OFILE"):
        limit_key = getattr(resource, attr_name, None)
        if limit_key is not None:
            try:
                return resource.getrlimit(limit_key)[0]
            except resource.error:
                return None

    return None
diff --git a/paleomix/common/testing.py b/paleomix/common/testing.py
new file mode 100644
index 0000000..b23791a
--- /dev/null
+++ b/paleomix/common/testing.py
@@ -0,0 +1,112 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import shutil
+import tempfile
+
+import nose
+from nose.tools import assert_equal
+
+from paleomix.common.fileutils import \
+ make_dirs
+
+
def assert_list_equal(iter_a, iter_b):
    """Compares two iterables after materializing both as lists, so that
    lazily generated results can be compared for equality."""
    materialized_a = list(iter_a)
    materialized_b = list(iter_b)

    assert_equal(materialized_a, materialized_b)
+
+
def with_temp_folder(func):
    """Decorator for unit-tests:
    Creates a unique temporary folder before running 'func'. The
    function is assumed to take at least one parameter, the first
    of which is assumed to represent the temporary folder."""
    import getpass

    # getpass.getuser() instead of os.getlogin(): the latter requires a
    # controlling terminal and raises OSError under cron / some CI systems;
    # getuser() is the fallback recommended by the os.getlogin() docs.
    temp_root = os.path.join(tempfile.gettempdir(), getpass.getuser())
    make_dirs(temp_root)  # Ensure that this subdirectory exists

    @nose.tools.istest
    def _wrapper(*args, **kwargs):
        temp_folder = None
        try:
            temp_folder = tempfile.mkdtemp(dir=temp_root,
                                           prefix="paleomix_unit")
            # The temporary folder is passed as the first argument.
            func(temp_folder, *args, **kwargs)
        finally:
            if temp_folder:
                shutil.rmtree(temp_folder)
    _wrapper.__name__ = func.__name__ + "__wrapped_by_with_temp_folder"
    return _wrapper
+
+
class Monkeypatch:
    """Replaces a function/object in a module with the specified wrapper
    upon entry, reverting the change upon exit from the with statement.
    A full path to the given function is required, for example
    'os.path.join'."""

    def __init__(self, path, wrapper):
        self.wrapper = wrapper

        parts = path.split(".")
        assert len(parts) > 1
        # Walk the dotted path, remembering the second-to-last object
        # (the module/namespace on which the attribute will be replaced)
        # and the original attribute value (restored on exit).
        module, target = None, sys.modules[parts[0]]
        for attr_name in parts[1:]:
            module, target = target, getattr(target, attr_name)

        self.module = module
        self.object = target
        self.name = parts[-1]

    def __enter__(self):
        setattr(self.module, self.name, self.wrapper)
        return self

    def __exit__(self, _type, _value, _traceback):
        # Always restore the original attribute, even on exceptions.
        setattr(self.module, self.name, self.object)
+
+
class SetWorkingDirectory:
    """Context manager; changes the current working directory to the path
    given in the constructor upon entry, and restores the previously used
    directory upon exiting the with statement."""

    def __init__(self, path):
        self._new_cwd = path
        self._old_cwd = None

    def __enter__(self):
        # Remember where we came from before moving.
        self._old_cwd = os.getcwd()
        os.chdir(self._new_cwd)

    def __exit__(self, _type, _value, _traceback):
        os.chdir(self._old_cwd)
+
+
def set_file_contents(fname, contents):
    """Creates (or overwrites) the file at 'fname' with 'contents'."""
    with open(fname, "w") as out_file:
        out_file.write(contents)
+
+
def get_file_contents(fname):
    """Returns the entire contents of the file at 'fname' as a string."""
    with open(fname) as in_file:
        return in_file.read()
diff --git a/paleomix/common/text.py b/paleomix/common/text.py
new file mode 100644
index 0000000..1820d04
--- /dev/null
+++ b/paleomix/common/text.py
@@ -0,0 +1,120 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import re
+import types
+import collections
+
+
class TableError(RuntimeError):
    """Raised when a table is malformed; e.g. when rows contain differing
    numbers of columns (see 'padded_table' / 'parse_padded_table')."""
    pass
+
+
# Minimum number of spaces separating adjacent columns in 'padded_table'.
_MIN_PADDING = 4
# Matches cell values that would break the padded format: any value
# containing whitespace, or the empty string.
_WHITESPACE_OR_EMPTY = re.compile(r"\s|^$")
+
+
def padded_table(table):
    """Takes a sequence of iterables, each of which represents a row in a
    table. Values are converted to string, and padded with whitespace such that
    each column is separated from its adjacent columns by at least 4 spaces.
    Empty cells or whitespace in values are not allowed.

    If a string is included instead of a row, this value is added as is. Note
    that these lines should be whitespace only, or start with a '#' if the
    resulting table is to be readable with 'parse_padded_table'.

    NOTE: Python 2 only; relies on 'map'/'zip' returning lists (len() is
    taken of a map result below) and on 'types.StringTypes'.
    """
    str_rows = []
    nsizes, sizes = None, []
    for row in table:
        # Plain strings pass through untouched; only real rows are padded.
        if not isinstance(row, types.StringTypes):
            row = map(str, row)
            # Always true for the first row, since nsizes is None there
            # (int != None is a valid comparison in Python 2).
            if (len(row) != nsizes):
                if nsizes is not None:
                    raise TableError("Malformed table; rows with different "
                                     "number of columns: %r" % row)
                nsizes = len(row)
                sizes = [0] * nsizes
            # Element-wise maximum of the column widths seen so far and
            # the widths of this row's cells.
            sizes = map(max, zip(sizes, map(len, row)))
        str_rows.append(row)

    sizes = [(size + _MIN_PADDING) for size in sizes]
    for row in str_rows:
        if not isinstance(row, types.StringTypes):
            # Left-justify each cell to its padded column width; trailing
            # padding after the final column is stripped.
            row = "".join(field.ljust(padding)
                          for (field, padding) in zip(row, sizes)).rstrip()
        yield row
+
+
def parse_padded_table(lines, header=None):
    """Parses a padded table generated using 'padded_table', or any table which
    consists of a fixed number of columns separated by whitespace, with no
    whitespace in the cells. Empty lines and lines starting with '#' (comments)
    are ignored. Each row is returned as a dictionary, using the values found
    in the first row (or the supplied 'header') as keys.
    """
    # Bug fix: when 'header' is supplied by the caller, the branch below
    # that assigned 'nheader' never ran, and the first data row raised a
    # NameError. Compute the expected column count up front instead.
    nheader = None if header is None else len(header)
    for line in lines:
        stripped = line.strip()
        if not stripped or stripped.startswith("#"):
            continue
        elif header is None:
            # First non-comment line provides the column names.
            header = stripped.split()
            nheader = len(header)
            continue

        fields = stripped.split()
        if len(fields) != nheader:
            raise TableError("Malformed table; #columns does not match header:"
                             " %r vs %r" % (header, fields))

        yield dict(zip(header, fields))
+
+
def parse_lines(lines, parser):
    """Parses a set of lines using the supplied callable:
    lambda (line, length): ...

    Empty lines and lines whose first non-whitespace character is '#'
    are skipped. Supports the parser functions available in 'pysam':
    asGTF, asBED, etc.
    """
    # The builtin 'callable' (available on py2 and py3.2+) replaces
    # isinstance(..., collections.Callable); the latter is a deprecated
    # alias that was removed in Python 3.10.
    if not callable(parser):
        raise TypeError("'parser' must be a callable, not %r"
                        % parser.__class__.__name__)

    for line in lines:
        stripped = line.lstrip()
        if stripped and not stripped.startswith("#"):
            # Only the right side is stripped here; leading whitespace is
            # preserved in the value handed to the parser.
            stripped = line.rstrip()
            yield parser(stripped, len(stripped))
+
+
def parse_lines_by_contig(lines, parser):
    """Reads the lines of a text file, parsing each line with the specified
    parser, and aggregating the resulting records by their 'contig'
    property; returns a dict mapping each contig to a list of records.
    """
    records_by_contig = {}
    for record in parse_lines(lines, parser):
        records_by_contig.setdefault(record.contig, []).append(record)

    return records_by_contig
diff --git a/paleomix/common/timer.py b/paleomix/common/timer.py
new file mode 100644
index 0000000..e5aa606
--- /dev/null
+++ b/paleomix/common/timer.py
@@ -0,0 +1,111 @@
+#!/usr/bin/python -3
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import sys
+import time
+
+from paleomix.common.utilities import fragment, cumsum
+
+
# Status line used while positional progress can be estimated (i.e. the
# BAM file is coordinate-sorted); includes progress and time remaining.
_DESC = "Processed {Records} records ({Progress}) in {Time}, est. {Remaining} left. Last {RecordsDelta} records in {TimeDelta}, now at {Contig}: {Position} ..."
# Fallback / final status line; no positional information included.
_FINAL = "Processed {Records} records in {Time}. Last {RecordsDelta} records in {TimeDelta} ..."
+
class BAMTimer:
    """Progress reporter for iteration over BAM files.

    Call 'increment' once per record processed and 'finalize' when done;
    roughly every 'step' records a status line is written to 'out'. When
    the file is coordinate-sorted (per the @HD SO header tag), the last
    read's position is used to estimate overall progress and remaining time.

    NOTE(review): 'bamfile' and 'read' appear to be pysam objects (uses
    .header / .references / .lengths and .is_unmapped / .pos / .tid) --
    confirm against callers.
    """

    def __init__(self, bamfile, desc = None, step = 1e6, out = sys.stderr):
        # 'desc' is an optional prefix for status lines; 'step' is the
        # number of records between status lines.
        self._bam = None
        self._out = out
        self._desc = desc
        self._step = int(step)
        self._count = 0
        self._last_count = 0
        self._last_time = time.time()
        self._start_time = self._last_time
        # Below any valid fraction, so the first mapped read always passes
        # the sortedness check in _print.
        self._last_fract = -1.0

        self._bam_references = None

        self._total = 0.0
        self._counts = []
        # Progress can only be estimated for coordinate-sorted files.
        if bamfile and bamfile.header.get("HD", {}).get("SO", "NA") == "coordinate":
            self._bam = bamfile
            self._bam_references = self._bam.references

            lengths = bamfile.lengths
            # 'or 1.0' guards against division by zero for empty references.
            self._total = float(sum(lengths)) or 1.0
            # _counts[tid] holds the number of bases preceding reference
            # 'tid', so read.pos + _counts[read.tid] is a global coordinate.
            self._counts.append(0)
            self._counts.extend(cumsum(lengths))

    def increment(self, count = 1, read = None):
        """Adds 'count' records to the tally; prints a status line when at
        least 'step' records have accumulated since the last line. Returns
        self, to allow chaining."""
        self._count += count
        if (self._count - self._last_count) >= self._step:
            current_time = time.time()
            self._print(current_time, read)
            self._last_time = current_time
            self._last_count = self._count
        return self

    def finalize(self):
        """Prints a final summary line (without positional estimates)."""
        self._print(time.time(), None)

    def _print(self, current_time, read):
        # Prints a single status line; positional progress is included only
        # when a mapped read is given and the file still looks sorted.
        desc = _FINAL
        contig, position, progress, remaining = "NA", "NA", "NA", "NA"
        if read and not read.is_unmapped and self._bam:
            # Fraction of total reference bases preceding this read.
            fraction = ((read.pos + self._counts[read.tid]) / self._total)
            if fraction >= self._last_fract:
                self._last_fract = fraction
                contig = self._bam_references[read.tid]
                position = self._format_int(read.pos + 1)
                progress = "%.2f%%" % (fraction * 100,)

                current_running = current_time - self._start_time
                # NOTE(review): if the first read maps to position 0 of the
                # first contig, 'fraction' is 0.0 and this division raises
                # ZeroDivisionError -- confirm whether callers can hit this.
                remaining = self._format_time(current_running / fraction - current_running)
                desc = _DESC
            else:
                # Positions went backwards; the @HD SO tag was wrong, so
                # stop estimating progress from here on.
                print("File appears to be unsorted, cannot estimate progress ...", file = self._out)
                self._bam = None

        if self._desc:
            print("%s: " % self._desc, end = "", file = self._out)

        print(desc.format(Records = self._format_int(self._count),
                          RecordsDelta = self._format_int(self._count - self._last_count),
                          Time = self._format_time(current_time - self._start_time),
                          TimeDelta = self._format_time(current_time - self._last_time),
                          Contig = contig,
                          Position = position,
                          Progress = progress,
                          Remaining = remaining),
              file = self._out)

    def _format_time(self, ftime):
        # Formats a duration in seconds as "HH:MM:SSs"; wraps at 24 hours.
        utc = time.gmtime(ftime)
        return "%02i:%02i:%02is" % (utc.tm_hour, utc.tm_min, utc.tm_sec)

    def _format_int(self, value):
        # Inserts thousands separators (e.g. 1234567 -> '1,234,567') by
        # fragmenting the reversed digit string into groups of three.
        return (",".join(fragment(3, str(value)[::-1])))[::-1]
diff --git a/paleomix/common/utilities.py b/paleomix/common/utilities.py
new file mode 100644
index 0000000..1c640e3
--- /dev/null
+++ b/paleomix/common/utilities.py
@@ -0,0 +1,312 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import binascii
+import copy
+import cPickle
+import heapq
+import itertools
+import pickle
+import types
+
+
+def _safe_coerce(cls):
+ def _do_safe_coerce(value):
+ if isinstance(value, (types.StringTypes, types.DictType)):
+ return cls((value,))
+
+ try:
+ return cls(value)
+ except TypeError:
+ return cls((value,))
+
+ _do_safe_coerce.__doc__ = \
+ """Takes a value which be a single object, or an an iterable
+ and returns the content wrapped in a {0}. In the case of strings,
+ and dictionaries the original string object is returned in a {0},
+ and not as a {0} of chars. A TypeError is raised if this is not
+ possible (e.g. dict in frozenset).""".format(cls.__name__)
+ _do_safe_coerce.__name__ = \
+ "safe_coerce_to_{0}".format(cls.__name__)
+
+ return _do_safe_coerce
+
# Public coercion helpers; see the generated docstrings on each function.
safe_coerce_to_tuple = _safe_coerce(tuple)
safe_coerce_to_frozenset = _safe_coerce(frozenset)
+
+
def try_cast(value, cast_to):
    """Returns cast_to(value) when the conversion succeeds; returns
    'value' unchanged when it raises a ValueError or TypeError."""
    try:
        result = cast_to(value)
    except (ValueError, TypeError):
        result = value

    return result
+
+
def crc32(data):
    """Returns the CRC32 checksum of 'data' as an unsigned 32-bit value."""
    # binascii.crc32 may return signed values (on Python 2); reduce
    # modulo 2**32 to normalize into the unsigned range.
    return binascii.crc32(data) % (1 << 32)
+
+
def set_in(dictionary, keys, value):
    """Traverses nested dictionaries using 'keys', creating intermediate
    dictionaries where needed, and assigns 'value' at the inner-most
    level. Calling set_in(d, [X, Y, Z], v) is thus equivalent to
    d.setdefault(X, {}).setdefault(Y, {})[Z] = v

    Behavior on non-dictionaries is undefined."""
    keys = list(keys)
    if not keys:
        raise ValueError("No keys passed to 'set_in'!")

    # Walk down to the second-to-last level, materializing missing dicts.
    for key in keys[:-1]:
        dictionary = dictionary.setdefault(key, {})

    dictionary[keys[-1]] = value
+
+
def get_in(dictionary, keys, default=None):
    """Traverses nested dictionaries using 'keys' and returns the value
    found at the inner-most level; 'default' is returned whenever a key
    is missing along the way. Calling get_in(d, [X, Y]) is equivalent to
    chained .get() calls with a shared default.

    Behavior on non-dictionaries is undefined."""
    keys = list(keys)
    for key in keys[:-1]:
        if key not in dictionary:
            return default
        dictionary = dictionary[key]

    return dictionary.get(keys[-1], default)
+
+
def split_before(iterable, pred):
    """Takes a sequence and starts a new group before every value for
    which pred(value) is truthy. For example,
    split_before(range(10), lambda x: x % 2 == 0) yields the groups
    [0, 1], [2, 3], [4, 5], [6, 7], [8, 9]."""
    group = []
    for value in iterable:
        # A predicate hit only starts a new group once the current one
        # is non-empty; leading hits do not produce an empty group.
        if pred(value) and group:
            yield group
            group = []
        group.append(value)

    if group:
        yield group
+
+
def is_strictly_increasing(lst):
    """Returns True if every element of 'lst' is strictly greater than the
    preceding element; vacuously True for lists of fewer than 2 items."""
    return all(prev < cur for (prev, cur) in zip(lst, lst[1:]))
+
+
# Copied from the Python 'itertools' module documentation
def grouper(size, iterable, fillvalue=None):
    "grouper(3, 'ABCDEFG', 'x') --> ABC DEF Gxx"
    # Replicating the same iterator 'size' times makes izip_longest pull
    # 'size' consecutive items into each output tuple; the final tuple is
    # padded with 'fillvalue'. (Python 2 only: izip_longest.)
    args = [iter(iterable)] * size
    return itertools.izip_longest(fillvalue=fillvalue, *args)
+
+
def group_by_pred(pred, iterable):
    """Partitions the items of a sequence into two lists; the first
    containing every item for which 'pred' is truthy, and the second
    containing the remaining items."""
    matching, rest = [], []
    for item in iterable:
        target = matching if pred(item) else rest
        target.append(item)

    return matching, rest
+
+
def fragment(size, lstlike):
    """Faster alternative to grouper for lists/strings; yields successive
    slices of length 'size' (the final slice may be shorter)."""
    for start in range(0, len(lstlike), size):
        yield lstlike[start:start + size]
+
+
def cumsum(lst, initial=0):
    """Yields the cumulative sums of the values in an iterable, starting
    from (but not including) the specified initial value."""
    running_total = initial
    for value in lst:
        running_total += value
        yield running_total
+
+
def fast_pickle_test(obj):
    """Attempts to pickle an object, raising a PicklingError
    if the object is unpicklable. This function uses cPickle
    to determine if the object is pickable, but 'pickle' to
    generate the exception, since the python module produces
    more informative error messages. (Python 2 only: cPickle.)"""
    try:
        cPickle.dumps(obj)
    except (TypeError, cPickle.PicklingError):
        # Re-pickle with the (slower) pure-python module, which is
        # expected to raise a more informative exception.
        pickle.dumps(obj)
        # Reached only if 'pickle' unexpectedly succeeds where cPickle
        # failed; fail loudly to make the inconsistency visible.
        assert False # pragma: no coverage
+
+
def fill_dict(destination, source):
    """Returns a deep copy of 'destination' in which any keys missing
    relative to 'source' have been filled in, recursively, with deep
    copies of the corresponding values from 'source'."""
    if not isinstance(destination, dict) or not isinstance(source, dict):
        raise TypeError("Non-dictionary parameters in 'fill_dict'")

    def _merge_missing(dst, src):
        # Recurse where both sides hold a dict for the same key; otherwise
        # only copy values for keys entirely absent from 'dst'.
        for (key, value) in src.items():
            if isinstance(value, dict) and isinstance(dst.get(key), dict):
                _merge_missing(dst[key], value)
            elif key not in dst:
                dst[key] = value
        return dst

    # Deep copies ensure neither input is mutated, and that the result
    # shares no structure with 'source'.
    return _merge_missing(copy.deepcopy(destination), copy.deepcopy(source))
+
+
def chain_sorted(*sequences, **kwargs):
    """Chains together sorted sequences, and yields the contents
    in the same order, such that the result is also a sorted sequence.
    The function accepts a 'key'-function keyword, following sort().

    chain_sorted is intended for a few long sequences, and not many short
    sequences. Behavior is undefined if the sequences are not sorted.

    Example:
    >>> tuple(chain_sorted((1, 3, 5), (0, 2, 4)))
    (0, 1, 2, 3, 4, 5)
    """
    # Emulates a keyword-only argument (Python 2 lacks the syntax).
    key = kwargs.pop('key', None)
    if kwargs:
        raise TypeError("chain_sorted expected keyword 'key', got %r"
                        % (', '.join(kwargs)))

    # Heap of (key, index, value, iterator) tuples, one per non-empty
    # sequence. 'index' breaks key ties, so the heap never has to compare
    # the values (or iterators) themselves, and equal keys are yielded in
    # argument order.
    iterators = []
    for index, sequence_iter in enumerate(map(iter, sequences)):
        try:
            # Python 2 iterator protocol (.next(); renamed __next__ in 3).
            current = sequence_iter.next()
            key_value = current if key is None else key(current)

            iterators.append((key_value, index, current, sequence_iter))
        except StopIteration:
            # Empty sequences simply do not participate.
            pass

    heapq.heapify(iterators)

    # Cache builtins/attributes used in the hot loop as locals.
    _len, _heappop, _heapreplace = len, heapq.heappop, heapq.heapreplace

    while _len(iterators) > 1:
        # The heap root holds the smallest pending value overall.
        last_key_value, index, current, sequence_iter = iterators[0]
        yield current

        for current in sequence_iter:
            key_value = current if key is None else key(current)

            # Optimization for runs of repeated values
            if key_value != last_key_value:
                _heapreplace(iterators,
                             (key_value, index, current, sequence_iter))
                break
            else:
                yield current
        else:
            # No items remaining in top iterator
            _heappop(iterators)

    if _len(iterators) == 1:
        # Single sequence left; drain it without heap overhead.
        _, _, current, sequence_iter = iterators[0]

        yield current
        for current in sequence_iter:
            yield current
+
+
class Immutable(object):
    """Mixin for classes whose attributes are fixed at construction time;
    attributes are passed as keyword arguments to __init__ and cannot be
    reassigned or deleted afterwards. Note that the attribute values
    themselves may still be mutable objects."""

    def __init__(self, **kwargs):
        object.__init__(self)
        # Bypass our own (raising) __setattr__ to populate the instance.
        for (key, value) in kwargs.items():
            object.__setattr__(self, key, value)

    def __setattr__(self, _name, _value):
        raise NotImplementedError("Object is immutable")

    def __delattr__(self, _name):
        raise NotImplementedError("Object is immutable")
+
+
class TotallyOrdered(object):
    """Mixin deriving the full set of rich comparisons from the subclass'
    implementation of the less-than operator (__lt__); __lt__ should
    return NotImplemented when the other object is of a different type.

    A total order is assumed:
    http://en.wikipedia.org/wiki/Total_order
    """

    def __lt__(self, other):
        raise NotImplementedError("__lt__ must be implemented!")

    def __eq__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        # In a total order, a == b iff neither a < b nor b < a.
        return not (self < other) and not (other < self)

    def __ne__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        return not (self == other)

    def __le__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        return not (other < self)

    def __ge__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        return not (self < other)

    def __gt__(self, other):
        if not isinstance(other, type(self)):
            return NotImplemented
        return (other < self)

    # Shut up warning; if hashable, then the subclass will have
    # to implement the __hash__ member function.
    __hash__ = None
diff --git a/paleomix/common/vcffilter.py b/paleomix/common/vcffilter.py
new file mode 100644
index 0000000..51a25d8
--- /dev/null
+++ b/paleomix/common/vcffilter.py
@@ -0,0 +1,411 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import with_statement
+
+import sys
+import optparse
+import collections
+
+import pysam
+
+import paleomix.common.vcfwrap as vcfwrap
+
+
# Sentinel position greater than any real position; used when flushing
# the final chunk of records (see _trim_chunk)
_INF = float("inf")
# Rough number of records to keep in memory at once
_CHUNK_SIZE = 10000
+
+
def add_varfilter_options(parser):
    """Adds the varFilter command-line options to an optparse.OptionParser,
    split into two groups: options novel to this implementation, and options
    derived from 'vcfutils.pl varFilter'. Filter IDs written to the FILTER
    column correspond to the short option names (see describe_filters)."""
    group = optparse.OptionGroup(parser, "varFilter: Novel options")
    group.add_option("--homozygous-chromosome", action="append", default=[],
                     help="Filter heterozygous SNPs observed on this "
                          "chromosome (e.g. chrX) %default.")
    group.add_option("-q", "--min-quality", type=int, default=30,
                     help="Minimum Phred score recorded in the QUAL column "
                          "[%default]")
    group.add_option("-f", "--min-allele-frequency", type=float, default=0.2,
                     help="Minimum frequency of the alleles at heterozygous "
                          "sites [%default]. WARNING: A pileup must be "
                          "provided for multi-allelic sites to be filtered!")
    group.add_option("-b", "--pileup", default=None,
                     help="Tabix indexed pileup for multi-allelic sites. This "
                          "is required for such sites to be filtered using "
                          "the --min-allele-frequency filter.")
    # NOTE: 'ambigious' [sic] is part of the public option name; fixing the
    # spelling would break existing command-lines.
    group.add_option("-k", "--keep-ambigious-genotypes",
                     default=False, action="store_true",
                     help="Keep SNPs without a most likely genotype "
                          "(based on PL) [%default]")
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "varFilter: Derived options")
    # Options adapted from varFilter
    group.add_option("-Q", "--min-mapping-quality", type=int, default=10,
                     help="Minimum RMS mapping quality for SNPs [%default]")
    group.add_option("-d", "--min-read-depth", type=int, default=8,
                     help="Minimum read depth [%default]")
    group.add_option("-D", "--max-read-depth", type=int, default=10000000,
                     help="Maximum read depth [%default]")
    group.add_option("-a", "--min-num-alt-bases", type=int, default=2,
                     help="Minimum number of alternative bases observed for "
                          "variants [%default]")
    group.add_option("-w", "--min-distance-to-indels", type=int, default=3,
                     help="SNP within INT bp around a gap to be filtered "
                          "[%default]")
    group.add_option("-W", "--min-distance-between-indels",
                     type=int, default=10,
                     help="Window size for filtering adjacent gaps "
                          "[%default]")
    group.add_option("-1", "--min-strand-bias", type=float, default=1e-4,
                     help="Min P-value for strand bias (given PV4) "
                          "[%default]")
    group.add_option("-2", "--min-baseq-bias", type=float, default=1e-100,
                     help="Min P-value for baseQ bias (given PV4) "
                          "[%default]")
    group.add_option("-3", "--min-mapq-bias", type=float, default=0,
                     help="Min P-value for mapQ bias (given PV4) "
                          "[%default]")
    group.add_option("-4", "--min-end-distance-bias", type=float, default=1e-4,
                     help="Min P-value for end distance bias (given PV4) "
                          "[%default]")
    parser.add_option_group(group)
+
+
def describe_filters(options):
    """Returns a dict mapping each filter ID (as written to the FILTER column,
    e.g. 'q:30') to a human-readable description; the IDs embed the cutoff
    values currently set in 'options', matching the IDs produced by the
    _mark_as_filtered calls below."""
    return {
        "HET": "Heterozygous SNPs observed on homozygous chromosome (e.g. chrX)",
        "q:%i" % options.min_quality: "Minimum Phred score recorded in the QUAL column",
        "f:%.4f" % options.min_allele_frequency: "Minimum frequency of the alleles at heterozygous sites",
        "k": "SNPs without a most likely genotype (based on PL)",
        "Q:%i" % options.min_mapping_quality: "Minimum RMS mapping quality",
        "d:%i" % options.min_read_depth: "Minimum read depth",
        "D:%i" % options.max_read_depth: "Maximum read depth",
        "a:%i" % options.min_num_alt_bases: "Minimum number of alternative bases observed for variants",
        "w:%i" % options.min_distance_to_indels: "SNP within INT bp around a gap",
        "W:%i" % options.min_distance_between_indels: "Indel within INT bp of another indel",
        "1:%e" % options.min_strand_bias: "Min P-value for strand bias (given PV4)",
        "2:%e" % options.min_baseq_bias: "Min P-value for baseQ bias (given PV4)",
        "3:%e" % options.min_mapq_bias: "Min P-value for mapQ bias (given PV4)",
        "4:%e" % options.min_end_distance_bias: "Min P-value for end distance bias (given PV4)",
    }
+
+
def filter_vcfs(options, vcfs):
    """Filters an iterable of VCF records, yielding records with the FILTER
    column updated; records passing all filters are marked 'PASS'. Records
    are processed in chunks (see _CHUNK_SIZE), and only records far enough
    from the end of the current chunk are released (see _trim_chunk), so
    that indel-proximity filters see sufficient context."""
    vcfs = iter(vcfs)
    chunk = collections.deque()
    filename = options.pileup
    min_freq = options.min_allele_frequency

    with AlleleFrequencies(filename, min_freq) as frequencies:
        while _read_chunk(vcfs, chunk):
            chunk = _filter_chunk(options, chunk, frequencies)
            for vcf in _trim_chunk(options, chunk):
                if vcf.filter == ".":
                    vcf.filter = "PASS"

                yield vcf
+
+
class AlleleFrequencies:
    """Checks the frequency of the minor allele at variant sites, using counts
    collected from a tabix-indexed pileup. If no pileup filename is provided,
    or the frequency cutoff is zero, every site is considered valid, and a
    warning is printed for sites that would have needed the pileup."""

    # Return codes for 'frequency_is_valid'; NA indicates that no counts
    # for either allele could be recovered from the pileup
    VALID, INVALID, NA = range(3)

    def __init__(self, filename, min_freq):
        assert min_freq >= 0
        self._min_freq = min_freq
        self._handle = None

        # The pileup is only opened when it can actually be used
        if filename and min_freq:
            self._handle = pysam.Tabixfile(filename)
            self.frequency_is_valid = self._frequency_is_valid
        else:
            self.frequency_is_valid = self._frequency_is_always_valid

    def _frequency_is_always_valid(self, contig, position, _ref, _first, _second):
        # Fallback used when no pileup is available; the site cannot be
        # checked, so warn (if a cutoff was requested) and accept it.
        if self._min_freq:
            sys.stderr.write("WARNING: Multi-allelic SNP found at %s:%i, but --pileup has not been specified.\n" \
                             % (contig, position + 1))
        return self.VALID

    def _frequency_is_valid(self, contig, position, ref, first, second):
        assert self._handle

        # For indel alleles, counts are keyed by the length difference
        # relative to the reference (matching the keys built in _fetch)
        if any((len(nt) > 1) for nt in (ref, first, second)):
            assert len(first) != len(second)
            first = len(first) - len(ref)
            second = len(second) - len(ref)

        counts = self._fetch(contig, position)
        n_first = counts.get(first, 0)
        n_second = counts.get(second, 0)

        n_minor = min(n_first, n_second)
        n_major = max(n_first, n_second)
        if not n_major:
            # Neither allele was observed in the pileup
            return self.NA
        elif n_minor / float(n_minor + n_major) < self._min_freq:
            return self.INVALID
        return self.VALID

    def close(self):
        """Closes the pileup handle (if open); afterwards all sites are
        considered valid, as if no pileup had been provided."""
        if self._handle:
            self._handle.close()
            self._handle = None
            self.frequency_is_valid = self._frequency_is_always_valid

    def __enter__(self):
        return self

    def __exit__(self, _exc_type, _exc_value, _traceback):
        self.close()

    def _fetch(self, contig, position):
        """Returns a dict of observed allele counts at the given (0-based)
        position, parsed from the pileup; single nucleotides are keyed by
        base, indels by their signed length. Raises RuntimeError if the
        position is missing from, or mismatched in, the pileup."""
        fetched = False
        for line in self._handle.fetch(contig, position, position + 1):
            fields, fetched = line.split("\t"), True
            assert len(fields) == 6
            break

        if not fetched:
            raise RuntimeError("Pileup did not contain position %s:%i, please rebuild." \
                               % (contig, position + 1))
        elif (fields[0] != contig) or (int(fields[1]) != position + 1):
            raise RuntimeError("Got wrong record (%s:%i vs %s:%s), is index corrupt?" \
                               % (contig, position + 1, fields[0], fields[1]))

        counts = {}
        # Parse the pileup base column; the string is reversed so that
        # characters can be consumed front-to-back using 'pop'
        bases = list(fields[4][::-1].upper())
        ref = fields[2]
        while bases:
            current = bases.pop()
            if current in "ACGTN":
                counts[current] = counts.get(current, 0) + 1
            elif current in ",.":
                # ',' / '.' denote a match against the reference base
                counts[ref] = counts.get(ref, 0) + 1
            elif current in "+-":
                # Indel: sign, digits giving the length, then the bases
                indel_length = [current]
                while bases[-1].isdigit():
                    indel_length.append(bases.pop())
                indel_length = int("".join(indel_length))

                # Skip the inserted / deleted bases themselves
                for _ in xrange(abs(indel_length)):
                    bases.pop()

                counts[indel_length] = counts.get(indel_length, 0) + 1
            elif current == "*":
                # Placeholder for a deleted base; counted as a 1 bp deletion
                counts[-1] = counts.get(-1, 0) + 1
            elif current == "^":
                # Start-of-read marker; skip the following quality character
                bases.pop()
            elif current != "$":
                raise RuntimeError("Error parsing pileup (unexpected char '%s'): %s" \
                                   % (current, repr(line)))
        return counts
+
+
def _read_chunk(vcfs, chunk):
    """Appends up to _CHUNK_SIZE records from 'vcfs' onto 'chunk'; when the
    iterator is exhausted, a terminal None is appended as an end-of-input
    marker. Returns True if the chunk contains more than a single entry."""
    try:
        while len(chunk) < _CHUNK_SIZE:
            chunk.append(vcfs.next())
    except StopIteration:
        chunk.append(None)

    return len(chunk) > 1
+
+
def _trim_chunk(options, chunk):
    """Pops and yields records from the front of 'chunk' that cannot be
    affected by records following the chunk, i.e. records further than the
    largest indel-distance cutoff from the last record in the chunk. A
    terminal None (end-of-input marker) causes every record to be yielded."""
    min_distance = max(options.min_distance_between_indels,
                       options.min_distance_to_indels)

    if not chunk:
        return
    elif chunk[-1] is None:
        # End of input; use a sentinel contig (never equal to a real contig
        # name) and an infinite position so that every record is flushed
        end_chr = "!@#$%^&*()_+"
        end_pos = _INF
        chunk.pop()
    else:
        end_chr = chunk[-1].contig
        end_pos = chunk[-1].pos

    while chunk:
        vcf = chunk[0]
        if (vcf.contig == end_chr):
            # 'length' will become a too large value for heterozygous SNPs,
            # but it is faster than having to parse every position, and has
            # no effect on the final results.
            length = max(len(vcf.ref), len(vcf.alt))
            if (vcf.pos + length + min_distance) >= end_pos:
                break

        yield chunk.popleft()
+
+
def _group_indels_near_position(indels, distance):
    """Returns a dictionary of positions that are either directly covered by, or
    adjacent to indels, given some arbitrary distance. For each position, a list
    of adjacent/overlapping indels are provided. For a zero distance, an empty
    mapping is returned (the defaultdict still yields [] for any lookup)."""
    positions = collections.defaultdict(list)
    if not distance:
        return positions

    for vcf in indels:
        # The number of bases covered (excluding the prefix)
        # For ambigious indels (e.g. in low complexity regions), this ensures
        # that the entire region is considered. Note that we do not need to
        # consider the alternative sequence(s)
        length = len(vcf.ref) - 1

        # Inclusive start/end positions for bases that should be blacklisted
        # Note that vcf.pos is the base just before the insertion/deletion
        start = vcf.pos + 1 - distance
        end = vcf.pos + 1 + distance + length

        for position in xrange(start, end + 1):
            positions[position].append(vcf)

    return positions
+
+
def _select_best_indel(indels):
    """Returns the indel with the highest QUAL score; when several indels
    share the top score, the one at the earliest position is selected."""
    # The negated position makes earlier indels compare greater, so they
    # win ties between indels of equal quality.
    return max(indels, key=lambda indel: (float(indel.qual), -indel.pos))
+
+
def _filter_by_indels(options, chunk):
    """Filters a list of SNPs and Indels, such that no SNP is closer to
    an indel than the value set in options.min_distance_to_indels, and
    such that no two indels too close. If two or more indels are within
    this distance, the indel with the highest QUAL score is retained. When
    no unique highest QUAL score exists, an arbitrary indel is retained
    among those indels with the highest QUAL score. SNPs are filtered
    based on prefiltered Indels. Records are modified in-place."""
    indels = [vcf for vcf in chunk if vcfwrap.is_indel(vcf)]

    # Positions blacklisted for other indels vs. positions blacklisted
    # for SNPs use separate (possibly different) distance cutoffs
    distance_between = options.min_distance_between_indels
    indel_blacklist = _group_indels_near_position(indels, distance_between)
    distance_to = options.min_distance_to_indels
    snp_blacklist = _group_indels_near_position(indels, distance_to)

    for vcf in chunk:
        if vcfwrap.is_indel(vcf):
            # Default [vcf] keeps this indel when no distance cutoff is set
            blacklisted = indel_blacklist.get(vcf.pos + 1, [vcf])
            if vcf is not _select_best_indel(blacklisted):
                _mark_as_filtered(vcf, "W:%i" % distance_between)
        elif (vcf.alt != ".") and (vcf.pos in snp_blacklist):
            # TODO: How to handle heterozygous SNPs near
            _mark_as_filtered(vcf, "w:%i" % distance_to)
+
+
def _filter_by_properties(options, vcfs, frequencies):
    """Filters a list of SNPs/indels based on the various properties recorded in
    the info column, and others. This mirrors most of the filtering carried out
    by vcfutils.pl varFilter. Records are modified in-place via
    _mark_as_filtered."""
    for vcf in vcfs:
        if float(vcf.qual) < options.min_quality:
            _mark_as_filtered(vcf, "q:%i" % options.min_quality)

        # Split the INFO column into a key -> value (or None) mapping
        properties = {}
        for field in vcf.info.split(";"):
            if "=" in field:
                key, value = field.split("=")
            else:
                key, value = field, None
            properties[key] = value

        read_depth = float(properties["DP"])
        if options.min_read_depth > read_depth:
            _mark_as_filtered(vcf, "d:%i" % options.min_read_depth)
        elif options.max_read_depth < read_depth:
            _mark_as_filtered(vcf, "D:%i" % options.max_read_depth)

        if "MQ" in properties:
            # RMS mapping quality
            if float(properties["MQ"]) < options.min_mapping_quality:
                _mark_as_filtered(vcf, "Q:%i" % options.min_mapping_quality)

        if "PV4" in properties:
            # P-values for strand / baseQ / mapQ / end-distance bias
            pv4 = [float(value) for value in properties["PV4"].split(",")]
            if (pv4[0] < options.min_strand_bias):
                _mark_as_filtered(vcf, "1:%e" % options.min_strand_bias)
            if (pv4[1] < options.min_baseq_bias):
                _mark_as_filtered(vcf, "2:%e" % options.min_baseq_bias)
            if (pv4[2] < options.min_mapq_bias):
                _mark_as_filtered(vcf, "3:%e" % options.min_mapq_bias)
            if (pv4[3] < options.min_end_distance_bias):
                _mark_as_filtered(vcf, "4:%e" % options.min_end_distance_bias)

        if vcf.alt != ".":
            # DP4: forward / reverse counts for reference and alt bases
            ref_fw, ref_rev, alt_fw, alt_rev = map(int, properties["DP4"].split(","))
            if (alt_fw + alt_rev) < options.min_num_alt_bases:
                _mark_as_filtered(vcf, "a:%i" % options.min_num_alt_bases)

            ml_genotype = vcfwrap.get_ml_genotype(vcf)
            if (ml_genotype == ("N", "N")) and not options.keep_ambigious_genotypes:
                # No most likely genotype
                _mark_as_filtered(vcf, "k")

            if (ml_genotype[0] != ml_genotype[1]):
                # Heterozygous site
                if vcf.contig in options.homozygous_chromosome:
                    _mark_as_filtered(vcf, "HET")

                # Filter by frequency of minor allele
                if vcf.ref in ml_genotype:
                    # Bi-allelic site; counts can be taken directly from DP4
                    n_minor = min(ref_fw + ref_rev, alt_fw + alt_rev)
                    n_major = max(ref_fw + ref_rev, alt_fw + alt_rev)

                    if (n_minor / float(n_minor + n_major)) < options.min_allele_frequency:
                        _mark_as_filtered(vcf, "f:%.4f" % options.min_allele_frequency)
                else:
                    # Multi-allelic site; counts must come from the pileup
                    state = frequencies.frequency_is_valid(vcf.contig, vcf.pos, vcf.ref, *ml_genotype)
                    if state is frequencies.INVALID:
                        _mark_as_filtered(vcf, "f:%.4f" % options.min_allele_frequency)
                    elif state is frequencies.NA:
                        if _mark_as_filtered(vcf, "F:%.4f" % options.min_allele_frequency):
                            sys.stderr.write("WARNING: Could not determine allele-counts for SNP at %s:%s, filtering ...\n" % (vcf.contig, vcf.pos + 1))
+
+
def _filter_chunk(options, chunk, frequencies):
    """Applies indel- and property-based filters to the records in 'chunk'
    (in-place), temporarily removing and then restoring a terminal None
    (end-of-input marker) if present. Returns the chunk."""
    at_end = False
    if chunk[-1] is None:
        at_end = True
        chunk.pop()

    _filter_by_indels(options, chunk)
    _filter_by_properties(options, chunk, frequencies)

    if at_end:
        chunk.append(None)
    return chunk
+
+
def _mark_as_filtered(vcf, filter_name):
    """Adds 'filter_name' to the FILTER column of a VCF record; returns True
    if the filter was recorded, and False if it was already present.

    Unfiltered ('.') or passing ('PASS') records have their FILTER value
    replaced outright; otherwise the name is appended, separated by ';'.
    """
    if vcf.filter in (".", "PASS"):
        vcf.filter = filter_name
        return True
    elif filter_name not in vcf.filter.split(";"):
        vcf.filter += ";" + filter_name
        return True

    # Fix: previously this case fell through, implicitly returning None;
    # return False explicitly, since callers use the result as a flag.
    return False
diff --git a/paleomix/common/vcfwrap.py b/paleomix/common/vcfwrap.py
new file mode 100644
index 0000000..a7fb417
--- /dev/null
+++ b/paleomix/common/vcfwrap.py
@@ -0,0 +1,143 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Wrapper and utility functions for VCF handling, using
+the VCF data-structures from pysam."""
+
+import re
+import collections
+
+
# Regexp template matching a 'KEY=VALUE' entry in an INFO column
_re_tmpl = "(^|;)%s=([^;]+)(;|$)"
# Cache of compiled regexps, keyed by INFO field name
_re_cache = {}


def get_info(vcf, field, default = None, type = str):
    """Returns the value of the specified field from the info column
    of a VCF record, converted using the 'type' parameter, which may be
    any callable. If no matching key is found, or if the key is not
    associated with a value, the 'default' value is returned instead
    (None, unless otherwise specified)."""
    try:
        regexp = _re_cache[field]
    except KeyError:
        regexp = _re_cache[field] = re.compile(_re_tmpl % (field,))

    match = regexp.search(vcf.info)
    if not match:
        return default

    return type(match.groups()[1])
+
+
# Description of an indel relative to the reference:
#   in_reference -- True if 'what' is part of the reference (deletion)
#   pos          -- 0-based position of the base just before the indel
#   prefix       -- shared first base of the REF and ALT sequences
#   what         -- the bases added to / removed from the reference
#   postfix      -- shared trailing sequence following the indel
Indel = collections.namedtuple("Indel", ["in_reference", "pos", "prefix", "what", "postfix"])

def parse_indel(vcf):
    """Parses the VCF record of an indel, and returns a tuple containing the
    position (0-based) of the previous base, a boolean indicating whether or
    not the subsequent sequence is found in the reference sequence, and a
    string containing the bases added to / removed from the reference.

    Thus (7, False, "ACGT", "AC") indicates that the sequence ACGT has been
    inserted following the 8th nucleotide, compared with the reference, and
    that the insertion is followed by the bases "AC" on the reference.

    Raises ValueError for records that are not single-allele indels, or
    which do not follow the VCF specification."""
    if not is_indel(vcf):
        raise ValueError("SNP passed to 'parse_indel'!")
    elif "," in vcf.alt:
        raise ValueError("VCF records with multiple indels not supported!")
    elif vcf.ref[0] != vcf.alt[0]:
        # Per the VCF spec, REF and ALT of an indel share their first base
        raise ValueError("Sequences do not match VCF spec, first base differs: "
                         "%s:%s -- %s > %s" % (vcf.contig, vcf.pos + 1, vcf.ref, vcf.alt))

    ref_len, alt_len = len(vcf.ref), len(vcf.alt)
    # The length of the insertion / deletion
    len_diff = abs(alt_len - ref_len)

    # Whether or not the sequence 'what' is found in the reference
    in_reference = (ref_len >= alt_len)

    # The sequence added or removed from the reference
    longest = max(vcf.ref, vcf.alt, key = len)
    shortest = min(vcf.ref, vcf.alt, key = len)
    what = longest[1:len_diff + 1]

    postfix = shortest[1:]
    if longest[len_diff + 1:] != postfix:
        raise ValueError("Sequence postfix does not match; malformed indel!")

    return Indel(in_reference, vcf.pos, vcf.ref[0], what, postfix)
+
+
def is_indel(vcf):
    """Returns True when the VCF record describes an indel, judged by the
    presence of the 'INDEL' key in its INFO column."""
    # FIXME: Is this a universal key for indels?
    return vcf.info.find("INDEL") != -1
+
+
def get_genotype(vcf, sample=0, _re=re.compile(r'[|/]')):
    """Returns the called genotype of a sample in a VCF record, as a list of
    allele sequences (one per allele in the 'GT' field). The numeric indices
    in 'GT' are resolved against the REF and ALT columns of the record;
    phased ('|') and unphased ('/') separators are treated alike."""
    # Allele 0 is REF; subsequent indices refer to the (comma-separated) ALTs
    nucleotides = []
    nucleotides.extend(vcf.ref.split(","))
    nucleotides.extend(vcf.alt.split(","))

    result = []
    for genotype in _re.split(get_format(vcf, sample)["GT"]):
        result.append(nucleotides[int(genotype)])

    return result
+
+
# The pair of allele indices corresponding to each entry in the VCF PL
# field, for up to 10 alleles; built so that for alleles jj <= ii, the
# pair (jj, ii) appears at the PL index defined by the VCF specification.
_genotype_indices = [(jj, ii)
                     for ii in range(0, 10)
                     for jj in range(0, ii + 1)]
+
+
def get_ml_genotype(vcf, sample=0):
    """Returns the most likely genotype of a sample in a vcf record, as a
    tuple of two allele sequences, based on the PL (phred-scaled genotype
    likelihood) values. If no single most likely genotype can be determined,
    the function returns 'N' for both bases."""
    # Allele 0 is REF; subsequent indices refer to the (comma-separated) ALTs
    genotypes = []
    genotypes.extend(vcf.ref.split(","))
    genotypes.extend(vcf.alt.split(","))

    PL = map(int, get_format(vcf, sample)["PL"].split(","))

    # One PL value is expected per unordered pair of alleles
    expected_length = (len(genotypes) * (len(genotypes) + 1)) // 2
    if len(PL) != expected_length:
        raise ValueError("Expected %i PL values, found %i"
                         % (expected_length, len(PL)))

    if PL.count(min(PL)) > 1:
        # No single most likely genotype
        return ("N", "N")

    # The smallest phred-scaled likelihood marks the most likely genotype
    most_likely = min(xrange(len(PL)), key=PL.__getitem__)
    prefix, postfix = _genotype_indices[most_likely]

    return (genotypes[prefix], genotypes[postfix])
+
+
def get_format(vcf, sample=0):
    """Returns the per-sample fields of a VCF record as a dict, mapping each
    key of the FORMAT column to the matching value in the sample's column."""
    keys = vcf.format.split(":")
    values = vcf[sample].split(":")
    return dict(zip(keys, values))
diff --git a/paleomix/common/versions.py b/paleomix/common/versions.py
new file mode 100644
index 0000000..5ac3948
--- /dev/null
+++ b/paleomix/common/versions.py
@@ -0,0 +1,432 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# pylint: disable=W0223
+#
+"""Version checks for apps or libraries required by PALEOMIX pipelines.
+
+The module contains two sets of classes: RequirementObj and Check. The
+RequirementObj class implements the determination of the current version for a
+given tool, while the Check (sub)classes implements various comparison
+to be carried against the detected version (less than, greater than or equal,
+etc.).
+
+To reduce the overhead of determining versions (which mostly involves invoking
+external programs), the RequirementObj caches results. Additionally, to avoid
+redundant calls, RequirementObjs are created using the 'Requirement' function
+which caches RequirementObjs.
+
+For example, to check that the Java version is v1.7 or later:
+ obj = Requirement(call=("java", "-version"),
+ search='java version "(\\d+).(\\d+)',
+ checks=GE(1, 7),
+ name="Java Runtime Environment")
+ try:
+ obj()
+ except VersionRequirementError:
+ pass # requirements not met, or failure to determine version
+"""
+import re
+import operator
+import collections
+
+from paleomix.common.utilities import \
+ Immutable, \
+ TotallyOrdered, \
+ safe_coerce_to_tuple, \
+ try_cast
+from paleomix.common.fileutils import \
+ which_executable
+
+import paleomix.common.procs as procs
+
+
# Cache used to store the output of cmd-line / function calls,
# keyed by the call tuple (see _do_call)
_CALL_CACHE = {}
# Cache used to store Requirement objects, keyed by
# (call, search, checks); see the 'Requirement' function
_REQUIREMENT_CACHE = {}
+
+
# NOTE: StandardError is Python 2 only; the base class of most built-ins
class VersionRequirementError(StandardError):
    """Raised if version requirements are not met, or if a version could not be
    determined for a requirement check.
    """
+
+
def Requirement(call, search, checks, name=None, priority=0):
    # Ignore function naming scheme
    # pylint: disable=C0103
    """Returns a singleton Requirement object, based on the parameters,
    which may be used to check that version requirements are met for a
    given program/utility/module, etc.

    Parameters:
      call     -- A string, or a tuple containing strings for a system call,
                  or a tuple containing a function at the first position, and
                  a set of positional parameters. In the case of system calls,
                  stdout and stderr are returned as a single string, in the
                  case of a function call, the return value is expected to be
                  a str.
      search   -- A regular expression (string or re object), used to search
                  the output of the "call". Groups are assumed to represent
                  version numbers.
      checks   -- A callable that implements the interface described in the
                  Check class.
      name     -- Descriptive name for the executable/module/etc. If not
                  specified, first value in 'call' will be used; if multiple
                  otherwise identical checks are made, the last name that
                  does not equal the first value of 'call' will be used.
      priority -- Order in which requirements are checked; if multiple
                  otherwise identical checks are made with different priority,
                  the highest priority takes precedence.

    Implementation detail: To reduce the need for performing calls or system-
    calls multiple times, caches are implemented using the call object as keys.
    Thus the same calls should be passed in a manner which allow equality
    between the same calls to be established.
    """
    call = safe_coerce_to_tuple(call)
    key = (call, search, checks)

    try:
        requirement = _REQUIREMENT_CACHE[key]

        # Highest priority takes precedence
        requirement.priority = max(requirement.priority, priority)
        # Last explicitly specified name takes precedence
        requirement.name = name or requirement.name
    except KeyError:
        requirement = RequirementObj(*key, name=name, priority=priority)
        _REQUIREMENT_CACHE[key] = requirement

    return requirement
+
+
class RequirementObj(object):
    """Represents a version requirement."""

    def __init__(self, call, search, checks, name=None, priority=0):
        """See function 'Requirement' for a description of parameters.
        """
        self._call = safe_coerce_to_tuple(call)
        # None until the requirement has been successfully checked
        self._done = None
        self.name = str(name or self._call[0])
        self.priority = int(priority)
        self.checks = checks
        self._rege = re.compile(search)
        # Lazily determined version tuple; see the 'version' property
        self._version = None

    @property
    def version(self):
        """The version determined for the application / library. If the version
        could not be determined, a VersionRequirementError is raised,
        describing the cause of the problem.
        """
        if self._version is None:
            output = _do_call(self._call)
            # Raise an exception if the JRE is outdated, even if the
            # version could be determined (likely a false positive match).
            self._check_for_outdated_jre(output)

            match = self._rege.search(output)
            if not match:
                self._raise_failure(output)

            # Unmatched (optional) regexp groups are treated as zero
            self._version = tuple(0 if value is None else try_cast(value, int)
                                  for value in match.groups())

        return self._version

    @property
    def executable(self):
        """Returns the executable invoked during version determination; if no
        executable is invoked, None is returned.
        """
        if not isinstance(self._call[0], collections.Callable):
            return self._call[0]

    def __call__(self, force=False):
        """Checks that the version requirement is met, caching the result;
        raises VersionRequirementError otherwise. If 'force' is true, the
        check is repeated even if previously carried out."""
        if force or self._done is None:
            if not self.checks(self.version):
                lines = ["Version requirements not met for %s; please refer\n"
                         "to the PALEOMIX documentation for more information."
                         "\n" % (self.name,)]
                lines.extend(self._describe_call())

                version = _pprint_version(self.version)
                lines.append(" Version: %s" % version)
                lines.append(" Required: %s" % self.checks)

                raise VersionRequirementError("\n".join(lines))

            self._done = True

    def _check_for_outdated_jre(self, output):
        """Checks for the error raised if the JRE is unable to run a JAR file.
        This happens if the JAR was built with a newer version of the JRE, e.g.
        if Picard was built with a v1.7 JRE, and then run with a v1.6 JRE.
        """
        # This exception is raised if the JRE is incompatible with the JAR
        if "UnsupportedClassVersionError" in output:
            self._raise_failure(output)

    def _raise_failure(self, output):
        """Raises a VersionRequirementError when a version check failed; if the
        output indicates that the JRE is outdated (i.e. the output contains
        "UnsupportedClassVersionError") a special message is given.
        """
        lines = ["Version could not be determined for %s:" % (self.name,)]
        lines.append("")
        lines.extend(self._describe_call())
        lines.append("")

        # Raised if the JRE is too old compared to the JAR
        if "UnsupportedClassVersionError" in output:
            lines.extend([
                "The version of the Java Runtime Environment on this",
                "system is too old; please check the the requirement",
                "for the program and upgrade your version of Java.",
                "",
                "See the documentation for more information.",
            ])
        else:
            lines.append("Program may be broken or a version not supported by the")
            lines.append("pipeline; please refer to the PALEOMIX documentation.\n")
            lines.append(" Required: %s" % (self.checks,))
            lines.append(" Search string: %r\n" % (self._rege.pattern))
            lines.append("%s Command output %s" % ("-" * 22, "-" * 22))
            lines.append(output)

        raise VersionRequirementError("\n".join(lines))

    def _describe_call(self):
        """Yields string describing the current system call, if any.
        """
        if self.executable:
            exec_path = which_executable(self.executable) or self.executable
            yield " Executable: %s" % (exec_path,)

        if not isinstance(self._call[0], collections.Callable):
            yield " Call: %s" % (" ".join(self._call),)
+
+
class Check(Immutable, TotallyOrdered):
    # Ignore "missing" members; required due to use of Immutable
    # pylint: disable=E1101
    """Abstract base-class for version checks.

    Callable with a tuple of version fields (typically integers), and returns
    either True or False, depending on whether or not the specified check
    passed.

    The constructor takes a string describing the check ('description'), a
    function with the signature 'func(version, values)', where version is the
    version determined for a app/library, and where values are the values
    passed to the Check constructor.
    """

    def __init__(self, description, func, *values):
        if not callable(func):
            raise TypeError('func must be callable, not %r' % (func,))

        values = tuple(values)
        # The Immutable mixin requires attributes to be set via __init__;
        # '_objs' collects all state for ordering/hashing (see __lt__)
        Immutable.__init__(self,
                           _func=func,
                           _values=values,
                           _description=description,
                           _objs=(description, func, values))

    def __str__(self):
        return self._description

    def __lt__(self, other):
        # Sole comparison operator required by the TotallyOrdered mixin
        if isinstance(other, Check):
            return self._objs < other._objs  # pylint: disable=W0212
        return NotImplemented  # pragma: no coverage

    def __hash__(self):
        return hash(self._objs)

    def __call__(self, current):
        """Takes a tuple of version fields (e.g. (1, 7)) and returns True if
        this version matches the check.
        """
        return self._do_check_version(current, self._values)

    def _do_check_version(self, current, reference):
        """Invokes the actual check; may be overridden in subclasses."""
        return self._func(current, reference)
+
+
class CheckVersion(Check):
    """Base class for comparisons involving versions; requires that the version
    checked has at least as many fields as specified for the Check object. If
    the version checked has more fields, these are truncated away.
    """

    def __init__(self, description, func, *version):
        # '{0}' in the description is replaced by the pretty-printed version
        description = description.format(_pprint_version(version))
        Check.__init__(self, description, func, *version)

    def _do_check_version(self, current, reference):
        if len(current) < len(reference):
            raise ValueError("Expects at least %i fields, not %i: %r"
                             % (len(reference), len(current), current))

        # Truncate excess fields before delegating the comparison
        return Check._do_check_version(self,
                                       current[:len(reference)],
                                       reference)
+
+
class EQ(CheckVersion):
    """Checks that a version is Equal to this version; note that version fields
    are truncated to the number of fields specified for this Check. As a
    consequence, EQ(1, 5) is true for (1, 5), (1, 5, 7), (1, 5, 7, 1), etc. See
    'Check' for more information.
    """

    def __init__(self, *version):
        # Description reads e.g. 'v1.5.x'
        CheckVersion.__init__(self, "{0}", operator.eq, *version)
+
+
class GE(CheckVersion):
    """Checks that a version is Greater-than or Equal to this version; note
    that version fields are truncated to the number of fields specified for
    this Check. See 'Check'.
    """

    def __init__(self, *version):
        # Description reads e.g. 'at least v1.5.x'
        CheckVersion.__init__(self, "at least {0}", operator.ge, *version)
+
+
class LT(CheckVersion):
    """Checks that a version is Less Than this version; note that version
    fields are truncated to the number of fields specified for this Check.
    See 'Check'.
    """

    def __init__(self, *version):
        # Description reads e.g. 'prior to v1.5.x'
        CheckVersion.__init__(self, "prior to {0}", operator.lt, *version)
+
+
class Any(CheckVersion):
    """Dummy check; is always true."""

    def __init__(self):
        # No version fields; _func_any accepts any version tuple
        CheckVersion.__init__(self, "any version", _func_any)
+
+
class Operator(Check):
    """Base class for logical operations on Checks; and, or, etc."""

    def __init__(self, keyword, func, *checks):
        """Arguments:
        keyword -- Keyword to join description of checks by.
        func -- Function implementing the logical operation; is called as
                func(*checks). See the 'func' argument for Check.__init__.
        checks -- Zero or more Checks.
        """
        descriptions = []
        for check in checks:
            if isinstance(check, Operator):
                # Parenthesize nested operators so precedence stays readable
                descriptions.append("(%s)" % (check,))
            elif isinstance(check, Check):
                descriptions.append("%s" % (check,))
            else:
                raise ValueError("%r is not of type Check" % (check,))

        description = keyword.join(descriptions)
        Check.__init__(self, description, func, *checks)
+
+
class And(Operator):
    """Carries out 'and' on a set of checks; always true for no Checks"""

    def __init__(self, *checks):
        # Descriptions joined by ' and '; evaluated via _func_and
        Operator.__init__(self, " and ", _func_and, *checks)
+
+
class Or(Operator):
    """Carries out 'or' on a set of checks; always false for no Checks"""

    def __init__(self, *checks):
        # Descriptions joined by ' or '; evaluated via _func_or
        Operator.__init__(self, " or ", _func_or, *checks)
+
+
+###############################################################################
+###############################################################################
+# Check functions; must be available for pickle
+
def _func_any(_current, _checks):
    """Implementation of Any; always true, regardless of version."""
    return True
+
+
def _func_and(current, checks):
    """Implementation of And; true only if every check accepts the version
    'current' (vacuously true when there are no checks)."""
    for single_check in checks:
        if not single_check(current):
            return False
    return True
+
+
def _func_or(current, checks):
    """Implementation of Or; true if at least one check accepts the version
    'current' (false when there are no checks)."""
    for single_check in checks:
        if single_check(current):
            return True
    return False
+
+
+###############################################################################
+###############################################################################
+# Utility functions
+
def _run(call):
    """Carries out a system call and returns STDOUT and STDERR as a combined
    string. If an OSError is raised (e.g. due to missing executables), the
    resulting message is returned as a string.
    """
    try:
        proc = procs.open_proc(call,
                               stdout=procs.PIPE,
                               # Merge STDERR with STDOUT output
                               stderr=procs.STDOUT)

        return proc.communicate()[0]
    except (OSError, procs.CalledProcessError), error:
        return str(error)
+
+
def _do_call(call):
    """Performs a call; the result is cached, and returned upon subsequent
    calls with the same signature (either a function call or system call).
    """
    try:
        return _CALL_CACHE[call]
    except KeyError:
        # A tuple starting with a callable is treated as a function call;
        # anything else is passed to _run as a system call
        if callable(call[0]):
            result = call[0](*call[1:])
        else:
            result = _run(call)
        _CALL_CACHE[call] = result
        return result
+
+
def _pprint_version(value):
    """Pretty-prints a tuple of version fields as a dot-joined string with a
    'v' prefix and '.x' suffix, e.g. (1, 7) -> 'v1.7.x'."""
    fields = [str(field) for field in value]
    return "v" + ".".join(fields) + ".x"
diff --git a/paleomix/config.py b/paleomix/config.py
new file mode 100644
index 0000000..a692952
--- /dev/null
+++ b/paleomix/config.py
@@ -0,0 +1,198 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import types
+import socket
+import getpass
+import optparse
+import ConfigParser
+import multiprocessing
+
+from paleomix.common.fileutils import \
+ make_dirs
+from paleomix.common.console import \
+ print_info
+
+
class ConfigError(RuntimeError):
    """Error raised for invalid values / usage of per-host configuration."""
    pass
+
+
class PerHostValue:
    """Marker wrapper for an option default value that may be overridden on
    a per-host basis via the config files read by PerHostConfig."""

    def __init__(self, value, is_path=False):
        """Represents a config value that should be settable on a
        per-host basis. If 'is_path' is set, the value is assumed
        to represent a path, and ~ is expanded to the user's home
        folder."""

        # Default value; used unless overridden in a config-file section
        self.value = value
        # If true, PerHostConfig expands '~' in the (possibly overridden) value
        self.is_path = is_path
+
+
class PerHostConfig:
    """Helper class for optparse.OptionParser use by pipelines; standardizes the
    process of reading / writing overridable CLI options, while allowing per-host
    options to be set in the .ini files. Values for options with a default value
    that is an instance of PerHostValue will automatically be read-from /
    written-to the per-user config file.

    Given a pipeline with name "NAME", the class will read options from the
    following configuration files (see _get_filenames):
      - /etc/pypeline/NAME.ini
      - /etc/paleomix/NAME.ini
      - ~/.paleomix/NAME.ini

    These files are expected to contain a "Defaults" section (applies to all hosts),
    and an optional section using the hostname of a server. A file containing the
    current settings (as passed on the CLI) may be written using --write-config-file.

    Example usage:
      per_host_cfg = PerHostConfig("my_pypeline")
      parser = OptionParser(...)
      parser.add_option(..., default = PerHostValue(...))
      config, args = per_host_cfg.parse_args(parser, sys.argv[1:])
    """

    def __init__(self, pipeline_name):
        """Creates a PerHostConfig for a pipeline with the specified name."""
        # Various common options
        temp_root = os.path.join("/tmp", getpass.getuser(), pipeline_name)
        self.temp_root = PerHostValue(temp_root, True)
        # At least 2 threads are required for e.g. PE BWA nodes, and generally recommended anyway
        self.max_threads = PerHostValue(max(2, multiprocessing.cpu_count()))

        self._filenames = self._get_filenames(pipeline_name)
        self._handle = ConfigParser.SafeConfigParser()
        self._handle.read(self._filenames)
        self._sections = []

        # Sections are searched in order; a host-specific section (named
        # after the current hostname) takes precedence over "Defaults"
        hostname = socket.gethostname()
        if self._handle.has_section(hostname):
            self._sections.append(hostname)
        self._sections.append("Defaults")

    def parse_args(self, parser, argv):
        """Calls 'parse_args' on the parser object after updating default values
        using the settings-files. If --write-config-file is set, a config file
        containing the resulting settings is written."""
        self._add_per_host_options(parser)
        defaults = self._update_defaults(parser)
        config, args = parser.parse_args(argv)

        if config.write_config_file:
            # NOTE: _write_config_file terminates the process via sys.exit(0)
            self._write_config_file(config, defaults)

        return config, args

    def _write_config_file(self, config, defaults):
        """Writes a basic config file, using the values previously found in the
        config files, and specified on the command-line."""
        defaults_cfg = ConfigParser.SafeConfigParser()
        defaults_cfg.add_section("Defaults")
        for key in defaults:
            value = getattr(config, key)
            # List-valued options are stored as ';'-separated strings
            if isinstance(value, (types.ListType, types.TupleType)):
                value = ";".join(value)

            defaults_cfg.set("Defaults", key, str(value))

        # The per-user file (last in the list) is the one written
        filename = self._filenames[-1]
        make_dirs(os.path.dirname(filename))
        with open(filename, "w") as handle:
            defaults_cfg.write(handle)

        print_info("Wrote config file %r" % (filename,))
        sys.exit(0)

    def _add_per_host_options(self, parser):
        """Adds options to a parser relating to the PerHostConfig."""
        group = optparse.OptionGroup(parser, "Config files")
        group.add_option("--write-config-file",
                         default=False, action="store_true",
                         help="Write config using current settings to %s"
                         % (self._filenames[-1],))
        parser.add_option_group(group)

    def _update_defaults(self, parser):
        """Updates default values in a OptionParser, and returns a new
        ConfigParser object containing a new default-values object derived
        from current config-files / CLI options."""
        defaults = {}
        for opt in parser._get_all_options():
            if isinstance(opt.default, PerHostValue):
                defaults[opt.dest] = self._get_default(opt)
        parser.set_defaults(**defaults)
        return defaults

    def _get_default(self, option):
        # Returns the effective default for a PerHostValue option; a typed
        # getter matching the Python type of the wrapped default is used, so
        # values read from file are coerced to the expected type.
        value = option.default.value
        getter = self._handle.get
        if isinstance(value, types.BooleanType):
            getter = self._handle.getboolean
        elif isinstance(value, (types.IntType, types.LongType)):
            getter = self._handle.getint
        elif isinstance(value, (types.FloatType)):
            getter = self._handle.getfloat
        elif isinstance(value, (types.ListType, types.TupleType)):
            # Lists are stored as ';'-separated strings (see _write_config_file);
            # empty fields are discarded
            def getter(section, key):
                return filter(None, self._handle.get(section, key).split(";"))

        # First matching section wins (host-specific before "Defaults")
        for section in self._sections:
            if self._handle.has_option(section, option.dest):
                value = getter(section, option.dest)
                break

        if option.default.is_path:
            value = os.path.expanduser(value)

        return value

    @classmethod
    def _get_filenames(cls, name):
        """Return standard list of config files for PALEOMIX pipelines:
          - /etc/pypeline/{name}.ini
          - /etc/paleomix/{name}.ini
          - ~/.paleomix/{name}.ini
        """
        filename = "%s.ini" % (name,)
        homefolder = os.path.expanduser('~')
        return ["/etc/pypeline/%s" % (filename,),
                "/etc/paleomix/%s" % (filename,),
                os.path.join(homefolder, ".paleomix", filename)]
+
+
def migrate_config():
    """Checks for the existence of PALEOMIX config files in the old, deprecated
    location (~/.pypeline), and moves these to the new location (~/.paleomix),
    if no config files exist. The old location is replaced with a symbolic
    link, to ensure that older versions of the pipeline do not break.
    """
    home = os.path.expanduser('~')
    deprecated_root = os.path.join(home, ".pypeline")
    current_root = os.path.join(home, ".paleomix")

    # Nothing to do if already migrated, or if there is nothing to migrate
    if os.path.exists(current_root) or not os.path.exists(deprecated_root):
        return

    sys.stderr.write("INFO: Migrating ~/.pypeline to ~/.paleomix\n")
    os.rename(deprecated_root, current_root)
    os.symlink(current_root, deprecated_root)
diff --git a/paleomix/logger.py b/paleomix/logger.py
new file mode 100644
index 0000000..a3ff8bd
--- /dev/null
+++ b/paleomix/logger.py
@@ -0,0 +1,179 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import errno
+import optparse
+import logging
+
+import paleomix.common.console as _cli
+import paleomix.common.fileutils as _fs
+
+
def initialize(config, template=None):
    """Takes an OptionParser object for which 'add_optiongroup' has
    been called, as well as a filename template (containing one '%i'
    field), and initializes logging for a PALEOMIX pipeline.

    If --log-file has not been specified, the template is used to
    create a new logfile in --temp-root, skipping existing logfiles
    by incrementing the counter value. If a --log-file has been
    specified, this file is always created / opened.

    If neither --log-file nor 'template' has been specified, then
    logging is only carried out by printing messages to STDERR."""
    global _INITIALIZED  # pylint: disable=W0603
    if _INITIALIZED:
        raise RuntimeError("Attempting to initialize logging more than once")

    root = logging.getLogger()
    root.setLevel(logging.INFO)
    root.addHandler(_PrintToConsole(logging.INFO))

    if config.log_file or template:
        # Verify that the template is functional up front; this was
        # previously done unconditionally, raising a TypeError when
        # --log-file was set without a template (None % tuple).
        if template is not None:
            template % (1,)  # pylint: disable=W0104

        level = _LOGLEVELS[config.log_level]
        if config.log_file:
            handler = logging.FileHandler(config.log_file)
        else:
            # Logfile is only created once a message is actually logged
            handler = _LazyLogfile(config.temp_root, template)
        fmt = "%s\n%%(asctime)s -- %%(levelname)s:\n%%(message)s"
        formatter = logging.Formatter(fmt % ("-" * 60,))
        handler.setFormatter(formatter)
        handler.setLevel(level)
        root.addHandler(handler)

    _INITIALIZED = True
+
+
def add_optiongroup(parser, default="warning"):
    """Adds an option-group to an OptionParser object, with options
    pertaining to logging. Note that 'initialize' expects the config
    object to have these options."""
    group = optparse.OptionGroup(parser, "Logging")
    group.add_option(
        "--log-file", default=None,
        help="Create the specified log-file and write any "
             "messages to this file. By default, a log-file "
             "will be generated in the folder specified using "
             "--temp-root, but only when messages are logged")
    group.add_option(
        "--log-level", default=default, type="choice",
        choices=("info", "warning", "error", "debug"),
        help="Log messages to log-file at and above the "
             "specified level; one of 'info', 'warning', "
             "'error', or 'debug' [%default]")
    parser.add_option_group(group)
+
+
def get_logfile():
    """Returns the path of the logfile lazily created by _LazyLogfile, or
    None if no logfile has been created (yet)."""
    return _LOGFILE
+
+
class _PrintToConsole(logging.Handler):
    """Logger that prints messages to the console using the
    paleomix.ui functions for colored text. Colors are blue
    for DEBUG, green for INFO (and unknown levels), yellow
    for WARNING, and red for ERROR and CRITICAL."""

    def __init__(self, level=logging.NOTSET):
        logging.Handler.__init__(self, level)

    def emit(self, record):
        printer = self.get_ui_function(record.levelno)
        printer(record.getMessage(), file=sys.stderr)

    @classmethod
    def get_ui_function(cls, level):
        # CRITICAL and ERROR share the most severe color (red)
        if level in (logging.ERROR, logging.CRITICAL):
            return _cli.print_err
        if level == logging.WARNING:
            return _cli.print_warn
        if level == logging.DEBUG:
            return _cli.print_debug
        # INFO and any unknown levels
        return _cli.print_info
+
+
class _LazyLogfile(logging.Handler):
    """Handler that defers creating the logfile until a message is actually
    emitted, so that error-free runs do not leave behind empty logfiles."""

    def __init__(self, folder, template):
        logging.Handler.__init__(self)
        self._folder = folder
        self._template = template
        self._stream = None
        self._handler = None
        self._formatter = None

    def emit(self, record):
        if self._handler is None:
            # First message; create the logfile and record its path so that
            # 'get_logfile' can report it
            global _LOGFILE  # pylint: disable = W0603
            _LOGFILE, self._stream = _open_logfile(self._folder,
                                                   self._template)
            self._handler = logging.StreamHandler(self._stream)
            self._handler.setFormatter(self._formatter)
        self._handler.emit(record)

    def flush(self):
        if self._handler is not None:
            self._handler.flush()

    def setFormatter(self, form):
        # Remember the formatter, so it can be applied to the wrapped
        # StreamHandler once that is (lazily) created
        logging.Handler.setFormatter(self, form)
        self._formatter = form

    def close(self):
        if self._handler is not None:
            self._handler.close()
            self._stream.close()
            self._handler = None
            self._stream = None
+
+
+def _open_logfile(folder, template, start = 0):
+ """Try to open a new logfile, taking steps to ensure that
+ existing logfiles using the same template are not clobbered."""
+ if not os.path.exists(folder):
+ _fs.make_dirs(folder)
+
+ flags = os.O_WRONLY | os.O_CREAT | os.O_EXCL
+ while True:
+ filename = os.path.join(folder, template % (start,))
+ try:
+ if not os.path.exists(filename):
+ return filename, os.fdopen(os.open(filename, flags), "w")
+ except OSError, error:
+ if error.errno != errno.EEXIST:
+ raise
+ start += 1
+
+
# Set by 'initialize'; guards against logging being initialized twice
_INITIALIZED = False
# Path of the logfile created by _LazyLogfile (see 'get_logfile')
_LOGFILE = None
# Maps --log-level choices (see 'add_optiongroup') to 'logging' levels
_LOGLEVELS = {
    'info' : logging.INFO,
    'warning' : logging.WARNING,
    'error' : logging.ERROR,
    'debug' : logging.DEBUG,
}
diff --git a/paleomix/main.py b/paleomix/main.py
new file mode 100755
index 0000000..2414f23
--- /dev/null
+++ b/paleomix/main.py
@@ -0,0 +1,236 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Wrapper script around PALEOMIX commands.
+
+This script takes care of checking that various requirements are met, that the
+PALEOMIX module ('paleomix') is available, and forwards arguments to the
+appropriate commands.
+"""
+import os
+import sys
+import textwrap
+
+
+# List of tuples of commands: (name, module, help string).
+# If module is None, the command cannot be invoked directly (e.g. help), and
+# if help string is none, the command is considered a help header.
+def _commands():
+ yield ("Pipelines", None, None)
+ yield ("bam_pipeline", "paleomix.tools.bam_pipeline.pipeline",
+ "Pipeline for trimming and mapping of NGS reads.")
+ yield ("trim_pipeline", "paleomix.tools.bam_pipeline.trim_pipeline",
+ "Equivalent to 'bam_pipeline', but only runs the trimming steps.")
+ yield ("phylo_pipeline", "paleomix.tools.phylo_pipeline.pipeline",
+ "Pipeline for genotyping and phylogenetic inference from BAMs.")
+ yield ("zonkey", "paleomix.tools.zonkey.pipeline",
+ "Pipeline for detecting F1 (equine) hybrids.")
+
+ # Currently not documented; used internally by Zonkey
+ yield ("zonkey_db", "paleomix.tools.zonkey.build_db", None)
+ yield ("zonkey_tped", "paleomix.tools.zonkey.build_tped", None)
+ yield ("zonkey_mito", "paleomix.tools.zonkey.build_mito", None)
+
+ # In development: Integration of metabit into PALEOMIX
+ yield ("metabit", "paleomix.tools.metabit.metabit", None)
+
+ yield ("BAM/SAM tools", None, None)
+ yield ("cleanup", "paleomix.tools.cleanup",
+ "Reads SAM file from STDIN, and outputs sorted, tagged, and filter "
+ "BAM, for which NM and MD tags have been updated.")
+ yield ("coverage", "paleomix.tools.coverage",
+ "Calculate coverage across reference sequences or regions of "
+ "interest.")
+ yield ("depths", "paleomix.tools.depths",
+ "Calculate depth histograms across reference sequences or regions "
+ "of interest.")
+ yield ("duphist", "paleomix.tools.duphist",
+ "Generates PCR duplicate histogram; used with the 'Preseq' tool.")
+ yield ("rmdup_collapsed", "paleomix.tools.rmdup_collapsed",
+ "Filters PCR duplicates for collapsed paired-ended reads generated "
+ "by the AdapterRemoval tool.")
+
+ yield ("VCF/GTF/BED/Pileup tools", None, None)
+ yield ("genotype", "paleomix.tools.genotype",
+ "Creates bgzipped VCF for a set of (sparse) BED regions, or for "
+ "entire chromosomes / contigs using SAMTools / BCFTools.")
+ yield ("gtf_to_bed", "paleomix.tools.gtf_to_bed",
+ "Convert GTF file to BED files grouped by feature "
+ "(coding, RNA, etc).")
+ yield ("sample_pileup", "paleomix.tools.sample_pileup",
+ "Randomly sample sites in a pileup to generate a FASTA sequence.")
+ yield ("vcf_filter", "paleomix.tools.vcf_filter",
+ "Quality filters for VCF records, similar to "
+ "'vcfutils.pl varFilter'.")
+ yield ("vcf_to_fasta", "paleomix.tools.vcf_to_fasta",
+ "Create most likely FASTA sequence from tabix-indexed VCF file.")
+
+ yield ("Misc tools", None, None)
+ yield ("cat", "paleomix.tools.cat",
+ "Generalized cat command for gz, bz2 and uncompressed files.")
+
+ # In development:
+ # Prepares FASTQ reads recorded in BAM pipeline makefiles
+ # for submission to the European Nucleotide Archive.
+ yield ("ena", "paleomix.tools.ena", None)
+
# Error message shown if the Pysam module ('pysam') cannot be imported
_IMPORT_ERROR_PYSAM = """
Error importing required python module 'pysam':
 - %s

The module may be installed for the current user using 'pip':
 $ pip install --user pysam

Alternatively, download the latest version from the Pysam repository at GitHub:
 - https://github.com/pysam-developers/pysam

A local install may be performed using the following command:
 $ python setup.py install --user
"""

# Error message in case it is not possible to import the PALEOMIX module itself
_IMPORT_ERROR_PALEOMIX = """
Error importing PALEOMIX module 'paleomix':
 - %s

Please make sure that PALEOMIX is correctly installed, and that the PYTHONPATH
environmental variable points to the location of the 'paleomix' module.
"""

# Error message shown when the running script and the imported 'paleomix'
# module live in different folders, i.e. conflicting installations
_INCONSISTENT_IMPORT_ERROR = """
Inconsistency importing PALEOMIX module 'paleomix'; the currently running
script is not located in the same folder as the 'paleomix' module. This
suggests that you have multiple, conflicting copies of PALEOMIX installed!

 - The running script: %r
 - The 'paleomix' module: %r

It is strongly suggested that you remove all installed copies of PALEOMIX,
and perform a clean install. If this is not possible, the 'virtualenv' tool
for Python may be used to prevent conflict between the installed versions.
"""

# Citation printed at the end of the command overview (see _print_help)
_PALEOMIX_CITATION = """

If you make use of PALEOMIX in your work, please cite
 Schubert et al, "Characterization of ancient and modern genomes by SNP
 detection and phylogenomic and metagenomic analysis using PALEOMIX".
 Nature Protocols. 2014 May; 9(5): 1056-82. doi: 10.1038/nprot.2014.063
"""
+
+
+def _are_requirements_met():
+ """Checks the current Python version, that the PALEOMIX modules are
+ available, and modules required by the pipeline (Pysam) is available and
+ up-to-date.
+ """
+ if tuple(sys.version_info)[:2] != (2, 7):
+ sys.stderr.write("ERROR: PALEOMIX requires Python version 2.7.x.\n")
+ sys.stderr.write("However, the current version of python is\n\tv%s\n\n"
+ % (sys.version.replace("\n", "\n\t"),))
+ sys.stderr.write("Please install Python v2.7 to continue.\n")
+ return False
+
+ modules = [('pysam', _IMPORT_ERROR_PYSAM),
+ ('paleomix', _IMPORT_ERROR_PALEOMIX)]
+
+ for (module, message) in modules:
+ try:
+ __import__(module)
+ except ImportError:
+ error = sys.exc_info()[1] # Python 2/3 compatible exception syntax
+ sys.stderr.write(message % (error,))
+ return False
+
+ # Sanity check, to catch multiple, conflicting PALEOMIX installations
+ import paleomix
+ if not os.path.samefile(os.path.dirname(__file__),
+ os.path.dirname(paleomix.__file__)):
+ sys.stderr.write(_INCONSISTENT_IMPORT_ERROR
+ % (os.path.dirname(__file__),
+ os.path.dirname(paleomix.__file__)))
+ return False
+
+ import pysam
+ version = [int(field) for field in pysam.__version__.split(".")]
+ if version[:3] < [0, 8, 3]:
+ error = "Pysam is outdated (v%s), version must be at least v0.8.3!"
+ error %= (pysam.__version__,)
+ sys.stderr.write(_IMPORT_ERROR_PYSAM % (error,))
+ return False
+
+ return True
+
+
def _print_help():
    """Prints description of commands and reference to PALEOMIX paper."""
    import paleomix

    template = "    paleomix %s%s-- %s\n"
    name_width = max(len(name) for (name, module, _) in _commands() if module)
    desc_width = 80 - len(template % (" " * name_width, " ", ""))
    desc_indent = (80 - desc_width) * " "

    sys.stderr.write("PALEOMIX - pipelines and tools for NGS data analyses.\n")
    sys.stderr.write("Version: %s\n\n" % (paleomix.__version__,))
    sys.stderr.write("Usage: paleomix <command> [options]\n")
    for (name, module, description) in _commands():
        if description is None:
            # Entries without a description are either section headers
            # (module is None) or hidden, internal commands
            if module is None:
                sys.stderr.write("\n%s:\n" % (name,))
            continue

        wrapped = textwrap.wrap(description, desc_width)
        padding = (name_width - len(name) + 2) * " "
        sys.stderr.write(template % (name, padding, wrapped[0]))
        for extra_line in wrapped[1:]:
            sys.stderr.write("%s%s\n" % (desc_indent, extra_line))

    sys.stderr.write(_PALEOMIX_CITATION)
+
+
def main(argv):
    """Main function; takes a list of arguments excluding argv[0]."""
    if not _are_requirements_met():
        return 1

    # Process name defaults to the name of the python executable
    import paleomix.common.system
    paleomix.common.system.set_procname("paleomix")

    if not argv or argv[0] == "help":
        _print_help()
        return 0

    command = argv[0]
    for (name, module_name, _) in _commands():
        # Entries without a module (section headers) cannot be invoked
        if module_name is None or name != command:
            continue

        module = __import__(module_name, globals(), locals(), ["main"], 0)
        return module.main(argv[1:])

    sys.stderr.write("ERROR: Unknown PALEOMIX command %r!\n" % (command,))
    return 1


if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/node.py b/paleomix/node.py
new file mode 100644
index 0000000..ccd6334
--- /dev/null
+++ b/paleomix/node.py
@@ -0,0 +1,281 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import collections
+import os
+import sys
+import traceback
+import types
+
+import paleomix.common.fileutils as fileutils
+from paleomix.common.utilities import \
+ safe_coerce_to_frozenset
+
+from paleomix.atomiccmd.command import \
+ CmdError
+
+
class NodeError(RuntimeError):
    """Base class for errors raised during the setup / execution of a Node,
    e.g. for missing input files or executables."""
    pass


class CmdNodeError(NodeError):
    """Raised by CommandNode if the wrapped command fails to run, or if
    required / unexpected files are found after the command has run."""
    pass


class NodeUnhandledException(NodeError):
    """This exception is thrown by Node.run() if a non-NodeError exception
    is raised in a subfunction (e.g. _setup, _run, or _teardown). The text
    for this exception will include both the original error message and a
    stacktrace for that error."""
    pass
+
+
class Node(object):
    """Base class for pipeline tasks; a node is described in terms of its
    input / output / auxiliary files, required executables, version-check
    requirements, and dependencies on other nodes, and is executed by
    calling 'run' with a config object providing 'temp_root'."""

    def __init__(self, description = None, threads = 1,
                 input_files = (), output_files = (),
                 executables = (), auxiliary_files = (),
                 requirements = (), dependencies = ()):
        # 'description' is only used by __str__; must be None or a string
        if not isinstance(description, _DESC_TYPES):
            raise TypeError("'description' must be None or a string, not %r" \
                            % (description.__class__.__name__,))

        self.__description = description
        self.input_files = self._validate_files(input_files)
        self.output_files = self._validate_files(output_files)
        self.executables = self._validate_files(executables)
        self.auxiliary_files = self._validate_files(auxiliary_files)
        self.requirements = self._validate_requirements(requirements)

        self.threads = self._validate_nthreads(threads)
        self.dependencies = self._collect_nodes(dependencies)

        # If there are no input files, the node cannot be re-run based on
        # changes to the input, and nodes with output but no input are not
        # expected based on current usage.
        if not self.input_files and self.output_files:
            raise NodeError("Node not dependent upon input files: %s" % self)

    def run(self, config):
        """Runs the node, by calling _setup, _run, and _teardown in that order.
        Prior to calling these functions, a temporary dir is created using the
        'temp_root' prefix from the config object. Both the config object and
        the temporary dir are passed to the above functions. The temporary
        dir is removed after _teardown is called, and all expected files
        should have been removed/renamed at that point.

        Any non-NodeError exception raised in this function is wrapped in a
        NodeUnhandledException, which includes a full backtrace. This is needed
        to allow showing these in the main process."""

        try:
            # 'temp' is pre-bound to None, so the error handlers below can
            # refer to it even if _create_temp_dir itself raises
            temp = None
            temp = self._create_temp_dir(config)

            self._setup(config, temp)
            self._run(config, temp)
            self._teardown(config, temp)
            self._remove_temp_dir(temp)
        except NodeError, error:
            self._write_error_log(temp, error)
            raise NodeError("Error(s) running Node:\n\tTemporary directory: %s\n\n%s" \
                            % (repr(temp), error))
        except Exception, error:
            self._write_error_log(temp, error)
            raise NodeUnhandledException("Error(s) running Node:\n\tTemporary directory: %s\n\n%s" \
                                         % (repr(temp), traceback.format_exc()))

    def _create_temp_dir(self, config):
        """Called by 'run' in order to create a temporary folder.

        'config' is expected to have a property .temp_root under
        which the temporary folder is created."""
        return fileutils.create_temp_dir(config.temp_root)

    def _remove_temp_dir(self, temp):
        """Called by 'run' in order to remove an (now) empty temporary folder."""
        os.rmdir(temp)

    def _setup(self, _config, _temp):
        """Is called prior to '_run()' by 'run()'. Any code used to copy/link files,
        or other steps needed to ready the node for running may be carried out in this
        function. Checks that required input files exist, and raises an NodeError if
        this is not the case."""
        if fileutils.missing_executables(self.executables):
            raise NodeError("Executable(s) does not exist for node: %s" % (self,))
        self._check_for_missing_files(self.input_files, "input")
        self._check_for_missing_files(self.auxiliary_files, "auxiliary")

    def _run(self, _config, _temp):
        """Called between _setup and _teardown; does nothing by default, and
        is expected to be overridden to perform the actual work of the node."""
        pass

    def _teardown(self, _config, _temp):
        """Called after _run; verifies that all output files were created."""
        self._check_for_missing_files(self.output_files, "output")

    def __str__(self):
        """Returns the description passed to the constructor, or a default
        description if no description was passed to the constructor."""
        if self.__description:
            return self.__description
        return repr(self)

    def __getstate__(self):
        """Called by pickle/cPickle to determine what to pickle; this is
        overridden to avoid pickling of requirements, dependencies, which would
        otherwise greatly inflate the amount of information that needs to be
        pickled."""
        obj_dict = self.__dict__.copy()
        obj_dict["requirements"] = None
        obj_dict["dependencies"] = None
        return obj_dict

    def _write_error_log(self, temp, error):
        """Writes a description of the node and the error to 'pipe.errors' in
        the temporary folder (if it still exists), to aid debugging."""
        if not (temp and os.path.isdir(temp)):
            return

        prefix = "\n "
        message = ["Command = %r" % (" ".join(sys.argv),),
                   "CWD = %r" % (os.getcwd(),),
                   "PATH = %r" % (os.environ.get('PATH', ''),),
                   "Node = %s" % (str(self),),
                   "Threads = %i" % (self.threads,),
                   "Input files = %s" % (prefix.join(sorted(self.input_files)),),
                   "Output files = %s" % (prefix.join(sorted(self.output_files)),),
                   "Auxiliary files = %s" % (prefix.join(sorted(self.auxiliary_files)),),
                   "Executables = %s" % (prefix.join(sorted(self.executables)),),
                   "",
                   "Errors =\n%s\n" % (error,)]
        message = "\n".join(message)

        # Best-effort only; failure to write the log is reported but ignored
        try:
            with open(os.path.join(temp, "pipe.errors"), "w") as handle:
                handle.write(message)
        except OSError, oserror:
            sys.stderr.write("ERROR: Could not write failure log: %s\n" % (oserror,))

    def _collect_nodes(self, nodes):
        """Returns the dependencies as a frozenset of Nodes; raises TypeError
        if any of the values are not Node objects."""
        if nodes is None:
            return frozenset()

        nodes = safe_coerce_to_frozenset(nodes)
        bad_nodes = [node for node in nodes if not isinstance(node, Node)]

        if bad_nodes:
            bad_nodes = [repr(node) for node in bad_nodes]
            message = "Dependency-list contain non-Node objects:\n" \
                      "\t- Command: %s\n\t- Objects: %s" \
                      % (self, "\n\t ".join(bad_nodes))
            raise TypeError(message)

        return nodes

    def _check_for_missing_files(self, filenames, description):
        """Raises NodeError listing any files in 'filenames' that do not
        exist; 'description' names the category (input / output / ...)."""
        missing_files = fileutils.missing_files(filenames)
        if missing_files:
            message = "Missing %s files for command:\n\t- Command: %s\n\t- Files: %s" \
                      % (description, self, "\n\t ".join(missing_files))
            raise NodeError(message)

    @classmethod
    def _validate_requirements(cls, requirements):
        """Coerces 'requirements' to a frozenset; each item must be callable."""
        requirements = safe_coerce_to_frozenset(requirements)
        for requirement in requirements:
            if not isinstance(requirement, collections.Callable):
                raise TypeError("'requirements' must be callable, not %r"
                                % (type(requirement),))
        return requirements

    @classmethod
    def _validate_files(cls, files):
        """Coerces 'files' to a frozenset; each item must be a string."""
        files = safe_coerce_to_frozenset(files)
        for filename in files:
            if not isinstance(filename, types.StringTypes):
                raise TypeError('Files must be strings, not %r' % filename.__class__.__name__)
        return files

    @classmethod
    def _validate_nthreads(cls, threads):
        """Validates that 'threads' is a positive integer, and returns it."""
        if not isinstance(threads, (types.IntType, types.LongType)):
            raise TypeError("'threads' must be a positive integer, not %s" % (type(threads),))
        elif threads < 1:
            raise ValueError("'threads' must be a positive integer, not %i" % (threads,))
        return int(threads)
+
+
class CommandNode(Node):
    """Node that wraps a single command object; input / output / auxiliary
    files, executables, and requirements are collected from the command."""

    def __init__(self, command, description=None, threads=1,
                 dependencies=()):
        Node.__init__(self,
                      description = description,
                      input_files = command.input_files,
                      output_files = command.output_files,
                      auxiliary_files = command.auxiliary_files,
                      executables = command.executables,
                      requirements = command.requirements,
                      threads = threads,
                      dependencies = dependencies)

        self._command = command

    def _run(self, _config, temp):
        """Runs the command object provided in the constructor, and waits for it to
        terminate. If any errors during the running of the command, this function
        raises a NodeError detailing the returned error-codes."""
        try:
            self._command.run(temp)
        except CmdError, error:
            desc = "\n\t".join(str(self._command).split("\n"))
            raise CmdNodeError("%s\n\n%s" % (desc, error))

        # Any non-zero return-code is considered a failure
        return_codes = self._command.join()
        if any(return_codes):
            desc = "\n\t".join(str(self._command).split("\n"))
            raise CmdNodeError(desc)

    def _teardown(self, config, temp):
        """Verifies the files produced by the command in the temporary folder,
        raising CmdNodeError if required files are missing or unexpected files
        are present; otherwise commits the output files and defers to the
        base-class check of the final output files."""
        required_files = self._command.expected_temp_files
        optional_files = self._command.optional_temp_files
        current_files = set(os.listdir(temp))

        missing_files = (required_files - current_files)
        if missing_files:
            raise CmdNodeError(("Error running Node, required files not created:\n"
                                "Temporary directory: %r\n"
                                "\tRequired files missing from temporary directory:\n\t - %s") \
                               % (temp, "\n\t - ".join(sorted(map(repr, missing_files)))))

        extra_files = current_files - (required_files | optional_files)
        if extra_files:
            raise CmdNodeError("Error running Node, unexpected files created:\n"
                               "\tTemporary directory: %r\n"
                               "\tUnexpected files found in temporary directory:\n\t - %s" \
                               % (temp, "\n\t - ".join(sorted(map(repr, extra_files)))))

        self._command.commit(temp)

        Node._teardown(self, config, temp)
+
+
# Types that are allowed for the 'description' property (str / unicode, or
# None for the default repr-based description; see Node.__init__ / __str__)
_DESC_TYPES = types.StringTypes + (types.NoneType,)
diff --git a/paleomix/nodegraph.py b/paleomix/nodegraph.py
new file mode 100644
index 0000000..b30cbd0
--- /dev/null
+++ b/paleomix/nodegraph.py
@@ -0,0 +1,456 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import collections
+import errno
+import logging
+import os
+
+import paleomix.common.versions as versions
+
+from paleomix.common.fileutils import \
+ reroot_path, \
+ missing_executables
+from paleomix.common.utilities import \
+ safe_coerce_to_frozenset
+
+
+# Max number of error messages of each type
+_MAX_ERROR_MESSAGES = 10
+
+
class FileStatusCache(object):
    """Cache used to avoid repeatedly checking the state (existence / mtime)
    of files required / generated by nodes. A new cache is generated for
    every operation (e.g. refreshing all states / manually setting the state
    of a node) to avoid relying on the filesystem staying consistent for
    long periods of time.
    """

    def __init__(self):
        # Maps file path -> mtime (float), or None if the file was absent
        # the first time it was checked; entries are never invalidated.
        self._stat_cache = {}

    def files_exist(self, fpaths):
        """Returns true if all paths listed in fpaths exist."""
        return all((self._get_state(fpath) is not None) for fpath in fpaths)

    def missing_files(self, fpaths):
        """Returns a list of paths in fpaths that do not exist."""
        return [fpath for fpath in fpaths if (self._get_state(fpath) is None)]

    def are_files_outdated(self, input_files, output_files):
        """Returns true if any 'input' files have a time-stamp that post-date
        any time-stamp for the 'output' files, indicating that one or more of
        the 'input' files have changed since the creation of the 'output'.

        The function also returns true if any files are missing, as this
        would indicate either the 'output' files or both the 'input' and
        'output' files would need to be rebuilt.

        NOTE(review): assumes both sequences are non-empty when all files
        exist; the caller (NodeGraph.is_outdated) guards against empty
        sequences before calling this function.
        """
        input_timestamps = []
        if not self._get_states(input_files, input_timestamps):
            return True

        output_timestamps = []
        if not self._get_states(output_files, output_timestamps):
            return True

        return max(input_timestamps) > min(output_timestamps)

    def _get_states(self, filenames, dst):
        """Collects the mtimes for a set of filenames, returning true if all
        could be collected, and aborting early and returning false otherwise.
        """
        for filename in filenames:
            timestamp = self._get_state(filename)
            if timestamp is None:
                return False

            dst.append(timestamp)

        return True

    def _get_state(self, fpath):
        """Returns the mtime of a path, or None if the path does not exist."""
        if fpath not in self._stat_cache:
            try:
                mtime = os.path.getmtime(fpath)
            # 'except ... as' is valid on Python 2.6+ as well as Python 3,
            # unlike the 'except OSError, error' form used elsewhere.
            except OSError as error:
                # A missing file is an expected condition; anything else
                # (e.g. permission errors) is re-raised.
                if error.errno != errno.ENOENT:
                    raise
                mtime = None
            self._stat_cache[fpath] = mtime
        return self._stat_cache[fpath]
+
+
class NodeGraphError(RuntimeError):
    """Raised when the graph of nodes is found to be invalid during
    construction; e.g. when multiple nodes clobber the same output file,
    required executables are missing, or version requirements fail."""
    pass
+
+
class NodeGraph:
    """Graph of Node objects, tracking the state of each node and the
    dependencies between nodes. Each node is assigned one of the states
    below; observers may be registered to be notified when states change
    (see 'add_state_observer')."""

    NUMBER_OF_STATES = 6
    DONE, RUNNING, RUNABLE, QUEUED, OUTDATED, ERROR \
        = range(NUMBER_OF_STATES)

    def __init__(self, nodes, cache_factory=FileStatusCache):
        self._cache_factory = cache_factory
        self._state_observers = []
        self._states = {}

        nodes = safe_coerce_to_frozenset(nodes)

        self._logger = logging.getLogger(__name__)
        # Maps each node to the set of nodes that depend on it
        self._reverse_dependencies = collections.defaultdict(set)
        self._collect_reverse_dependencies(nodes, self._reverse_dependencies, set())
        # Memoized results of _calculate_intersections, keyed by node
        self._intersections = {}
        # Top nodes are those which no other node depends upon
        self._top_nodes = [node for (node, rev_deps) in self._reverse_dependencies.iteritems() if not rev_deps]

        self._logger.info(" - Checking file dependencies ...")
        self._check_file_dependencies(self._reverse_dependencies)
        self._logger.info(" - Checking for required executables ...")
        self._check_required_executables(self._reverse_dependencies)
        self._logger.info(" - Checking version requirements ...")
        self._check_version_requirements(self._reverse_dependencies)
        self._logger.info(" - Determining states ...")
        self.refresh_states()
        self._logger.info(" - Ready ...\n")

    def get_node_state(self, node):
        """Returns the current state of 'node'."""
        return self._states[node]

    def set_node_state(self, node, state):
        """Sets the state of 'node' and propagates the resulting state
        changes to all nodes that (transitively) depend on it. Only the
        RUNNING, ERROR, and DONE states may be set explicitly."""
        if state not in (NodeGraph.RUNNING, NodeGraph.ERROR, NodeGraph.DONE):
            raise ValueError("Invalid state: %r" % (state,))
        old_state = self._states[node]
        if state == old_state:
            return

        self._states[node] = state
        self._notify_state_observers(node, old_state, state, True)

        intersections = self._calculate_intersections(node)

        # Not all nodes may need to be updated, but we still need to
        # traverse the "graph" (using the intersection counts) in order
        # to ensure that all nodes that need to be updated are updated.
        requires_update = dict.fromkeys(intersections, False)
        for dependency in self._reverse_dependencies[node]:
            requires_update[dependency] = True

        cache = self._cache_factory()
        while any(requires_update.itervalues()):
            # NOTE: 'node' is deliberately re-bound by the loop below; the
            # original argument is no longer needed at this point.
            for (node, count) in intersections.items():
                # A count of zero means all paths to this node have been
                # processed, so its state may now be (re-)evaluated.
                if not count:
                    has_changed = False
                    if requires_update[node]:
                        old_state = self._states.pop(node)
                        new_state = self._update_node_state(node, cache)
                        if new_state != old_state:
                            self._notify_state_observers(node, old_state,
                                                         new_state, False)
                            has_changed = True

                    for dependency in self._reverse_dependencies[node]:
                        intersections[dependency] -= 1
                        requires_update[dependency] |= has_changed

                    intersections.pop(node)
                    requires_update.pop(node)

    def __iter__(self):
        """Returns a graph of nodes."""
        return iter(self._top_nodes)

    def iterflat(self):
        """Returns a flat iterator over every node in the graph."""
        return iter(self._reverse_dependencies)

    def refresh_states(self):
        """Re-evaluates the state of every node against the filesystem;
        the ERROR and RUNNING states are sticky and are preserved."""
        states = {}
        cache = self._cache_factory()
        for (node, state) in self._states.iteritems():
            if state in (self.ERROR, self.RUNNING):
                states[node] = state
        self._states = states
        for node in self._reverse_dependencies:
            self._update_node_state(node, cache)
        self._refresh_state_observers()

    def add_state_observer(self, observer):
        """Add an observer of changes to the node-graph. The observer
        is expected to have the following functions:

        refresh(nodegraph):
            Called when an observer has been added, or when 'refresh_states'
            has been called on the nodegraph. The observer should rebuild any
            internal state at this point.

        state_changed(node, old_state, new_state, is_primary):
            Called when the state of a node has changed. 'is_primary' is
            True only for the node for which 'set_node_state' was called,
            and False for nodes the state of which changed as a consequence
            of the change to the node marked 'is_primary'. This includes
            ERROR propagating, and more."""
        self._state_observers.append(observer)
        observer.refresh(self)

    def _notify_state_observers(self, node, old_state, new_state, is_primary):
        for observer in self._state_observers:
            observer.state_changed(node, old_state, new_state, is_primary)

    def _refresh_state_observers(self):
        for observer in self._state_observers:
            observer.refresh(self)

    def _calculate_intersections(self, for_node):
        """Returns a dict mapping each (transitive) reverse-dependency of
        'for_node' to the number of paths leading to it; used to schedule
        state updates so that a node is only re-evaluated once all of its
        updated dependencies have been processed. Results are memoized."""
        def count_nodes(node, counts):
            for node in self._reverse_dependencies[node]:
                if node in counts:
                    counts[node] += 1
                else:
                    counts[node] = 1
                count_nodes(node, counts)
            return counts

        if for_node not in self._intersections:
            counts = count_nodes(for_node, {})
            # Direct reverse-dependencies are updated immediately by
            # 'set_node_state', hence one path to each is discounted.
            for dependency in self._reverse_dependencies[for_node]:
                counts[dependency] -= 1
            self._intersections[for_node] = counts

        return dict(self._intersections[for_node])

    def _update_node_state(self, node, cache):
        """Determines (and caches) the state of 'node', recursively updating
        the states of its dependencies first."""
        if node in self._states:
            return self._states[node]

        # Update sub-nodes before checking for fixed states
        dependency_states = set((NodeGraph.DONE,))
        for dependency in node.dependencies:
            dependency_states.add(self._update_node_state(dependency, cache))

        # The 'worst' dependency state bounds the state of this node;
        # the state constants are ordered for this purpose.
        state = max(dependency_states)
        if state == NodeGraph.DONE:
            if not self.is_done(node, cache):
                state = NodeGraph.RUNABLE
            elif not cache.files_exist(node.input_files):
                # Somehow the input files have gone missing, despite the
                # dependant nodes being done; this implies this node is
                # outdated, since the input-files should be re-generated, but
                # obviously it is not possible to run it at this point.
                missing = cache.missing_files(node.input_files)
                self._logger.error("ERROR: Input file(s) missing for node; "
                                   "may have been moved while the pipeline "
                                   "was running. Cannot proceed:\n"
                                   " Node = %s\n Files = %s\n"
                                   % (node, "\n ".join(missing)))
                state = NodeGraph.ERROR
            elif self.is_outdated(node, cache):
                state = NodeGraph.RUNABLE
        elif state in (NodeGraph.RUNNING, NodeGraph.RUNABLE, NodeGraph.QUEUED):
            if self.is_done(node, cache):
                state = NodeGraph.OUTDATED
            else:
                state = NodeGraph.QUEUED
        self._states[node] = state

        return state

    @classmethod
    def is_done(cls, node, cache):
        """Returns true if the node itself is done; this only implies that the
        output files generated by this node exists. The files themselves may
        be outdated.
        """
        return cache.files_exist(node.output_files)

    @classmethod
    def is_outdated(cls, node, cache):
        """Returns true if the node is not done or if one or more of the input
        files appear to have been changed since the creation of the output
        files (based on the timestamps). A node that lacks either input or
        output files is never considered outdated.
        """
        if not (node.input_files and node.output_files):
            return False

        return cache.are_files_outdated(node.input_files, node.output_files)

    @classmethod
    def _check_required_executables(cls, nodes):
        """Raises NodeGraphError if any executable required by a node is
        missing from the current PATH."""
        exec_filenames = set()
        for node in nodes:
            exec_filenames.update(node.executables)

            # Requirements may include executables not invoked directly
            for requirement in node.requirements:
                if isinstance(requirement, versions.RequirementObj):
                    executable = requirement.executable
                    if executable is not None:
                        exec_filenames.add(executable)

        missing_exec = missing_executables(exec_filenames)
        if missing_exec:
            raise NodeGraphError("Required executables are missing:\n\t%s"
                                 % ("\n\t".join(sorted(missing_exec))))

    def _check_version_requirements(self, nodes):
        """Invokes every version requirement declared by the nodes, raising
        NodeGraphError if a check fails or cannot be performed."""
        exec_requirements = set()
        for node in nodes:
            exec_requirements.update(node.requirements)

        def _key_func(reqobj):
            # Sort priority in decreasing order, name in increasing order
            return (-reqobj.priority, reqobj.name)
        exec_requirements = list(sorted(exec_requirements, key=_key_func))

        try:
            for requirement in exec_requirements:
                self._logger.info(" - Checking version of %r ..."
                                  % (requirement.name,))

                requirement()
        except versions.VersionRequirementError, error:
            raise NodeGraphError(error)
        except OSError, error:
            raise NodeGraphError("Could not check version for %s:\n\t%s"
                                 % (requirement.name, error))

    @classmethod
    def _check_file_dependencies(cls, nodes):
        """Validates the input/output files of the nodes, raising
        NodeGraphError if output files are clobbered, or if input files are
        missing and not created by any node (see helpers below)."""
        files = ("input_files", "output_files")
        files = dict((key, collections.defaultdict(set)) for key in files)
        # Auxiliary files are treated as input files
        files["auxiliary_files"] = files["input_files"]

        for node in nodes:
            for (attr, nodes_by_file) in files.iteritems():
                for filename in getattr(node, attr):
                    nodes_by_file[filename].add(node)

        # Zipping against a fixed-length range caps the number of messages
        # collected from each check at _MAX_ERROR_MESSAGES.
        max_messages = range(_MAX_ERROR_MESSAGES)
        error_messages = []
        error_messages.extend(zip(max_messages, cls._check_output_files(files["output_files"])))
        error_messages.extend(zip(max_messages, cls._check_input_dependencies(files["input_files"],
                                                                              files["output_files"], nodes)))

        if error_messages:
            messages = []
            for (_, error) in error_messages:
                for line in error.split("\n"):
                    messages.append("\t" + line)

            raise NodeGraphError("Errors detected during graph construction (max %i shown):\n%s" \
                                 % (_MAX_ERROR_MESSAGES * 2, "\n".join(messages)),)

    @classmethod
    def _check_output_files(cls, output_files):
        """Checks dict of output files to nodes for cases where
        multiple nodes create the same output file.

        The directory component of paths are realized in order to
        detect cases where nodes create the same file, but via
        different paths (e.g. due to relative/absolute paths, or
        due to use of symbolic links). Since output files are
        replaced, not modified in place, it is not necessary to
        compare files themselves."""
        dirpath_cache, real_output_files = {}, {}
        for (filename, nodes) in output_files.iteritems():
            dirpath = os.path.dirname(filename)
            if dirpath not in dirpath_cache:
                dirpath_cache[dirpath] = os.path.realpath(dirpath)

            real_output_file = reroot_path(dirpath_cache[dirpath], filename)
            real_output_files.setdefault(real_output_file, []).extend(nodes)

        for (filename, nodes) in real_output_files.iteritems():
            if (len(nodes) > 1):
                nodes = _summarize_nodes(nodes)
                yield "Multiple nodes create the same (clobber) output-file:" \
                    "\n\tFilename: %s\n\tNodes: %s" \
                    % (filename, "\n\t ".join(nodes))

    @classmethod
    def _check_input_dependencies(cls, input_files, output_files, nodes):
        """Yields an error message for every input file that is either
        created by a node the consumer does not depend on, or that does not
        exist and is not created by any node."""
        dependencies = cls._collect_dependencies(nodes, {})

        for (filename, nodes) in sorted(input_files.items(), key=lambda v: v[0]):
            if (filename in output_files):
                # The file is built by the pipeline; every consumer must
                # depend (transitively) on one of the producing nodes.
                producers = output_files[filename]
                bad_nodes = set()
                for consumer in nodes:
                    if not (producers & dependencies[consumer]):
                        bad_nodes.add(consumer)

                if bad_nodes:
                    producer = iter(producers).next()
                    bad_nodes = _summarize_nodes(bad_nodes)
                    yield "Node depends on dynamically created file, but not on the node creating it:" + \
                        "\n\tFilename: %s\n\tCreated by: %s\n\tDependent node(s): %s" \
                        % (filename, producer, "\n\t ".join(bad_nodes))
            elif not os.path.exists(filename):
                nodes = _summarize_nodes(nodes)
                yield "Required file does not exist, and is not created by a node:" + \
                    "\n\tFilename: %s\n\tDependent node(s): %s" \
                    % (filename, "\n\t ".join(nodes))

    @classmethod
    def _collect_dependencies(cls, nodes, dependencies):
        """Fills (and returns) 'dependencies', mapping each node to the
        frozenset of nodes it transitively depends on."""
        for node in nodes:
            if node not in dependencies:
                subnodes = node.dependencies
                if not subnodes:
                    dependencies[node] = frozenset()
                    continue

                cls._collect_dependencies(subnodes, dependencies)

                collected = set(subnodes)
                for subnode in subnodes:
                    collected.update(dependencies[subnode])
                dependencies[node] = frozenset(collected)

        return dependencies

    @classmethod
    def _collect_reverse_dependencies(cls, lst, rev_dependencies, processed):
        """Fills 'rev_dependencies', mapping each node to the set of nodes
        that depend on it; 'processed' guards against re-visiting nodes."""
        for node in lst:
            if node not in processed:
                processed.add(node)

                # Initialize default-dict
                rev_dependencies[node]  # pylint: disable=W0104

                subnodes = node.dependencies
                for dependency in subnodes:
                    rev_dependencies[dependency].add(node)
                cls._collect_reverse_dependencies(subnodes, rev_dependencies, processed)
+
+
+
+def _summarize_nodes(nodes):
+ nodes = list(sorted(set(map(str, nodes))))
+ if len(nodes) > 4:
+ nodes = nodes[:5] + ["and %i more nodes ..." % len(nodes)]
+ return nodes
diff --git a/paleomix/nodes/__init__.py b/paleomix/nodes/__init__.py
new file mode 100644
index 0000000..cd42802
--- /dev/null
+++ b/paleomix/nodes/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/paleomix/nodes/adapterremoval.py b/paleomix/nodes/adapterremoval.py
new file mode 100644
index 0000000..18acabb
--- /dev/null
+++ b/paleomix/nodes/adapterremoval.py
@@ -0,0 +1,196 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.node import \
+ CommandNode
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+
+import paleomix.common.fileutils as fileutils
+import paleomix.common.versions as versions
+import paleomix.tools.factory as factory
+
+
# AdapterRemoval v2.1.5 or later is required; the version is parsed from
# the output of 'AdapterRemoval --version'.
_VERSION_CHECK = versions.Requirement(call=("AdapterRemoval", "--version"),
                                      search=r"ver. (\d+)\.(\d+)\.(\d+)",
                                      checks=versions.GE(2, 1, 5))
+
+
class SE_AdapterRemovalNode(CommandNode):
    """Node running AdapterRemoval on single-ended reads; multiple input
    files are merged on the fly via a named pipe fed by 'paleomix cat'."""

    @create_customizable_cli_parameters
    def customize(cls, input_files, output_prefix, output_format="bz2",
                  threads=1, dependencies=()):
        # See below for parameters in common between SE/PE
        cmd = _get_common_parameters(output_format, threads=threads)

        # Prefix for output files, ensure that all end up in temp folder
        cmd.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

        output_tmpl = output_prefix + ".%s." + output_format
        cmd.set_kwargs(TEMP_OUT_BASENAME=os.path.basename(output_prefix),

                       OUT_SETTINGS=output_prefix + ".settings",
                       OUT_MATE_1=output_tmpl % ("truncated",),
                       OUT_DISCARDED=output_tmpl % ("discarded",))

        if len(input_files) > 1:
            # Uncompressed reads (piped from 'paleomix cat')
            cmd.set_option("--file1", "%(TEMP_IN_READS_1)s")
            cmd.set_kwargs(TEMP_IN_READS_1="uncompressed_input")
        else:
            cmd.set_option("--file1", "%(IN_READS_1)s")
            cmd.set_kwargs(IN_READS_1=input_files[0])

        return {"command": cmd,
                "threads": threads,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        command = parameters.command.finalize()

        # With multiple input files, a 'cat' command is run in parallel,
        # feeding AdapterRemoval through a FIFO (created in _setup)
        self._multi_file_input = len(parameters.input_files) > 1
        if self._multi_file_input:
            cat = _build_cat_command(parameters.input_files, "uncompressed_input")
            command = ParallelCmds((command, cat))

        CommandNode.__init__(self,
                             command=command,
                             threads=parameters.threads,
                             description="<AdapterRM (SE): %s -> '%s.*'>"
                             % (fileutils.describe_files(parameters.input_files),
                                parameters.output_prefix),
                             dependencies=parameters.dependencies)

    def _setup(self, config, temp):
        if self._multi_file_input:
            # NOTE(review): the outer os.path.join is redundant (single
            # argument); the inner call already yields the full path.
            os.mkfifo(os.path.join(os.path.join(temp, "uncompressed_input")))

        CommandNode._setup(self, config, temp)
+
+
class PE_AdapterRemovalNode(CommandNode):
    """Node running AdapterRemoval on paired-ended reads; multiple input
    files per mate are merged on the fly via named pipes fed by
    'paleomix cat'. Overlapping mates may optionally be collapsed."""

    @create_customizable_cli_parameters
    def customize(cls, input_files_1, input_files_2, output_prefix,
                  output_format="bz2", collapse=True, threads=1,
                  dependencies=()):
        if len(input_files_1) != len(input_files_2):
            raise ValueError("Unequal number of mate 1 and mate 2 files")

        cmd = _get_common_parameters(output_format, threads=threads)

        # Prefix for output files, to ensure that all end up in temp folder
        cmd.set_option("--basename", "%(TEMP_OUT_BASENAME)s")

        output_tmpl = output_prefix + ".%s." + output_format
        cmd.set_kwargs(TEMP_OUT_BASENAME=os.path.basename(output_prefix),

                       OUT_SETTINGS=output_prefix + ".settings",
                       OUT_READS_1=output_tmpl % ("pair1.truncated",),
                       OUT_READS_2=output_tmpl % ("pair2.truncated",),
                       OUT_SINGLETON=output_tmpl % ("singleton.truncated",),
                       OUT_DISCARDED=output_tmpl % ("discarded",))

        if collapse:
            cmd.set_option("--collapse")

            cmd.set_kwargs(OUT_COLLAPSED=output_tmpl % ("collapsed",),
                           OUT_COLLAPSED_TRUNC=output_tmpl
                           % ("collapsed.truncated",))

        if len(input_files_1) > 1:
            # Uncompressed reads (piped from 'paleomix cat')
            cmd.set_option("--file1", "%(TEMP_IN_READS_1)s")
            cmd.set_option("--file2", "%(TEMP_IN_READS_2)s")
            cmd.set_kwargs(TEMP_IN_READS_1="uncompressed_input_1",
                           TEMP_IN_READS_2="uncompressed_input_2")
        else:
            cmd.set_option("--file1", "%(IN_READS_1)s")
            cmd.set_option("--file2", "%(IN_READS_2)s")
            cmd.set_kwargs(IN_READS_1=input_files_1[0],
                           IN_READS_2=input_files_2[0])

        return {"command": cmd,
                "threads": threads,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        command = parameters.command.finalize()
        # With multiple input files per mate, two 'cat' commands are run in
        # parallel, feeding AdapterRemoval through FIFOs (see _setup)
        self._multi_file_input = len(parameters.input_files_1) > 1
        if self._multi_file_input:
            cat_1 = _build_cat_command(parameters.input_files_1, "uncompressed_input_1")
            cat_2 = _build_cat_command(parameters.input_files_2, "uncompressed_input_2")
            command = ParallelCmds((command, cat_1, cat_2))

        CommandNode.__init__(self,
                             command=command,
                             threads=parameters.threads,
                             description="<AdapterRM (PE): %s -> '%s.*'>"
                             % (fileutils.describe_paired_files(parameters.input_files_1,
                                                                parameters.input_files_2),
                                parameters.output_prefix),
                             dependencies=parameters.dependencies)

    def _setup(self, config, temp):
        if self._multi_file_input:
            # NOTE(review): the outer os.path.join calls are redundant
            # (single argument); the inner calls yield the full paths.
            os.mkfifo(os.path.join(os.path.join(temp, "uncompressed_input_1")))
            os.mkfifo(os.path.join(os.path.join(temp, "uncompressed_input_2")))

        CommandNode._setup(self, config, temp)
+
+
def _build_cat_command(input_files, output_file):
    """Returns a finalized 'paleomix cat' command that concatenates
    'input_files' and writes the result to 'output_file' (a temporary
    file / named pipe in the node's temporary directory)."""
    cat = factory.new("cat")
    cat.set_option("--output", "%(TEMP_OUT_CAT)s")
    cat.set_kwargs(TEMP_OUT_CAT=output_file)
    cat.add_multiple_values(input_files)

    return cat.finalize()
+
+
def _get_common_parameters(output_format, threads=1):
    """Returns an AtomicCmdBuilder with the AdapterRemoval options shared
    between the SE and PE nodes: output compression, trimming of Ns and of
    low-quality bases, and a fixed thread count.

    Raises ValueError if 'output_format' is neither "bz2" nor "gz".
    """
    builder = AtomicCmdBuilder("AdapterRemoval",
                               CHECK_VERSION=_VERSION_CHECK)

    compression_flags = {"bz2": "--bzip2", "gz": "--gzip"}
    flag = compression_flags.get(output_format)
    if flag is None:
        raise ValueError("Invalid output compression %r" % (output_format,))
    builder.set_option(flag)

    # Trim Ns at read ends; fixed=False presumably allows the user to
    # override these defaults via the makefile -- TODO confirm
    builder.set_option("--trimns", fixed=False)
    # Trim low quality scores
    builder.set_option("--trimqualities", fixed=False)

    # Fix number of threads to ensure consistency when scheduling node
    builder.set_option("--threads", threads)

    return builder
diff --git a/paleomix/nodes/bedtools.py b/paleomix/nodes/bedtools.py
new file mode 100644
index 0000000..04b852a
--- /dev/null
+++ b/paleomix/nodes/bedtools.py
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import paleomix.common.versions as versions
+
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.node import \
+ CommandNode, \
+ Node, \
+ NodeError
+
+from paleomix.common.bedtools import \
+ read_bed_file
+from paleomix.common.fileutils import \
+ move_file, \
+ reroot_path
+
+
# bedtools v2.15.0 or later is required; the version is parsed from the
# output of 'bedtools --version' (with or without a leading 'v').
BEDTOOLS_VERSION \
    = versions.Requirement(call=("bedtools", "--version"),
                           search=r"bedtools v?(\d+)\.(\d+)\.(\d+)",
                           checks=versions.GE(2, 15, 0))
+
+
class SlopBedNode(CommandNode):
    """Node wrapping 'bedtools slop', which extends BED records by a fixed
    number of bases (ints) or by a fraction of the feature length (floats),
    writing the resulting records to 'outfile'."""

    def __init__(self, infile, outfile, genome, from_start=0, from_end=0,
                 strand_relative=False, dependencies=()):
        # Mixing int and float offsets is ambiguous, since floats select
        # percentage mode ('-pct') for both values.
        if type(from_start) != type(from_end):
            raise ValueError("Parameters 'from_start' and 'from_end' should "
                             "be of same type!")

        command_line = [
            "bedtools", "slop",
            "-i", "%(IN_FILE)s",
            "-g", "%(IN_GENOME)s",
            "-l", str(from_start),
            "-r", str(from_end),
        ]

        if strand_relative:
            # Extend relative to the strand of each record
            command_line.append("-s")
        if type(from_start) is float:
            # Floats are interpreted as fractions of the feature length
            command_line.append("-pct")

        CommandNode.__init__(
            self,
            description="<SlopBed: '%s' -> '%s'>" % (infile, outfile),
            command=AtomicCmd(command_line,
                              IN_FILE=infile,
                              IN_GENOME=genome,
                              OUT_STDOUT=outfile,
                              CHECK_VERSION=BEDTOOLS_VERSION),
            dependencies=dependencies)
+
+
class PaddedBedNode(Node):
    """Simple node for padding BED records a fixed amount and merging
    overlapping records. Columns beyond the 3rd column are dropped.

    NOTE(review): merging of overlapping records is not performed in this
    class itself; presumably it is handled by 'read_bed_file' or by a
    downstream step -- confirm.
    """

    def __init__(self, infile, outfile, fai_file, amount=0, dependencies=()):
        # Number of bases to extend each record in both directions
        self._amount = int(amount)
        self._infile = infile
        self._outfile = outfile
        # FASTA index file ('.fai'); provides the contig lengths used to
        # clamp padded records to the reference boundaries
        self._fai_file = fai_file

        Node.__init__(self,
                      description='<PaddedBed (%i): %r -> %r>'
                      % (amount, infile, outfile),
                      input_files=(infile, fai_file),
                      output_files=(outfile,),
                      dependencies=dependencies)

    def _run(self, config, temp):
        # Collect contig lengths from the .fai file (name <tab> length ...)
        contigs = {}
        with open(self._fai_file) as handle:
            for line in handle:
                name, length, _ = line.split('\t', 2)
                if name in contigs:
                    raise NodeError('Reference genome contains multiple '
                                    'identically named contigs (%r)!'
                                    % (name,))

                contigs[name] = int(length)

        # Write padded records to the temporary directory; committed to the
        # final location in _teardown
        with open(reroot_path(temp, self._outfile), 'w') as handle:
            for record in read_bed_file(self._infile, contigs=contigs):
                # Clamp the padded record to [0, contig length]
                max_length = contigs[record.contig]
                record.start = max(0, record.start - self._amount)
                record.end = min(record.end + self._amount, max_length)

                handle.write('%s\n' % (record,))

    def _teardown(self, config, temp):
        source = reroot_path(temp, self._outfile)
        move_file(source, self._outfile)
diff --git a/paleomix/nodes/bowtie2.py b/paleomix/nodes/bowtie2.py
new file mode 100644
index 0000000..0c63414
--- /dev/null
+++ b/paleomix/nodes/bowtie2.py
@@ -0,0 +1,147 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.node import \
+ CommandNode, \
+ NodeError
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+from paleomix.nodes.bwa import \
+ _get_node_description, \
+ _process_output, \
+ _get_max_threads
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+
+import paleomix.common.versions as versions
+
+
# Bowtie2 v2.1.0 or later is required; the version is parsed from the
# output of 'bowtie2 --version'.
BOWTIE2_VERSION = versions.Requirement(call=("bowtie2", "--version"),
                                       search=r"version (\d+)\.(\d+)\.(\d+)",
                                       checks=versions.GE(2, 1, 0))
+
+
class Bowtie2IndexNode(CommandNode):
    """Node for running 'bowtie2-build', generating the six '.bt2' index
    files for a FASTA file at the given (or default) prefix."""

    @create_customizable_cli_parameters
    def customize(cls, input_file, prefix=None, dependencies=()):
        # By default, the index is built next to the input file
        prefix = prefix if prefix else input_file
        # NOTE(review): ("bowtie2-build") is a plain string, not a 1-tuple
        # (missing comma); AtomicCmdBuilder presumably accepts either form,
        # as the sibling Bowtie2Node passes a real tuple -- confirm.
        params = _bowtie2_template(("bowtie2-build"), prefix, iotype="OUT",
                                   IN_FILE=input_file,
                                   TEMP_OUT_PREFIX=os.path.basename(prefix),
                                   CHECK_VERSION=BOWTIE2_VERSION)

        params.add_value("%(IN_FILE)s")
        # Destination prefix, in temp folder
        params.add_value("%(TEMP_OUT_PREFIX)s")

        return {"prefix": prefix,
                "command": params,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        command = parameters.command.finalize()
        description = "<Bowtie2 Index '%s' -> '%s.*'>" \
            % (parameters.input_file, parameters.prefix)

        CommandNode.__init__(self,
                             command=command,
                             description=description,
                             dependencies=parameters.dependencies)
+
+
class Bowtie2Node(CommandNode):
    """Node for mapping SE or PE reads against a Bowtie2 index; the SAM
    output is piped into post-processing commands (see _process_output in
    paleomix.nodes.bwa)."""

    @create_customizable_cli_parameters
    def customize(cls, input_file_1, input_file_2, output_file, reference,
                  prefix, threads=2, log_file=None, dependencies=()):

        # Setting IN_FILE_2 to None makes AtomicCmd ignore this key
        aln = _bowtie2_template(("bowtie2",), prefix,
                                OUT_STDOUT=AtomicCmd.PIPE,
                                CHECK_VERSION=BOWTIE2_VERSION)

        aln.set_option("-x", prefix)

        if log_file is not None:
            aln.set_kwargs(OUT_STDERR=log_file)

        # SE reads use '-U'; PE reads use '-1' / '-2'
        if input_file_1 and not input_file_2:
            aln.add_multiple_options("-U", safe_coerce_to_tuple(input_file_1),
                                     template="IN_FILE_1_%02i")
        elif input_file_1 and input_file_2:
            aln.add_multiple_options("-1", safe_coerce_to_tuple(input_file_1),
                                     template="IN_FILE_1_%02i")
            aln.add_multiple_options("-2", safe_coerce_to_tuple(input_file_2),
                                     template="IN_FILE_2_%02i")
        else:
            raise NodeError("Input 1, OR both input 1 and input 2 must "
                            "be specified for Bowtie2 node")

        max_threads = _get_max_threads(reference, threads)
        aln.set_option("--threads", max_threads)

        # Fixmate is only meaningful for paired-end alignments
        run_fixmate = input_file_1 and input_file_2
        order, commands = _process_output(aln, output_file, reference,
                                          run_fixmate=run_fixmate)
        commands["aln"] = aln

        return {"commands": commands,
                "order": ["aln"] + order,
                "threads": max_threads,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # The aligner and post-processing commands run as a single pipeline
        command = ParallelCmds([parameters.commands[key].finalize()
                                for key in parameters.order])

        algorithm = "PE" if parameters.input_file_2 else "SE"
        description \
            = _get_node_description(name="Bowtie2",
                                    algorithm=algorithm,
                                    input_files_1=parameters.input_file_1,
                                    input_files_2=parameters.input_file_2,
                                    prefix=parameters.prefix,
                                    threads=parameters.threads)

        CommandNode.__init__(self,
                             command=command,
                             description=description,
                             threads=parameters.threads,
                             dependencies=parameters.dependencies)
+
+
def _bowtie2_template(call, prefix, iotype="IN", **kwargs):
    """Returns an AtomicCmdBuilder for 'call', with one keyword registered
    per Bowtie2 index file ('<prefix>.1.bt2', ..., '<prefix>.rev.2.bt2');
    'iotype' selects whether these are treated as input ("IN") or output
    ("OUT") files."""
    index_suffixes = ("1.bt2", "2.bt2", "3.bt2", "4.bt2",
                      "rev.1.bt2", "rev.2.bt2")
    for suffix in index_suffixes:
        kwargs["%s_PREFIX_%s" % (iotype, suffix.upper())] \
            = "%s.%s" % (prefix, suffix)

    return AtomicCmdBuilder(call, **kwargs)
diff --git a/paleomix/nodes/bwa.py b/paleomix/nodes/bwa.py
new file mode 100644
index 0000000..4b66d7c
--- /dev/null
+++ b/paleomix/nodes/bwa.py
@@ -0,0 +1,430 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.node import CommandNode, NodeError
+from paleomix.atomiccmd.command import AtomicCmd
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+
+from paleomix.atomiccmd.sets import ParallelCmds
+from paleomix.nodes.samtools import SAMTOOLS_VERSION
+from paleomix.common.fileutils import \
+ describe_paired_files, \
+ missing_files
+
+import paleomix.common.versions as versions
+import paleomix.tools.factory as factory
+
+
+BWA_VERSION = versions.Requirement(call=("bwa",),
+ search=r"Version: (\d+)\.(\d+)\.(\d+)",
+ checks=versions.Or(versions.EQ(0, 5, 9),
+ versions.EQ(0, 5, 10),
+ versions.EQ(0, 6, 2),
+ versions.GE(0, 7, 9)))
+
+BWA_VERSION_07x = versions.Requirement(call=("bwa",),
+ search=r"Version: (\d+)\.(\d+)\.(\d+)",
+ checks=versions.GE(0, 7, 9))
+
+
+class BWAIndexNode(CommandNode):
+ """Node running 'bwa index' to build a BWA index for a FASTA file.
+
+ The index prefix defaults to the input filename; index files are written
+ to the node's temporary folder and moved into place on completion."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, input_file, prefix=None, dependencies=()):
+ prefix = prefix if prefix else input_file
+ params = _get_bwa_template(("bwa", "index"), prefix, iotype="OUT",
+ IN_FILE=input_file,
+ TEMP_OUT_PREFIX=os.path.basename(prefix),
+ CHECK_BWA=BWA_VERSION)
+
+ # Input fasta sequence
+ params.add_value("%(IN_FILE)s")
+ # Destination prefix, in temp folder
+ params.set_option("-p", "%(TEMP_OUT_PREFIX)s")
+
+ return {"prefix": prefix,
+ "command": params,
+ "dependencies": dependencies}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = parameters.command.finalize()
+ description = "<BWA Index '%s' -> '%s.*'>" % (parameters.input_file,
+ parameters.prefix)
+ CommandNode.__init__(self,
+ command=command,
+ description=description,
+ dependencies=parameters.dependencies)
+
+
+class BWABacktrack(CommandNode):
+ """Node running 'bwa aln' (the backtrack algorithm), producing a .sai
+ file for later use by BWASamse/BWASampe. Input reads are decompressed
+ through a named pipe created in _setup."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, input_file, output_file, reference, prefix, threads=1,
+ dependencies=()):
+ _check_bwa_prefix(reference)
+ # Single-threaded for small references; see _get_max_threads.
+ threads = _get_max_threads(reference, threads)
+
+ # 'aln_in' feeds (possibly compressed) reads into the FIFO
+ # "uncompressed_input"; 'aln' reads from that FIFO.
+ aln_in = _build_cat_command(input_file, "uncompressed_input")
+ aln = _get_bwa_template(("bwa", "aln"), prefix,
+ TEMP_IN_FILE="uncompressed_input",
+ OUT_STDOUT=output_file,
+ CHECK_BWA=BWA_VERSION)
+ aln.add_value(prefix)
+ aln.add_value("%(TEMP_IN_FILE)s")
+ aln.set_option("-t", threads)
+
+ return {"commands": {"aln_in": aln_in, "aln": aln},
+ "order": ["aln_in", "aln"],
+ "threads": threads,
+ "dependencies": dependencies}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = ParallelCmds([parameters.commands[key].finalize()
+ for key in parameters.order])
+
+ description \
+ = _get_node_description(name="BWA",
+ algorithm='Backtrack',
+ input_files_1=parameters.input_file,
+ prefix=parameters.prefix,
+ threads=parameters.threads)
+
+ CommandNode.__init__(self,
+ command=command,
+ description=description,
+ threads=parameters.threads,
+ dependencies=parameters.dependencies)
+
+ def _setup(self, _config, temp):
+ # Named pipe connecting the 'cat' decompressor to 'bwa aln'.
+ os.mkfifo(os.path.join(temp, "uncompressed_input"))
+
+
+class BWASamse(CommandNode):
+ """Node running 'bwa samse', combining single-end reads and a .sai file
+ (from BWABacktrack) into a cleaned-up BAM via _process_output."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, input_file_fq, input_file_sai, output_file,
+ reference, prefix, dependencies=()):
+ _check_bwa_prefix(reference)
+
+ # FASTQ input is decompressed through the FIFO made in _setup.
+ samse_in = _build_cat_command(input_file_fq, "uncompressed_input")
+ samse = _get_bwa_template(("bwa", "samse"), prefix,
+ IN_FILE_SAI=input_file_sai,
+ TEMP_IN_FQ="uncompressed_input",
+ OUT_STDOUT=AtomicCmd.PIPE,
+ CHECK_BWA=BWA_VERSION)
+ samse.add_value(prefix)
+ samse.add_value("%(IN_FILE_SAI)s")
+ samse.add_value("%(TEMP_IN_FQ)s")
+
+ # Pipe SAM output through the 'cleanup' command (see _process_output).
+ order, commands = _process_output(samse, output_file, reference)
+ commands["sam_in"] = samse_in
+ commands["sam"] = samse
+
+ return {"commands": commands,
+ "order": ["sam_in", "sam"] + order,
+ "dependencies": dependencies}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = ParallelCmds([parameters.commands[key].finalize()
+ for key in parameters.order])
+
+ input_file = parameters.input_file_fq
+ description = _get_node_description(name="BWA Samse",
+ input_files_1=input_file,
+ prefix=parameters.prefix)
+
+ CommandNode.__init__(self,
+ command=command,
+ description=description,
+ dependencies=parameters.dependencies)
+
+ def _setup(self, _config, temp):
+ # Named pipe connecting the 'cat' decompressor to 'bwa samse'.
+ os.mkfifo(os.path.join(temp, "uncompressed_input"))
+
+
+class BWASampe(CommandNode):
+ """Node running 'bwa sampe', combining paired-end reads and their two
+ .sai files (from BWABacktrack) into a cleaned-up BAM; mate information
+ is fixed via _process_output(run_fixmate=True)."""
+
+ @create_customizable_cli_parameters
+ def customize(cls,
+ input_file_fq_1, input_file_fq_2,
+ input_file_sai_1, input_file_sai_2,
+ output_file, reference, prefix, dependencies=()):
+ _check_bwa_prefix(reference)
+
+ # Each mate file is decompressed through its own FIFO (see _setup).
+ sampe_in_1 = _build_cat_command(input_file_fq_1,
+ "uncompressed_input_1")
+ sampe_in_2 = _build_cat_command(input_file_fq_2,
+ "uncompressed_input_2")
+
+ sampe = _get_bwa_template(("bwa", "sampe"), prefix,
+ IN_FILE_SAI_1=input_file_sai_1,
+ IN_FILE_SAI_2=input_file_sai_2,
+ TEMP_IN_FQ_1="uncompressed_input_1",
+ TEMP_IN_FQ_2="uncompressed_input_2",
+ OUT_STDOUT=AtomicCmd.PIPE,
+ CHECK_BWA=BWA_VERSION)
+ sampe.add_value(prefix)
+ sampe.add_value("%(IN_FILE_SAI_1)s")
+ sampe.add_value("%(IN_FILE_SAI_2)s")
+ sampe.add_value("%(TEMP_IN_FQ_1)s")
+ sampe.add_value("%(TEMP_IN_FQ_2)s")
+
+ order, commands = _process_output(sampe, output_file, reference,
+ run_fixmate=True)
+ commands["sam_in_1"] = sampe_in_1
+ commands["sam_in_2"] = sampe_in_2
+ commands["sam"] = sampe
+
+ return {"commands": commands,
+ "order": ["sam_in_1", "sam_in_2", "sam"] + order,
+ "dependencies": dependencies}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = ParallelCmds([parameters.commands[key].finalize()
+ for key in parameters.order])
+
+ input_file_1 = parameters.input_file_fq_1
+ input_file_2 = parameters.input_file_fq_2
+ description = _get_node_description(name="BWA Sampe",
+ input_files_1=input_file_1,
+ input_files_2=input_file_2,
+ prefix=parameters.prefix)
+
+ CommandNode.__init__(self,
+ command=command,
+ description=description,
+ dependencies=parameters.dependencies)
+
+ def _setup(self, _config, temp):
+ # One named pipe per mate file, feeding 'bwa sampe'.
+ os.mkfifo(os.path.join(temp, "uncompressed_input_1"))
+ os.mkfifo(os.path.join(temp, "uncompressed_input_2"))
+
+
+class BWAAlgorithmNode(CommandNode):
+ """Node running 'bwa mem' or 'bwa bwasw' (BWA >= 0.7.9 only), for
+ single- or paired-end reads, piping the output through the 'cleanup'
+ command to produce the final BAM."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, input_file_1, output_file, reference, prefix,
+ input_file_2=None, threads=1, algorithm="mem",
+ dependencies=()):
+ if algorithm not in ("mem", "bwasw"):
+ raise NotImplementedError("BWA algorithm %r not implemented"
+ % (algorithm,))
+
+ threads = _get_max_threads(reference, threads)
+
+ # Reads are decompressed through FIFOs created in _setup.
+ zcat_1 = _build_cat_command(input_file_1, "uncompressed_input_1")
+ aln = _get_bwa_template(("bwa", algorithm), prefix,
+ TEMP_IN_FILE_1="uncompressed_input_1",
+ OUT_STDOUT=AtomicCmd.PIPE,
+ CHECK_BWA=BWA_VERSION_07x)
+ aln.add_value(prefix)
+ aln.add_value("%(TEMP_IN_FILE_1)s")
+
+ _, commands = _process_output(aln, output_file, reference)
+ commands["aln"] = aln
+ commands["zcat_1"] = zcat_1
+ if input_file_2:
+ aln.add_value("%(TEMP_IN_FILE_2)s")
+ aln.set_kwargs(**{"TEMP_IN_FILE_2": "uncompressed_input_2"})
+ zcat_2 = _build_cat_command(input_file_2, "uncompressed_input_2")
+ commands["zcat_2"] = zcat_2
+ else:
+ # Ensure that the pipe is automatically removed
+ aln.set_kwargs(**{"TEMP_OUT_FILE_2": "uncompressed_input_2"})
+
+ aln.set_option("-t", threads)
+ # Mark alternative hits as secondary; required by e.g. Picard
+ aln.set_option("-M")
+
+ # NOTE(review): "aln" was already inserted above; this re-assignment
+ # is redundant but harmless.
+ commands["aln"] = aln
+ return {"commands": commands,
+ "threads": threads,
+ "dependencies": dependencies}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ _check_bwa_prefix(parameters.prefix)
+ algorithm = parameters.algorithm.upper()
+ algorithm += "_PE" if parameters.input_file_2 else "_SE"
+ desc = _get_node_description(name="BWA",
+ algorithm=algorithm,
+ input_files_1=parameters.input_file_1,
+ input_files_2=parameters.input_file_2,
+ prefix=parameters.prefix)
+
+ # No explicit ordering here (unlike other nodes); all commands are
+ # simply finalized and run in parallel.
+ command = ParallelCmds([cmd.finalize()
+ for cmd in parameters.commands.itervalues()])
+ CommandNode.__init__(self,
+ command=command,
+ description=desc,
+ threads=parameters.threads,
+ dependencies=parameters.dependencies)
+
+ def _setup(self, _config, temp):
+ # Both FIFOs are always created; in single-end mode the second is
+ # unused and cleaned up via the TEMP_OUT_FILE_2 kwarg above.
+ os.mkfifo(os.path.join(temp, "uncompressed_input_1"))
+ os.mkfifo(os.path.join(temp, "uncompressed_input_2"))
+
+
+def _process_output(stdin, output_file, reference, run_fixmate=False):
+ """Build a 'paleomix cleanup' command reading SAM from 'stdin' and
+ writing the final BAM to 'output_file'; returns (order, commands) in the
+ shape used by the node 'customize' functions."""
+ convert = factory.new("cleanup")
+ if reference is not None:
+ convert.set_option("--fasta", "%(IN_FASTA_REF)s")
+ convert.set_option("--temp-prefix", "%(TEMP_OUT_PREFIX)s")
+ # NOTE(review): IN_FASTA_REF is registered even when 'reference' is
+ # None (and --fasta is omitted) — presumably harmless; confirm.
+ convert.set_kwargs(IN_STDIN=stdin,
+ IN_FASTA_REF=reference,
+ OUT_STDOUT=output_file,
+ TEMP_OUT_PREFIX="bam_cleanup",
+ CHECK_SAMTOOLS=SAMTOOLS_VERSION)
+
+ if run_fixmate:
+ convert.set_option('--paired-end')
+
+ # Tell 'cleanup' which samtools generation is installed; if the version
+ # check itself failed, the error is reported elsewhere.
+ try:
+ if SAMTOOLS_VERSION.version >= (1,):
+ convert.set_option('--samtools1x', 'yes')
+ else:
+ convert.set_option('--samtools1x', 'no')
+ except versions.VersionRequirementError:
+ pass
+
+ return ["convert"], {"convert": convert}
+
+
+def _get_bwa_template(call, prefix, iotype="IN", **kwargs):
+ """Return an AtomicCmdBuilder for 'call', registering the BWA index
+ files for 'prefix' as IN_ (or OUT_, per 'iotype') dependencies. BWA
+ before 0.6.0 produces three extra index files (.rbwt/.rpac/.rsa)."""
+ extensions = ["amb", "ann", "bwt", "pac", "sa"]
+ try:
+ if BWA_VERSION.version < (0, 6, 0):
+ extensions.extend(("rbwt", "rpac", "rsa"))
+ except versions.VersionRequirementError:
+ pass # Ignored here, handled elsewhere
+
+ params = AtomicCmdBuilder(call, **kwargs)
+ for postfix in extensions:
+ key = "%s_PREFIX_%s" % (iotype, postfix.upper())
+ params.set_kwargs(**{key: (prefix + "." + postfix)})
+
+ return params
+
+
+def _get_max_threads(reference, threads):
+ """Returns the maximum number of threads to use when mapping against a
+ given reference sequence. This is done since very little gain is obtained
+ when using multiple threads for a small genome (e.g. < 1MB). If the
+ reference falls below this size, only 1 thread is used (returned),
+ otherwise the requested number of threads is returned.
+ """
+ # File sizes are cached per reference path; a missing/None reference is
+ # cached as None and treated as "large" (use requested threads).
+ if reference not in _PREFIX_SIZE_CACHE:
+ if reference is None or not os.path.exists(reference):
+ _PREFIX_SIZE_CACHE[reference] = None
+ else:
+ _PREFIX_SIZE_CACHE[reference] = os.path.getsize(reference)
+
+ prefix_size = _PREFIX_SIZE_CACHE[reference]
+ if prefix_size is None or prefix_size >= 2 ** 20: # > 1 MB
+ return threads
+ return 1
+# Module-level cache: reference path -> file size in bytes (or None).
+_PREFIX_SIZE_CACHE = {}
+
+
+def _check_bwa_prefix(prefix):
+ """Checks that a given prefix is compatible with the currently
+ installed version of BWA. This is required in order to allow
+ auto-indexing of prefixes, as indexes produced by v0.5.x and
+ by 0.6+ are not only incompatible, but differs in the files
+ produced, with 0.5.x producing a handful of additional files.
+
+ As a consequence, simply using normal input-file dependencies
+ would result in prefixes being re-indexed if the version of
+ BWA was changed from 0.6+ to 0.5.x, and in failures during
+ runtime if the version was changed from 0.5.x to 0.6+.
+
+ This function treats that a difference in the version of BWA
+ installed and the version implied by the prefix files is an
+ error, and therefore requires user intervention."""
+ # Each prefix is only checked once per process.
+ if prefix in _PREFIXES_CHECKED:
+ return
+ _PREFIXES_CHECKED.add(prefix)
+
+ try:
+ bwa_version = BWA_VERSION.version
+ except versions.VersionRequirementError:
+ return # Ignored here, reported elsewhere
+
+ # Files unique to v0.5.x
+ v05x_files = set((prefix + ext) for ext in (".rbwt", ".rpac", ".rsa"))
+ # Files common to v0.5.x, v0.6.x, and v0.7.x
+ common_files = set((prefix + ext)
+ for ext in (".amb", ".ann", ".bwt", ".pac", ".sa"))
+ all_files = v05x_files | common_files
+ current_files = all_files - set(missing_files(all_files))
+
+ # Infer the BWA generation that produced the on-disk index from which
+ # files are present, and compare with the installed version.
+ expected_version = None
+ if (current_files & common_files):
+ if bwa_version >= (0, 6, 0):
+ if (current_files & v05x_files):
+ expected_version = "v0.5.x"
+ elif bwa_version < (0, 6, 0):
+ if not (current_files & v05x_files):
+ expected_version = "v0.6.x or later"
+
+ if expected_version:
+ raise NodeError("BWA version is v%s, but prefix appears to be created using %s!\n"
+ " Your copy of BWA may have changed, or you may be using the wrong\n"
+ " prefix. To resolve this issue, either change your prefix, re-install\n"
+ " BWA %s, or remove the prefix files at\n"
+ " $ ls %s.*" \
+ % (".".join(map(str, bwa_version)), expected_version, expected_version, prefix))
+# Module-level set of prefixes already validated in this process.
+_PREFIXES_CHECKED = set()
+
+
+def _build_cat_command(input_file, output_file):
+ """Return a 'paleomix cat' builder that writes the contents of
+ 'input_file' to 'output_file' (typically a named pipe in the temp
+ folder); used to feed possibly-compressed reads to the aligners."""
+ cat = factory.new("cat")
+ cat.set_option("--output", "%(TEMP_OUT_CAT)s")
+ cat.add_value("%(IN_ARCHIVE)s")
+ cat.set_kwargs(TEMP_OUT_CAT=output_file,
+ IN_ARCHIVE=input_file)
+ return cat
+
+
+def _get_node_description(name, input_files_1, input_files_2=None,
+ algorithm=None, prefix=None, threads=1):
+ """Build a human-readable node description such as
+ '<Name (prefix, algorithm, N threads): files>'; the prefix basename is
+ shown without a .fasta/.fa extension."""
+ info = []
+ if prefix is not None:
+ prefix = os.path.basename(prefix)
+ if prefix.endswith(".fasta") or prefix.endswith(".fa"):
+ prefix = prefix.rsplit(".", 1)[0]
+
+ info.append(prefix)
+
+ if algorithm is not None:
+ info.append(algorithm)
+
+ if threads > 1:
+ info.append("%i threads" % (threads,))
+
+ file_desc = describe_paired_files(input_files_1, input_files_2 or ())
+
+ return "<%s (%s): %s>" % (name, ", ".join(info), file_desc)
diff --git a/paleomix/nodes/commands.py b/paleomix/nodes/commands.py
new file mode 100644
index 0000000..64f5dc9
--- /dev/null
+++ b/paleomix/nodes/commands.py
@@ -0,0 +1,375 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Implements nodes for calling PALEOMIX commands.
+
+Each node is equivalent to a particular command:
+ $ paleomix [...]
+"""
+import os
+import gzip
+
+from paleomix.node import \
+ CommandNode, \
+ Node
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+from paleomix.common.fileutils import \
+ describe_files
+from paleomix.nodes.picard import \
+ MultiBAMInput, \
+ MultiBAMInputNode
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ create_customizable_cli_parameters, \
+ use_customizable_cli_parameters
+from paleomix.common.fileutils import \
+ reroot_path, \
+ move_file
+
+from paleomix.nodes.samtools import \
+ SAMTOOLS_VERSION, \
+ SAMTOOLS_VERSION_0119, \
+ BCFTOOLS_VERSION_0119
+
+import paleomix.tools.bam_stats.coverage as coverage
+import paleomix.tools.factory as factory
+
+
+class DuplicateHistogramNode(MultiBAMInputNode):
+ """Node for calling the 'paleomix duphist' command.
+
+ Takes 1 or more BAMs as imput, requiring a config object in order to run
+ MergeSamFiles.jar to merge these files. The output is a histogram of PCR
+ duplicate counts, usable as input for the 'preseq' tool.
+ """
+
+ def __init__(self, config, input_files, output_file, dependencies=()):
+ # BAMs need not be indexed; they are merged/streamed into 'duphist'
+ # via the TEMP_IN_BAM placeholder set up by MultiBAMInput.
+ bam_input = MultiBAMInput(config, input_files, indexed=False)
+ duphist_command = factory.new("duphist")
+ duphist_command.add_value('%(TEMP_IN_BAM)s')
+ duphist_command.set_kwargs(OUT_STDOUT=output_file)
+ bam_input.setup(duphist_command)
+ duphist_command = duphist_command.finalize()
+
+ # Run the BAM-merging command(s) and 'duphist' concurrently.
+ commands = ParallelCmds(bam_input.commands + [duphist_command])
+
+ description = "<DuplicateHistogram: %s -> %r>" \
+ % (describe_files(input_files), output_file)
+ MultiBAMInputNode.__init__(self,
+ bam_input=bam_input,
+ command=commands,
+ description=description,
+ dependencies=dependencies)
+
+
+class CoverageNode(CommandNode):
+ """Node running 'paleomix coverage' on a single BAM, writing a coverage
+ table to 'output_file'; optionally restricted to a BED regions file."""
+
+ def __init__(self, config, target_name, input_file, output_file,
+ regions_file=None, dependencies=()):
+ builder = factory.new("coverage")
+ builder.add_value("%(IN_BAM)s")
+ builder.add_value("%(OUT_FILE)s")
+ builder.set_option("--target-name", target_name)
+ builder.set_kwargs(IN_BAM=input_file,
+ OUT_FILE=output_file)
+
+ if regions_file:
+ builder.set_option('--regions-file', '%(IN_REGIONS)s')
+ builder.set_kwargs(IN_REGIONS=regions_file)
+
+ description = "<Coverage: %s -> '%s'>" % (input_file, output_file)
+ CommandNode.__init__(self,
+ command=builder.finalize(),
+ description=description,
+ dependencies=dependencies)
+
+
+class MergeCoverageNode(Node):
+ """Pure-Python node that merges several coverage tables (as written by
+ CoverageNode) into a single table."""
+
+ def __init__(self, input_files, output_file, dependencies=()):
+ self._output_file = output_file
+
+ Node.__init__(self,
+ description="<MergeCoverage: %s -> '%s'>"
+ % (describe_files(input_files), self._output_file),
+ input_files=input_files,
+ output_files=self._output_file,
+ dependencies=dependencies)
+
+ def _run(self, _config, temp):
+ # Accumulate all input tables into one dict, then write the merged
+ # table to the temp folder and atomically move it into place.
+ table = {}
+ for filename in self.input_files:
+ coverage.read_table(table, filename)
+
+ coverage.write_table(table, reroot_path(temp, self._output_file))
+ move_file(reroot_path(temp, self._output_file), self._output_file)
+
+
+class DepthHistogramNode(MultiBAMInputNode):
+ """Node running 'paleomix depths' on one or more BAMs to produce a depth
+ histogram; restricting to a regions file requires a single, indexed
+ input BAM."""
+
+ def __init__(self, config, target_name, input_files, output_file,
+ regions_file=None, dependencies=()):
+ # An index is only needed when a regions file limits the analysis.
+ bam_input = MultiBAMInput(config, input_files,
+ indexed=bool(regions_file))
+ if len(bam_input.files) > 1 and regions_file:
+ raise ValueError("DepthHistogram for regions require single, "
+ "indexed input BAM file.")
+
+ builder = factory.new("depths")
+ builder.add_value("%(TEMP_IN_BAM)s")
+ builder.add_value("%(OUT_FILE)s")
+ builder.set_option("--target-name", target_name)
+ builder.set_kwargs(OUT_FILE=output_file)
+ bam_input.setup(builder)
+
+ if regions_file:
+ builder.set_option('--regions-file', '%(IN_REGIONS)s')
+ builder.set_kwargs(IN_REGIONS=regions_file)
+
+ command = ParallelCmds(bam_input.commands + [builder.finalize()])
+ description = "<DepthHistogram: %s -> '%s'>" \
+ % (describe_files(bam_input.files), output_file)
+
+ MultiBAMInputNode.__init__(self,
+ bam_input=bam_input,
+ command=command,
+ description=description,
+ dependencies=dependencies)
+
+
+class FilterCollapsedBAMNode(MultiBAMInputNode):
+ """Node running 'paleomix rmdup_collapsed' on one or more BAMs; marks
+ (or, with keep_dupes=False, removes) PCR duplicates among collapsed
+ reads, writing the result to 'output_bam'."""
+
+ def __init__(self, config, input_bams, output_bam, keep_dupes=True,
+ dependencies=()):
+ bam_input = MultiBAMInput(config, input_bams, indexed=False)
+
+ builder = factory.new("rmdup_collapsed")
+ builder.add_value("%(TEMP_IN_BAM)s")
+ builder.set_kwargs(OUT_STDOUT=output_bam)
+ bam_input.setup(builder)
+
+ if not keep_dupes:
+ builder.set_option("--remove-duplicates")
+
+ filteruniq = builder.finalize()
+ command = ParallelCmds(bam_input.commands + [filteruniq])
+ description = "<FilterCollapsedBAM: %s>" \
+ % (describe_files(bam_input.files),)
+ MultiBAMInputNode.__init__(self,
+ bam_input=bam_input,
+ command=command,
+ description=description,
+ dependencies=dependencies)
+
+
+class VCFPileupNode(CommandNode):
+ """Collects heterozygous SNPs from a VCF file, and generates a bgzipped
+ pileup for those sites containing the SNPs.
+
+ The resulting pileup is read by 'paleomix vcf_filter'; this allows
+ filtering based on the frequency of the minority SNP, since this is not
+ reported in the VCF.
+ """
+
+ @create_customizable_cli_parameters
+ def customize(cls, reference, infile_bam, infile_vcf, outfile,
+ dependencies=()):
+ params = factory.new("genotype")
+ params.add_value("%(IN_BAMFILE)s")
+ params.add_value("%(OUT_PILEUP)s")
+ # BED of heterozygous sites is generated by _run before the command.
+ params.set_option("--bedfile", "%(TEMP_IN_INTERVALS)s")
+ params.set_option("--pileup-only")
+ # Ignore read-groups for pileup
+ params.add_option("--mpileup-argument", "-R", sep="=")
+ # Reference sequence (FASTA)
+ params.add_option("--mpileup-argument",
+ "-f=%s" % (reference,), sep="=")
+
+ params.set_kwargs(IN_BAMFILE=infile_bam,
+ TEMP_IN_INTERVALS="heterozygous_snps.bed",
+ # Automatically remove this file
+ TEMP_OUT_INTERVALS="heterozygous_snps.bed",
+ OUT_PILEUP=outfile,
+ CHECK_SAMTOOLS=SAMTOOLS_VERSION)
+
+ return {"command": params}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ self._in_vcf = parameters.infile_vcf
+ command = parameters.command.finalize()
+ description = "<VCFPileup: '%s' -> '%s'>" \
+ % (parameters.infile_vcf,
+ parameters.outfile)
+
+ CommandNode.__init__(self,
+ description=description,
+ command=command,
+ dependencies=parameters.dependencies)
+
+ def _run(self, config, temp):
+ # Extract heterozygous sites (multiple ALT alleles, i.e. a comma in
+ # column 5) from the gzipped VCF into a 0-based, single-base BED
+ # file, then run the 'genotype' command restricted to those sites.
+ with gzip.open(self._in_vcf) as handle:
+ with open(os.path.join(temp, "heterozygous_snps.bed"), "w") as bed:
+ for line in handle:
+ if line.startswith("#"):
+ continue
+
+ fields = line.split("\t", 5)
+ if "," in fields[4]:
+ pos = int(fields[1])
+ bed.write("%s\t%i\t%i\n" % (fields[0], pos - 1, pos))
+
+ CommandNode._run(self, config, temp)
+
+
+class VCFFilterNode(CommandNode):
+ """Node piping a VCF through 'paleomix vcf_filter' and bgzip:
+ cat VCF | vcf_filter --pileup ... | bgzip > outfile."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, pileup, infile, outfile, regions, dependencies=()):
+ cat = factory.new("cat")
+ cat.add_value("%(IN_VCF)s")
+ cat.set_kwargs(IN_VCF=infile,
+ OUT_STDOUT=AtomicCmd.PIPE)
+
+ vcffilter = factory.new("vcf_filter")
+ vcffilter.add_option("--pileup", "%(IN_PILEUP)s")
+ # Contigs expected to be homozygous (e.g. from the regions config).
+ for contig in regions["HomozygousContigs"]:
+ vcffilter.add_option("--homozygous-chromosome", contig)
+ vcffilter.set_kwargs(IN_PILEUP=pileup,
+ IN_STDIN=cat,
+ OUT_STDOUT=AtomicCmd.PIPE)
+
+ bgzip = AtomicCmdBuilder(["bgzip"],
+ IN_STDIN=vcffilter,
+ OUT_STDOUT=outfile)
+
+ return {"commands": {"cat": cat,
+ "filter": vcffilter,
+ "bgzip": bgzip}}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ # Fixed pipeline order: cat -> filter -> bgzip.
+ commands = [parameters.commands[key].finalize()
+ for key in ("cat", "filter", "bgzip")]
+
+ description = "<VCFFilter: '%s' -> '%s'>" % (parameters.infile,
+ parameters.outfile)
+ CommandNode.__init__(self,
+ description=description,
+ command=ParallelCmds(commands),
+ dependencies=parameters.dependencies)
+
+
+class GenotypeRegionsNode(CommandNode):
+ """Node running 'paleomix genotype' on a BAM, optionally limited to BED
+ intervals and/or producing only a pileup; 'nbatches' controls both the
+ command's batching and the node's thread reservation."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, reference, infile, bedfile, outfile,
+ pileup_only=False, nbatches=1, dependencies=()):
+ params = factory.new("genotype")
+ params.add_value("%(IN_BAMFILE)s")
+ params.add_value("%(OUT_VCFFILE)s")
+ params.set_option("--nbatches", nbatches)
+
+ if bedfile:
+ params.set_option("--bedfile", "%(IN_INTERVALS)s")
+
+ if pileup_only:
+ params.set_option("--pileup-only")
+ # Ignore read-groups for pileup
+ params.add_option("--mpileup-argument", "-R", sep="=")
+
+ # Reference sequence (FASTA)
+ params.add_option("--mpileup-argument",
+ "-f=%s" % (reference,), sep="=")
+
+ params.set_kwargs(IN_BAMFILE=infile,
+ IN_INTERVALS=bedfile,
+ OUT_VCFFILE=outfile,
+ CHECK_SAMTOOLS=SAMTOOLS_VERSION_0119,
+ CHECK_BCFTOOLS=BCFTOOLS_VERSION_0119)
+
+ return {"command": params}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = parameters.command.finalize()
+ # Description notes pileup mode and batch/thread count.
+ invokation = " (%s%i thread(s))" \
+ % ("pileup; " if parameters.pileup_only else "",
+ parameters.nbatches)
+ description = "<GenotypeRegions%s: '%s' -> '%s'>" \
+ % (invokation,
+ parameters.infile,
+ parameters.outfile)
+
+ CommandNode.__init__(self,
+ description=description,
+ command=command,
+ threads=parameters.nbatches,
+ dependencies=parameters.dependencies)
+
+
+class BuildRegionsNode(CommandNode):
+ """Node running 'paleomix vcf_to_fasta', building FASTA sequences for
+ BED intervals from a tabix-indexed genotype VCF (the .tbi file is
+ declared as an input dependency)."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, infile, bedfile, outfile, padding, dependencies=()):
+ params = factory.new("vcf_to_fasta")
+ params.set_option("--padding", padding)
+ params.set_option("--genotype", "%(IN_VCFFILE)s")
+ params.set_option("--intervals", "%(IN_INTERVALS)s")
+
+ params.set_kwargs(IN_VCFFILE=infile,
+ IN_TABIX=infile + ".tbi",
+ IN_INTERVALS=bedfile,
+ OUT_STDOUT=outfile)
+
+ return {"command": params}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = parameters.command.finalize()
+ description = "<BuildRegions: '%s' -> '%s'>" % (parameters.infile,
+ parameters.outfile)
+ CommandNode.__init__(self,
+ description=description,
+ command=command,
+ dependencies=parameters.dependencies)
+
+
+class SampleRegionsNode(CommandNode):
+ """Node running 'paleomix sample_pileup' on a pileup file for a set of
+ BED intervals, writing the sampled output to 'outfile'."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, infile, bedfile, outfile, dependencies=()):
+ params = factory.new("sample_pileup")
+ params.set_option("--genotype", "%(IN_PILEUP)s")
+ params.set_option("--intervals", "%(IN_INTERVALS)s")
+ params.set_kwargs(IN_PILEUP=infile,
+ IN_INTERVALS=bedfile,
+ OUT_STDOUT=outfile)
+
+ return {"command": params}
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ command = parameters.command.finalize()
+
+ description = "<SampleRegions: '%s' -> '%s'>" \
+ % (parameters.infile, parameters.outfile)
+
+ CommandNode.__init__(self,
+ description=description,
+ command=command,
+ dependencies=parameters.dependencies)
diff --git a/paleomix/nodes/examl.py b/paleomix/nodes/examl.py
new file mode 100644
index 0000000..299463a
--- /dev/null
+++ b/paleomix/nodes/examl.py
@@ -0,0 +1,289 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import re
+import glob
+import random
+
+import paleomix.common.fileutils as fileutils
+import paleomix.common.versions as versions
+
+from paleomix.node import CommandNode
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ AtomicMPICmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+
+from paleomix.nodegraph import \
+ FileStatusCache
+
+
+EXAML_VERSION = versions.Requirement(call = ("examl", "-version"),
+ search = r"version (\d+)\.(\d+)\.(\d+)",
+ checks = versions.GE(3, 0, 0))
+
+PARSER_VERSION = versions.Requirement(call = ("parse-examl", "-h"),
+ search = r"version (\d+)\.(\d+)\.(\d+)",
+ checks = versions.GE(3, 0, 0))
+
+
+class ExaMLParserNode(CommandNode):
+ """Node running 'parse-examl' to convert a RAxML-format alignment plus
+ partition file into ExaML's binary format; inputs are symlinked into
+ the temp folder so parse-examl writes nothing outside it."""
+
+ @create_customizable_cli_parameters
+ def customize(cls, input_alignment, input_partition, output_file, dependencies = ()):
+ """
+ Arguments:
+ input_alignment -- An alignment file in a format readable by RAxML.
+ input_partition -- A set of partitions in a format readable by RAxML.
+ output_filename -- Filename for the output binary sequence."""
+
+ command = AtomicCmdBuilder("parse-examl", set_cwd = True)
+
+ command.set_option("-s", "%(TEMP_OUT_ALN)s")
+ command.set_option("-q", "%(TEMP_OUT_PART)s")
+ # Output file will be named output.binary, and placed in the CWD
+ command.set_option("-n", "output")
+
+ # Substitution model
+ command.set_option("-m", "DNA", fixed = False)
+
+
+ command.set_kwargs(# Auto-delete: Symlinks
+ TEMP_OUT_PART = os.path.basename(input_partition),
+ TEMP_OUT_ALN = os.path.basename(input_alignment),
+
+ # Input files, are not used directly (see below)
+ IN_ALIGNMENT = input_alignment,
+ IN_PARTITION = input_partition,
+
+ # Final output file, are not created directly
+ OUT_BINARY = output_file,
+
+ CHECK_EXAML = PARSER_VERSION)
+
+ return {"command" : command}
+
+
+ @use_customizable_cli_parameters
+ def __init__(self, parameters):
+ # Absolute paths of the files to symlink into the temp folder.
+ self._symlinks = [os.path.abspath(parameters.input_alignment),
+ os.path.abspath(parameters.input_partition)]
+ self._output_file = os.path.basename(parameters.output_file)
+
+
+ CommandNode.__init__(self,
+ command = parameters.command.finalize(),
+ description = "<ExaMLParser: '%s' -> '%s'>" \
+ % (parameters.input_alignment, parameters.output_file),
+ dependencies = parameters.dependencies)
+
+
+ def _setup(self, config, temp):
+ CommandNode._setup(self, config, temp)
+
+ # Required to avoid the creation of files outside the temp folder
+ for filename in self._symlinks:
+ source = os.path.abspath(filename)
+ destination = os.path.join(temp, os.path.basename(filename))
+
+ os.symlink(source, destination)
+
+
+ def _teardown(self, config, temp):
+ # Discard the RAxML info file, then move the binary alignment
+ # ("output.binary") to its final name before standard teardown.
+ os.remove(os.path.join(temp, "RAxML_info.output"))
+
+ source = os.path.join(temp, "output.binary")
+ destination = fileutils.reroot_path(temp, self._output_file)
+ fileutils.move_file(source, destination)
+
+ CommandNode._teardown(self, config, temp)
+
+
class ExaMLNode(CommandNode):
    """Runs a maximum-likelihood tree search using ExaML (via MPI), with
    support for resuming from ExaML binary checkpoints left behind in the
    (fixed) per-node temporary folder by a previous, interrupted run."""

    @create_customizable_cli_parameters
    def customize(cls, input_binary, initial_tree, output_template, threads = 1, dependencies = ()):
        """
        Arguments:
          input_binary    -- A binary alignment file in a format readable by ExaML.
          initial_tree    -- Filename of the starting tree, passed via '-t'.
          output_template -- A template string used to construct final filenames. Should consist
                             of a full path, including a single '%s', which is replaced with the
                             variable part of RAxML output files (e.g. 'info', 'bestTree', ...).
                             Example destination: '/disk/project/SN013420.RAxML.%s'
                             Example output: '/disk/project/SN013420.RAxML.bestTree'"""

        # TODO: Make MPIParams!
        command = AtomicMPICmdBuilder("examl", threads = threads)

        # Ensures that output is saved to the temporary directory
        command.set_option("-w", "%(TEMP_DIR)s")

        command.set_option("-s", "%(IN_ALN)s")
        command.set_option("-t", "%(IN_TREE)s")
        # Fixed run-name; ExaML output files are named 'ExaML_*.Pypeline'
        command.set_option("-n", "Pypeline")

        command.set_kwargs(IN_ALN=input_binary,
                           IN_TREE=initial_tree,

                           # Final output files, are not created directly;
                           # they are moved into place during _teardown
                           OUT_INFO=output_template % "info",
                           OUT_BESTTREE=output_template % "result",
                           OUT_BOOTSTRAP=output_template % "log",

                           # Only generated by newer versions of ExaML
                           TEMP_OUT_MODELFILE=os.path.basename(output_template
                                                               % "modelFile"),

                           CHECK_EXAML=EXAML_VERSION)

        # Use the GAMMA model of NT substitution by default
        command.set_option("-m", "GAMMA", fixed = False)

        return {"command" : command}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # Directory and basename-template for the final output files
        self._dirname = os.path.dirname(parameters.output_template)
        self._template = os.path.basename(parameters.output_template)

        CommandNode.__init__(self,
                             command = parameters.command.finalize(),
                             description = "<ExaML (%i thread(s)): '%s' -> '%s'>" \
                                 % (parameters.threads,
                                    parameters.input_binary,
                                    parameters.output_template),
                             threads = parameters.threads,
                             dependencies = parameters.dependencies)

    def _create_temp_dir(self, _config):
        """Called by 'run' in order to create a temporary folder.
        To allow restarting from checkpoints, we use a fixed folder
        determined by the output_template."""
        temp = os.path.join(self._dirname, self._template % ("temp",))
        fileutils.make_dirs(temp)
        return temp

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)

        # The temp folder may contain old files:
        # Remove old pipes to prevent failure at _teardown
        for pipe_fname in glob.glob(os.path.join(temp, "pipe*")):
            fileutils.try_remove(pipe_fname)
        # ExaML refuses to overwrite old info files
        fileutils.try_remove(os.path.join(temp, "ExaML_info.Pypeline"))

        # Resume from last checkpoint, if one such was generated
        checkpoints = glob.glob(os.path.join(temp,
                                             "ExaML_binaryCheckpoint.Pypeline_*"))
        if not checkpoints:
            return

        cache = FileStatusCache()
        if not cache.are_files_outdated(self.input_files, checkpoints):
            # Inputs unchanged; resume from the highest-numbered (newest)
            # checkpoint, sorting numerically on the '_N' suffix
            checkpoints.sort(key=lambda fname: int(fname.rsplit("_", 1)[-1]))

            # FIXME: Less hacky solution to modifying AtomicCmds needed
            self._command._command.append("-R")
            self._command._command.append(checkpoints[-1])
        else:
            # Inputs have changed; stale checkpoints must be discarded
            for fpath in checkpoints:
                fileutils.try_remove(fpath)

    def _teardown(self, config, temp):
        for filename in os.listdir(temp):
            # NOTE(review): the '.' before 'Pypeline' is unescaped, so it
            # matches any character -- presumably intended as a literal dot
            match = re.match("ExaML_(.*).Pypeline", filename)
            if match:
                if "binaryCheckpoint" in match.groups():
                    # Checkpoints are intermediate files, not final output
                    os.remove(os.path.join(temp, filename))
                else:
                    # Rename e.g. 'ExaML_info.Pypeline' onto the output
                    # template ('%s' replaced by 'info'); the default
                    # teardown then moves the file into place
                    source = os.path.join(temp, filename)
                    destination = os.path.join(temp, self._template % match.groups())

                    fileutils.move_file(source, destination)

        CommandNode._teardown(self, config, temp)
+
+
+
+
class ParsimonatorNode(CommandNode):
    """Generates a starting tree using stepwise-addition parsimony
    ('parsimonator'), e.g. for use as the initial tree of an ExaML run."""

    @create_customizable_cli_parameters
    def customize(cls, input_alignment, output_tree, dependencies = ()):
        """
        Arguments:
          input_alignment -- An alignment file in a format readable by RAxML.
          output_tree     -- Filename for the output newick tree."""

        # Run with CWD set to the temp dir; parsimonator writes its output
        # files relative to the current working directory
        command = AtomicCmdBuilder("parsimonator", set_cwd = True)

        command.set_option("-s", "%(TEMP_OUT_ALN)s")
        command.set_option("-n", "output")
        # Random seed for the stepwise addition process; note that results
        # are therefore not reproducible unless this option is overridden
        command.set_option("-p", int(random.random() * 2**31 - 1), fixed = False)

        command.set_kwargs(# Auto-delete: Symlinks
                           TEMP_OUT_ALN = os.path.basename(input_alignment),

                           # Input files, are not used directly (see below)
                           IN_ALIGNMENT = input_alignment,

                           # Final output file, are not created directly
                           OUT_TREE = output_tree)

        return {"command" : command}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # The alignment is symlinked into the temp folder during _setup
        self._symlinks = [os.path.abspath(parameters.input_alignment)]
        self._output_tree = os.path.basename(parameters.output_tree)

        CommandNode.__init__(self,
                             command = parameters.command.finalize(),
                             description = "<Parsimonator: '%s' -> '%s'>" \
                                 % (parameters.input_alignment, parameters.output_tree),
                             dependencies = parameters.dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)

        # Required to avoid the creation of files outside the temp folder
        for filename in self._symlinks:
            source = os.path.abspath(filename)
            destination = os.path.join(temp, os.path.basename(filename))

            os.symlink(source, destination)

    def _teardown(self, config, temp):
        # The info file is a by-product, not part of the expected output
        os.remove(os.path.join(temp, "RAxML_info.output"))

        # Only the first ('.0') parsimony tree is kept as the output tree
        source = os.path.join(temp, "RAxML_parsimonyTree.output.0")
        destination = fileutils.reroot_path(temp, self._output_tree)
        fileutils.move_file(source, destination)

        CommandNode._teardown(self, config, temp)
+
diff --git a/paleomix/nodes/formats.py b/paleomix/nodes/formats.py
new file mode 100755
index 0000000..e12b987
--- /dev/null
+++ b/paleomix/nodes/formats.py
@@ -0,0 +1,257 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import copy
+import collections
+
+from paleomix.node import Node
+from paleomix.common.fileutils import move_file, reroot_path
+from paleomix.common.formats.msa import MSA
+from paleomix.common.formats.phylip import interleaved_phy, sequential_phy
+
+from paleomix.common.utilities import \
+ safe_coerce_to_frozenset, \
+ safe_coerce_to_tuple
+
+
+
+_VALID_KEYS = frozenset(["partitions", "filenames"])
+
+
class FastaToPartitionedInterleavedPhyNode(Node):
    """Merges partitioned FASTA alignments into one interleaved PHYLIP file
    ('<prefix>.phy') plus a matching RAxML partitions file
    ('<prefix>.partitions')."""

    def __init__(self, infiles, out_prefix, exclude_groups=(), reduce=False,
                 dependencies=(), file_dependencies=()):
        """
        infiles = {names : {"partitions" : ..., "filenames" : [...]}}
        """
        if not (isinstance(infiles, dict)
                and all(isinstance(dd, dict) for dd in infiles.values())):
            raise TypeError("'infiles' must be a dictionary of dictionaries")

        input_filenames = []
        for (name, subdd) in infiles.iteritems():
            if set(subdd) - _VALID_KEYS:
                raise ValueError("Invalid keys found for %r: %s"
                                 % (name, ", ".join(set(subdd) - _VALID_KEYS)))
            elif not isinstance(subdd["filenames"], list):
                raise ValueError("filenames must be a list of strings")
            input_filenames.extend(subdd["filenames"])
        # Optional file dependencies; used to depend on the list of sequences
        input_filenames.extend(safe_coerce_to_tuple(file_dependencies))

        self._reduce = bool(reduce)
        self._infiles = copy.deepcopy(infiles)
        self._out_prefix = out_prefix
        self._excluded = safe_coerce_to_frozenset(exclude_groups)

        description = "<FastaToPartitionedPhy%s: %i file(s) -> '%s.*'>" % \
            (" (reducing)" if reduce else "", len(infiles), out_prefix)

        Node.__init__(self,
                      description=description,
                      input_files=input_filenames,
                      output_files=[out_prefix + ".phy",
                                    out_prefix + ".partitions"],
                      dependencies=dependencies)

    def _run(self, _config, temp):
        merged_msas = []
        for (name, files_dd) in sorted(self._infiles.iteritems()):
            partitions = files_dd["partitions"]
            msas = dict((key, []) for key in partitions)
            for filename in files_dd["filenames"]:
                msa = MSA.from_file(filename)
                if self._excluded:
                    msa = msa.exclude(self._excluded)

                # Collect the columns of each partition across all files
                for (key, msa_part) in msa.split(partitions).iteritems():
                    msas[key].append(msa_part)

            # NOTE(review): partition-key "X" is presumably reserved for
            # excluded columns and is therefore dropped -- confirm
            msas.pop("X", None)
            for (key, msa_parts) in sorted(msas.iteritems()):
                merged_msa = MSA.join(*msa_parts)
                if self._reduce:
                    # 'reduce' may yield None (e.g. nothing left), in which
                    # case the partition is dropped below
                    merged_msa = merged_msa.reduce()

                if merged_msa is not None:
                    merged_msas.append(("%s_%s" % (name, key),
                                        merged_msa))

        # Write all partitions concatenated into a single interleaved PHYLIP
        out_fname_phy = reroot_path(temp, self._out_prefix + ".phy")
        with open(out_fname_phy, "w") as output_phy:
            final_msa = MSA.join(*(msa for (_, msa) in merged_msas))
            output_phy.write(interleaved_phy(final_msa))

        # Write 1-based, inclusive column ranges for each partition
        partition_end = 0
        out_fname_parts = reroot_path(temp, self._out_prefix + ".partitions")
        with open(out_fname_parts, "w") as output_part:
            for (name, msa) in merged_msas:
                length = msa.seqlen()
                output_part.write("DNA, %s = %i-%i\n"
                                  % (name,
                                     partition_end + 1,
                                     partition_end + length))
                partition_end += length

    def _teardown(self, _config, temp):
        # Move the completed files from the temp dir to their final location
        move_file(reroot_path(temp, self._out_prefix + ".phy"),
                  self._out_prefix + ".phy")
        move_file(reroot_path(temp, self._out_prefix + ".partitions"),
                  self._out_prefix + ".partitions")
+
+
+
+
class FastaToPartitionsNode(Node):
    """Writes a RAxML-style partitions file describing the codon-position
    layout of a set of concatenated FASTA alignments.

    'infiles' maps filenames to dicts; each dict must contain a "name" key
    (used as the partition-name prefix), and may contain a "partition_by"
    key of exactly 3 characters assigning each codon position to a group;
    positions sharing a group character are merged into one partition.
    """

    def __init__(self, infiles, out_partitions, partition_by="123", dependencies=()):
        if len(partition_by) != 3:
            raise ValueError("Default 'partition_by' must be 3 entries long!")
        elif not isinstance(infiles, dict):
            raise TypeError("'infiles' must be a dictionary")
        elif not all(isinstance(dd, dict) for dd in infiles.values()):
            # Must be checked before any per-dict key access below
            raise TypeError("'infiles' must be a dictionary of dictionaries")
        elif any(len(dd.get("partition_by", "123")) != 3
                 for dd in infiles.itervalues()):
            raise ValueError("'partition_by' must be 3 entries long!")
        elif not all(("name" in dd) for dd in infiles.values()):
            # Was 'not any(...)', which only rejected inputs where *no*
            # entry had a name; every entry requires one (see _run)
            raise ValueError("'name' must be specified for all input files")

        # "name" and "partition_by" are consumed by this node itself, in
        # addition to the generally valid input-file keys; the previous
        # check against bare _VALID_KEYS rejected all valid inputs and
        # formatted its message with an undefined variable (NameError)
        valid_keys = _VALID_KEYS | frozenset(("name", "partition_by"))
        invalid_keys = set()
        for dd in infiles.itervalues():
            invalid_keys.update(set(dd) - valid_keys)
        if invalid_keys:
            raise ValueError("Invalid keys found: %s"
                             % ", ".join(sorted(invalid_keys)))

        self._infiles = infiles
        self._out_part = out_partitions
        self._part_by = partition_by

        description = "<FastaToPartitions (default: %s): %i file(s) -> '%s'>" % \
            (partition_by, len(infiles), out_partitions)

        Node.__init__(self,
                      description=description,
                      input_files=infiles.keys(),
                      output_files=out_partitions,
                      dependencies=dependencies)

    def _run(self, _config, temp):
        end = 0
        partitions = collections.defaultdict(list)
        for (filename, msa) in _read_sequences(self._infiles):
            length = msa.seqlen()
            # 1-based, inclusive range of this alignment in the concatenation
            start, end = end + 1, end + length

            for (group, offsets) in self._get_partition_by(filename):
                if len(offsets) != 3:
                    # Group covers only some codon positions; emit one
                    # stride-3 range per position (RAxML '\3' syntax)
                    parts = [("%i-%i\\3" % (start + offset, end))
                             for offset in offsets]
                else:
                    # Group covers all three positions; one plain range
                    parts = ["%i-%i" % (start, end)]

                name = "%s_%s" % (self._infiles[filename]["name"], group)
                partitions[name].extend(parts)

        with open(reroot_path(temp, self._out_part), "w") as part_file:
            for (name, parts) in sorted(partitions.items()):
                part_file.writelines("DNA, %s = %s\n" % (name, ", ".join(parts)))

    def _teardown(self, _config, temp):
        # Move the completed partitions file to its final location
        move_file(reroot_path(temp, self._out_part), self._out_part)

    def _get_partition_by(self, filename):
        """Returns sorted (group, [offsets]) pairs for a file, falling back
        to the node-wide default when no per-file value is specified."""
        groups = self._infiles[filename].get("partition_by", self._part_by)

        partition_by = {}
        for (group, offset) in zip(groups, range(3)):
            partition_by.setdefault(group, []).append(offset)

        return list(sorted(partition_by.items()))
+
+
+
class FastaToInterleavedPhyNode(Node):
    """Joins a set of FASTA alignments into one interleaved PHYLIP file."""

    def __init__(self, infiles, out_phy, add_flag = False, dependencies = ()):
        self._add_flag = add_flag
        self._out_phy = out_phy

        description = "<FastaToInterleavedPhy: %i file(s) -> '%s'%s>" % \
            (len(infiles), out_phy, (" (w/ flag)" if add_flag else ""))

        Node.__init__(self,
                      description = description,
                      input_files = infiles,
                      output_files = [out_phy],
                      dependencies = dependencies)

    def _run(self, _config, temp):
        # Combine the alignments in deterministic (sorted-filename) order
        loaded = (MSA.from_file(fname) for fname in sorted(self.input_files))
        combined = MSA.join(*loaded)

        with open(reroot_path(temp, self._out_phy), "w") as handle:
            handle.write(interleaved_phy(combined, add_flag = self._add_flag))

    def _teardown(self, _config, temp):
        # Move the completed file from the temp dir to its final location
        move_file(reroot_path(temp, self._out_phy), self._out_phy)
+
+
+
class FastaToSequentialPhyNode(Node):
    """Converts FASTA alignments to sequential PHYLIP, writing one block
    per input alignment, separated by blank lines."""

    def __init__(self, infiles, out_phy, add_flag = False, dependencies = ()):
        self._add_flag = add_flag
        self._out_phy = out_phy

        # Fixed copy-paste error: was labelled "FastaToInterleavedPhy"
        description = "<FastaToSequentialPhy: %i file(s) -> '%s'%s>" % \
            (len(infiles), out_phy, (" (w/ flag)" if add_flag else ""))

        Node.__init__(self,
                      description = description,
                      input_files = infiles,
                      output_files = [out_phy],
                      dependencies = dependencies)

    def _run(self, _config, temp):
        # Read and check that MSAs share groups
        msas = [MSA.from_file(filename) for filename in sorted(self.input_files)]
        MSA.validate(*msas)

        blocks = []
        for msa in msas:
            blocks.append(sequential_phy(msa, add_flag = self._add_flag))

        with open(reroot_path(temp, self._out_phy), "w") as output:
            output.write("\n\n".join(blocks))

    def _teardown(self, _config, temp):
        # Move the completed file from the temp dir to its final location
        move_file(reroot_path(temp, self._out_phy), self._out_phy)
+
+
+
+
+
def _read_sequences(filenames):
    """Loads one MSA per filename and validates them jointly (i.e. that
    they share the same groups); returns an iterator of (filename, MSA)."""
    msas = dict((fname, MSA.from_file(fname)) for fname in filenames)
    MSA.validate(*msas.values())

    return msas.iteritems()
diff --git a/paleomix/nodes/gatk.py b/paleomix/nodes/gatk.py
new file mode 100644
index 0000000..75650ff
--- /dev/null
+++ b/paleomix/nodes/gatk.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+import paleomix.common.fileutils as \
+ fileutils
+from paleomix.node import \
+ CommandNode
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.atomiccmd.builder import \
+ AtomicJavaCmdBuilder
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+from paleomix.common.fileutils import \
+ swap_ext, \
+ describe_files
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+from paleomix.nodes.samtools import \
+ SAMTOOLS_VERSION
+from paleomix.nodes.bwa import \
+ _get_max_threads
+
+import paleomix.common.versions as versions
+
+
def _get_gatk_version_check(config):
    """Returns a version-check object for the "GenomeAnalysisTK.jar" located at
    config.jar_root; for now, this check only serves to verify that the JAR can
    be executed, which may not be the case if the JRE is outdated.

    Requirements are memoized per JAR path (in _GATK_VERSION below), so the
    builder is only constructed once per distinct JAR file.
    """
    jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
    if jar_file not in _GATK_VERSION:
        params = AtomicJavaCmdBuilder(jar_file,
                                      temp_root=config.temp_root,
                                      jre_options=config.jre_options)
        params.add_value("--version")

        # Any version is fine; for now just catch old JREs
        requirement = versions.Requirement(call=params.finalized_call,
                                           name="GenomeAnalysisTK",
                                           search=r"^(\d+)\.(\d+)",
                                           checks=versions.Any())
        _GATK_VERSION[jar_file] = requirement
    return _GATK_VERSION[jar_file]
# Cache of Requirement objects keyed by JAR path; defined after the function
# above, but only resolved when the function is actually called
_GATK_VERSION = {}
+
+
class GATKIndelTrainerNode(CommandNode):
    """Runs the GATK 'RealignerTargetCreator' tool, producing the intervals
    file consumed by GATKIndelRealignerNode."""

    def __init__(self, config, reference, infiles, outfile,
                 threads=1, dependencies=()):
        # Thread count may be capped depending on the reference (see bwa
        # helper); result is passed both to GATK ('-nt') and the scheduler
        threads = _get_max_threads(reference, threads)
        infiles = safe_coerce_to_tuple(infiles)
        jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        command = AtomicJavaCmdBuilder(jar_file,
                                       jre_options=config.jre_options)
        command.set_option("-T", "RealignerTargetCreator")
        command.set_option("-R", "%(IN_REFERENCE)s")
        command.set_option("-o", "%(OUT_INTERVALS)s")
        command.set_option("-nt", threads)

        _set_input_files(command, infiles)
        # GATK requires a sequence dictionary (.dict) next to the reference
        command.set_kwargs(IN_REFERENCE=reference,
                           IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                           OUT_INTERVALS=outfile,
                           CHECK_GATK=_get_gatk_version_check(config))

        description = "<GATK Indel Realigner (training): %s -> %r>" \
            % (describe_files(infiles), outfile)
        CommandNode.__init__(self,
                             threads=threads,
                             description=description,
                             command=command.finalize(),
                             dependencies=dependencies)
+
+
class GATKIndelRealignerNode(CommandNode):
    """Runs the GATK 'IndelRealigner' tool over one or more BAMs; the
    realigned BAM is streamed through 'samtools calmd' (via a named pipe)
    to update MD/NM tags before the final file is written."""

    def __init__(self, config, reference, intervals, infiles, outfile,
                 dependencies=()):
        # Basename of the output BAM; doubles as the named-pipe filename in
        # the temp folder (see _setup / _teardown)
        self._basename = os.path.basename(outfile)

        infiles = safe_coerce_to_tuple(infiles)
        jar_file = os.path.join(config.jar_root, "GenomeAnalysisTK.jar")
        command = AtomicJavaCmdBuilder(jar_file,
                                       jre_options=config.jre_options)
        command.set_option("-T", "IndelRealigner")
        command.set_option("-R", "%(IN_REFERENCE)s")
        command.set_option("-targetIntervals", "%(IN_INTERVALS)s")
        command.set_option("-o", "%(OUT_BAMFILE)s")
        # Uncompressed output; the stream is rewritten by 'calmd' anyway
        command.set_option("--bam_compression", 0)
        command.set_option("--disable_bam_indexing")
        _set_input_files(command, infiles)

        command.set_kwargs(IN_REFERENCE=reference,
                           IN_REF_DICT=fileutils.swap_ext(reference, ".dict"),
                           IN_INTERVALS=intervals,
                           OUT_BAMFILE=outfile,
                           CHECK_GATK=_get_gatk_version_check(config))

        # Reads GATK's output from the named pipe and writes the updated
        # BAM to '<basename>.calmd' (stdout) in the temp folder
        calmd = AtomicCmd(["samtools", "calmd", "-b",
                           "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
                          TEMP_IN_BAM=self._basename,
                          IN_REF=reference,
                          TEMP_OUT_STDOUT=self._basename + ".calmd",
                          CHECK_VERSION=SAMTOOLS_VERSION)

        description = "<GATK Indel Realigner (aligning): %s -> %r>" \
            % (describe_files(infiles), outfile)
        CommandNode.__init__(self,
                             description=description,
                             command=ParallelCmds([command.finalize(), calmd]),
                             dependencies=dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)
        # GATK writes into this FIFO; 'calmd' reads from it, avoiding an
        # intermediate BAM on disk
        os.mkfifo(os.path.join(temp, self._basename))

    def _teardown(self, config, temp):
        # Replace the FIFO with the 'calmd'-processed BAM, so the default
        # teardown moves the correct file into its final location
        os.rename(os.path.join(temp, self._basename) + ".calmd",
                  os.path.join(temp, self._basename))

        CommandNode._teardown(self, config, temp)
+
+
def _set_input_files(command, input_files):
    """Adds one '-I <bam>' option per input file, registering both the BAM
    and its associated .bai index file as inputs of 'command'."""
    kwargs = {}
    for (index, filename) in enumerate(input_files):
        bam_key = "IN_BAMFILE_%02i" % index
        command.add_option("-I", "%%(%s)s" % bam_key)

        kwargs[bam_key] = filename
        kwargs["IN_BAIFILE_%02i" % index] = swap_ext(filename, ".bai")

    command.set_kwargs(**kwargs)
diff --git a/paleomix/nodes/mafft.py b/paleomix/nodes/mafft.py
new file mode 100644
index 0000000..435068e
--- /dev/null
+++ b/paleomix/nodes/mafft.py
@@ -0,0 +1,93 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from paleomix.node import \
+ CommandNode, \
+ NodeError
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+from paleomix.common.fileutils import \
+ reroot_path
+from paleomix.common.formats.msa import \
+ MSA, \
+ MSAError
+import paleomix.common.versions as versions
+
+
# MAFFT v7+ required; the presets below invoke the algorithm-specific
# wrapper scripts (mafft-fftns, mafft-linsi, etc.)
MAFFT_VERSION = versions.Requirement(call = ("mafft", "--version"),
                                     search = r"v(\d+)\.(\d+)",
                                     checks = versions.GE(7, 0))


# Presets mainly taken from
# http://mafft.cbrc.jp/alignment/software/algorithms/algorithms.html
# Maps (lower-case) algorithm name -> executable plus fixed options
_PRESETS = {
    "mafft"    : ["mafft"],
    "auto"     : ["mafft", "--auto"],
    "fft-ns-1" : ["mafft-fftns", "--retree", 1],
    "fft-ns-2" : ["mafft-fftns"],
    "fft-ns-i" : ["mafft-fftnsi"],
    "nw-ns-i"  : ["mafft-nwnsi"],
    "l-ins-i"  : ["mafft-linsi"],
    "e-ins-i"  : ["mafft-einsi"],
    "g-ins-i"  : ["mafft-ginsi"],
    }
+
+
+
+
class MAFFTNode(CommandNode):
    """Aligns a FASTA file using MAFFT; stdout is captured as the output
    file, which is validated as a proper MSA during teardown."""

    @create_customizable_cli_parameters
    def customize(cls, input_file, output_file, algorithm = "auto", dependencies = ()):
        """
        Arguments:
          input_file  -- FASTA file containing sequences to be aligned.
          output_file -- Destination for the alignment (MAFFT stdout).
          algorithm   -- Case-insensitive key into _PRESETS, selecting the
                         MAFFT wrapper / options to use."""
        command = AtomicCmdBuilder(_PRESETS[algorithm.lower()])
        command.add_value("%(IN_FASTA)s")
        command.set_kwargs(IN_FASTA = input_file,
                           OUT_STDOUT = output_file,
                           CHECK_VERSION = MAFFT_VERSION)

        return {"command" : command,
                "dependencies" : dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        self._output_file = parameters.output_file
        description = "<MAFFTNode (%s): '%s' -> '%s'>" \
            % (parameters.algorithm,
               parameters.input_file,
               parameters.output_file)

        CommandNode.__init__(self,
                             command = parameters.command.finalize(),
                             description = description,
                             dependencies = parameters.dependencies)

    def _teardown(self, config, temp):
        # Validate output from MAFFT; fail the node rather than propagate a
        # malformed alignment to downstream nodes
        output_file = reroot_path(temp, self._output_file)
        try:
            MSA.from_file(output_file)
        except MSAError, error:
            raise NodeError("Invalid MSA produced by MAFFT:\n%s" % (error,))
        CommandNode._teardown(self, config, temp)
diff --git a/paleomix/nodes/mapdamage.py b/paleomix/nodes/mapdamage.py
new file mode 100644
index 0000000..54c82e5
--- /dev/null
+++ b/paleomix/nodes/mapdamage.py
@@ -0,0 +1,294 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+import paleomix.common.rtools as rtools
+import paleomix.common.versions as versions
+
+from paleomix.common.fileutils import \
+ describe_files
+
+from paleomix.node import \
+ NodeError, \
+ CommandNode
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+from paleomix.nodes.picard import \
+ MultiBAMInput, \
+ MultiBAMInputNode
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+
+
# Minimum supported versions of mapDamage and of Rscript (required by the
# plotting / statistics steps of mapDamage).
# NOTE(review): the second '.' in both 'search' patterns is unescaped and
# thus matches any character -- presumably intended as a literal dot
MAPDAMAGE_VERSION = versions.Requirement(call=("mapDamage", "--version"),
                                         search=r"(\d+)\.(\d+).(\d+)",
                                         checks=versions.GE(2, 0, 1))

RSCRIPT_VERSION = versions.Requirement(call=("Rscript", "--version"),
                                       search=r"(\d+)\.(\d+).(\d+)",
                                       checks=versions.GE(2, 15, 1),
                                       priority=10)
+
+
class MapDamagePlotNode(MultiBAMInputNode):
    """Runs 'mapDamage --no-stats' over one or more BAMs, producing the
    damage-pattern plots and tables in 'output_directory'."""

    @create_customizable_cli_parameters
    def customize(self, config, reference, input_files, output_directory,
                  title="mapDamage", dependencies=()):
        command = AtomicCmdBuilder(
            ["mapDamage", "--no-stats",
             # Prevent references with many contigs from using excessive
             # amounts of memory, at the cost of per-contig statistics:
             "--merge-reference-sequences",
             "-t", title,
             "-i", "%(TEMP_IN_BAM)s",
             "-d", "%(TEMP_DIR)s",
             "-r", "%(IN_REFERENCE)s"],
            IN_REFERENCE=reference,
            OUT_FREQ_3p=os.path.join(output_directory, "3pGtoA_freq.txt"),
            OUT_FREQ_5p=os.path.join(output_directory, "5pCtoT_freq.txt"),
            OUT_COMP_USER=os.path.join(output_directory, "dnacomp.txt"),
            OUT_PLOT_FRAG=os.path.join(output_directory,
                                       "Fragmisincorporation_plot.pdf"),
            OUT_PLOT_LEN=os.path.join(output_directory, "Length_plot.pdf"),
            OUT_LENGTH=os.path.join(output_directory, "lgdistribution.txt"),
            OUT_MISINCORP=os.path.join(output_directory,
                                       "misincorporation.txt"),
            OUT_LOG=os.path.join(output_directory, "Runtime_log.txt"),
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",

            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION)

        return {"command": command,
                "config": config,
                "input_files": input_files,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # Multiple input BAMs are merged into a single (unindexed) stream
        bam_input = MultiBAMInput(parameters.config, parameters.input_files,
                                  indexed=False)
        bam_input.setup(parameters.command)
        cmd_map = parameters.command.finalize()

        description = "<mapDamage (plots): %s -> '%s'>" \
            % (describe_files(parameters.input_files),
               parameters.output_directory)
        MultiBAMInputNode.__init__(self,
                                   bam_input=bam_input,
                                   command=ParallelCmds(bam_input.commands +
                                                        [cmd_map]),
                                   description=description,
                                   dependencies=parameters.dependencies)

    def _teardown(self, config, temp):
        # No Length_plot.pdf file is written if there are no SE reads in the
        # input_file. In that case, we write a dummy PDF to ensure that all
        # expected files exist.
        err_message = "No length distributions are available"
        with open(os.path.join(temp, "pipe_mapDamage.stderr")) as in_handle:
            if any(line.startswith(err_message) for line in in_handle):

                fpath = os.path.join(temp, "Length_plot.pdf")
                with open(fpath, "w") as out_handle:
                    out_handle.write(_DUMMY_LENGTH_PLOT_PDF)

        MultiBAMInputNode._teardown(self, config, temp)
+
+
class MapDamageModelNode(CommandNode):
    """Runs 'mapDamage --stats-only' to fit the damage model (MCMC), using
    the tables previously generated by MapDamagePlotNode in 'directory'."""

    @create_customizable_cli_parameters
    def customize(self, reference, directory, dependencies=()):
        command = AtomicCmdBuilder(
            ["mapDamage", "--stats-only",
             "-r", "%(IN_REFERENCE)s",
             "-d", "%(TEMP_DIR)s"],
            IN_REFERENCE=reference,
            # Inputs of the plotting step; symlinked into temp (see _setup)
            TEMP_OUT_FREQ_3p="3pGtoA_freq.txt",
            TEMP_OUT_FREQ_5p="5pCtoT_freq.txt",
            TEMP_OUT_COMP_USER="dnacomp.txt",
            TEMP_OUT_MISINCORP="misincorporation.txt",
            TEMP_OUT_LOG="Runtime_log.txt",
            TEMP_OUT_STDOUT="pipe_mapDamage.stdout",
            TEMP_OUT_STDERR="pipe_mapDamage.stderr",
            OUT_COMP_GENOME=os.path.join(directory, "dnacomp_genome.csv"),
            OUT_MCMC_PROBS=os.path.join(directory,
                                        "Stats_out_MCMC_correct_prob.csv"),
            OUT_MCMC_HIST=os.path.join(directory, "Stats_out_MCMC_hist.pdf"),
            OUT_MCMC_ITER=os.path.join(directory, "Stats_out_MCMC_iter.csv"),
            OUT_MCMC_ITERSUM=os.path.join(directory,
                                          "Stats_out_MCMC_iter_summ_stat.csv"),
            OUT_MCMC_POSTPRED=os.path.join(directory,
                                           "Stats_out_MCMC_post_pred.pdf"),
            OUT_MCMC_TRACE=os.path.join(directory, "Stats_out_MCMC_trace.pdf"),

            CHECK_RSCRIPT=RSCRIPT_VERSION,
            CHECK_MAPDAMAGE=MAPDAMAGE_VERSION,
            # R packages required by the model-fitting scripts
            CHECK_R_INLINE=rtools.requirement("inline"),
            CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
            CHECK_R_RCPP=rtools.requirement("Rcpp"),
            CHECK_R_GAM=rtools.requirement("gam"),
            CHECK_R_RCPPGSL=rtools.requirement("RcppGSL"))

        return {"command": command,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        self._directory = parameters.directory

        description = "<mapDamage (model): %r>" % (parameters.directory,)
        CommandNode.__init__(self,
                             command=parameters.command.finalize(),
                             description=description,
                             dependencies=parameters.dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)
        # Symlink the plotting step's outputs into temp, as mapDamage reads
        # them from (and writes next to) its '-d' directory
        for fname in ("3pGtoA_freq.txt", "5pCtoT_freq.txt", "dnacomp.txt",
                      "misincorporation.txt"):
            relpath = os.path.join(self._directory, fname)
            abspath = os.path.abspath(relpath)
            os.symlink(abspath, os.path.join(temp, fname))

    def _run(self, config, temp):
        try:
            CommandNode._run(self, config, temp)
        except NodeError, error:
            # Enrich the generic failure with mapDamage's own explanation
            # when it warned that damage levels were too low to model
            err_message = "DNA damage levels are too low"
            if self._command.join() == [1]:
                fpath = os.path.join(temp, "pipe_mapDamage.stdout")
                with open(fpath) as handle:
                    for line in handle:
                        if err_message in line:
                            line = line.strip().replace("Warning:", "ERROR:")
                            error = NodeError("%s\n\n%s" % (error, line))
                            break
            raise error
+
+
class MapDamageRescaleNode(MultiBAMInputNode):
    """Runs 'mapDamage --rescale-only', writing a BAM with base qualities
    rescaled according to the model fitted by MapDamageModelNode."""

    @create_customizable_cli_parameters
    def customize(self, config, reference, input_files, output_file, directory,
                  dependencies=()):
        stats_out_fname = "Stats_out_MCMC_correct_prob.csv"
        command = AtomicCmdBuilder(["mapDamage", "--rescale-only",
                                    "-i", "%(TEMP_IN_BAM)s",
                                    "-d", "%(TEMP_DIR)s",
                                    "-r", "%(IN_REFERENCE)s",
                                    "--rescale-out", "%(OUT_BAM)s"],
                                   IN_REFERENCE=reference,
                                   TEMP_OUT_LOG="Runtime_log.txt",
                                   TEMP_OUT_CSV=stats_out_fname,
                                   OUT_BAM=output_file,
                                   CHECK_VERSION=MAPDAMAGE_VERSION)

        return {"command": command,
                "config": config,
                "input_files": input_files,
                "directory": directory,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        self._directory = parameters.directory
        # Multiple input BAMs are merged into a single (unindexed) stream
        bam_input = MultiBAMInput(parameters.config, parameters.input_files,
                                  indexed=False)
        bam_input.setup(parameters.command)
        command = parameters.command.finalize()

        description = "<mapDamage (rescale): %s -> %r>" \
            % (describe_files(parameters.input_files),
               parameters.output_file)
        MultiBAMInputNode.__init__(self,
                                   bam_input=bam_input,
                                   command=ParallelCmds(bam_input.commands +
                                                        [command]),
                                   description=description,
                                   dependencies=parameters.dependencies)

    def _setup(self, config, temp):
        MultiBAMInputNode._setup(self, config, temp)
        # Symlink the model-fitting step's MCMC stats into temp, as
        # mapDamage expects them in its '-d' directory
        for fname in ("Stats_out_MCMC_correct_prob.csv", ):
            relpath = os.path.join(self._directory, fname)
            abspath = os.path.abspath(relpath)
            os.symlink(abspath, os.path.join(temp, fname))
+
+
# Minimal PDF written if Length_plot.pdf wasn't generated (i.e. when the
# inputs contained no SE reads; see MapDamagePlotNode._teardown); the PDF
# itself simply displays a one-sentence explanation to that effect
_DUMMY_LENGTH_PLOT_PDF = \
    """%PDF-1.4

1 0 obj
 <</Type /Font /Subtype /Type1 /Encoding /WinAnsiEncoding /BaseFont /Courier >>
endobj

2 0 obj
 <</Parent 4 0 R /MediaBox[0 0 450 50] /Type /Page /Contents[3 0 R ] /Resources 5 0 R >>
endobj

3 0 obj
 <</Length 138 >>
stream
 BT
 /F0 18 Tf
 20 10 Td
 (Input file(s) did not contain SE reads.) Tj
 0 20 Td
 (Length_plot.pdf not generated:) Tj
 ET
endstream
endobj

4 0 obj
 <</Type /Pages /Count 1 /Kids[2 0 R ]>>
endobj

5 0 obj
 <</ProcSet[/PDF /Text] /Font <</F0 1 0 R >>
>>
endobj

6 0 obj
 <</Type /Catalog /Pages 4 0 R >>
endobj

xref
0 7
0000000000 65535 f
0000000010 00000 n
0000000106 00000 n
0000000211 00000 n
0000000400 00000 n
0000000457 00000 n
0000000521 00000 n
trailer
 <</Size 7 /Root 6 0 R >>

startxref
571
%%EOF
"""
diff --git a/paleomix/nodes/misc.py b/paleomix/nodes/misc.py
new file mode 100644
index 0000000..4ac2538
--- /dev/null
+++ b/paleomix/nodes/misc.py
@@ -0,0 +1,54 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from paleomix.node import \
+ Node
+from paleomix.common.fileutils import \
+ copy_file, \
+ reroot_path
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+
+
class CopyOutputFilesNode(Node):
    """Copies the output-files of one or more nodes to a specified folder."""

    def __init__(self, description, destination, source_nodes):
        source_nodes = safe_coerce_to_tuple(source_nodes)

        # Collect every output file of every dependency, preserving order
        input_files = [filename
                       for node in source_nodes
                       for filename in node.output_files]
        output_files = [reroot_path(destination, filename)
                        for filename in input_files]

        # (source, destination) pairs consumed by _run()
        self._files = zip(input_files, output_files)

        Node.__init__(self,
                      description = "<Copy %s output to %r>" % (description, destination),
                      input_files = input_files,
                      output_files = output_files,
                      dependencies = source_nodes)

    def _run(self, _config, _temp):
        # Copy each collected file into the destination folder
        for (source, destination) in self._files:
            copy_file(source, destination)
diff --git a/paleomix/nodes/newick.py b/paleomix/nodes/newick.py
new file mode 100644
index 0000000..8402b6a
--- /dev/null
+++ b/paleomix/nodes/newick.py
@@ -0,0 +1,116 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.common.formats.newick import \
+ Newick
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+from paleomix.common.fileutils import \
+ describe_files, \
+ move_file
+from paleomix.node import \
+ Node
+
+
+
class NewickRerootNode(Node):
    """Node which reroots one or more Newick trees, either on the midpoint
    (default) or using the user-specified taxa (via Newick.reroot_on_taxa);
    the rerooted trees are written to a single file, one tree per line."""

    def __init__(self, tree_files, output_file, taxa = (), dependencies = ()):
        self._output_file = output_file
        self._tree_files = safe_coerce_to_tuple(tree_files)
        self._reroot_on_taxa = safe_coerce_to_tuple(taxa)

        # Only used in the description below; actual rooting happens in _run
        reroot_on = "midpoint"
        if self._reroot_on_taxa:
            # repr() of the joined names renders the taxa as a quoted list
            reroot_on = repr("', '".join(sorted(self._reroot_on_taxa)))

        description = "<NewickReroot (on %s): %s>" % \
            (reroot_on, describe_files(tree_files),)

        Node.__init__(self,
                      description = description,
                      input_files = self._tree_files,
                      output_files = self._output_file,
                      dependencies = dependencies)

    def _run(self, _config, temp):
        # Reroot every tree from every input file, in input order
        lines = []
        for tree in _read_tree_files(self._tree_files):
            if self._reroot_on_taxa:
                rooted_tree = tree.reroot_on_taxa(self._reroot_on_taxa)
            else:
                rooted_tree = tree.reroot_on_midpoint()
            lines.append(str(rooted_tree))
        lines = "\n".join(lines) + "\n"

        # Write to the temporary folder and move into place afterwards, so
        # that the final output file only appears once it is complete
        temp_output_file = os.path.join(temp, os.path.basename(self._output_file))
        with open(temp_output_file, "w") as handle:
            handle.write(lines)

        move_file(temp_output_file, self._output_file)
+
+
+
+
class NewickSupportNode(Node):
    """Node which annotates a set of 'main' Newick trees with support values
    derived from a set of support trees (e.g. bootstrap replicates), via
    Newick.add_support; annotated trees are written to one file, one tree
    per line."""

    def __init__(self, main_tree_files, support_tree_files, output_file, dependencies = ()):
        self._output_file = output_file
        self._main_tree_files = safe_coerce_to_tuple(main_tree_files)
        self._support_tree_files = safe_coerce_to_tuple(support_tree_files)
        input_files = self._main_tree_files + self._support_tree_files

        description = "<NewickSupport: %s>" % \
            (describe_files(main_tree_files),)

        Node.__init__(self,
                      description = description,
                      input_files = input_files,
                      output_files = output_file,
                      dependencies = dependencies)

    def _run(self, _config, temp):
        # Each main tree is annotated using the full set of support trees
        main_trees = _read_tree_files(self._main_tree_files)
        support_trees = _read_tree_files(self._support_tree_files)

        lines = []
        for main_tree in main_trees:
            supported_tree = main_tree.add_support(support_trees)
            lines.append(str(supported_tree))
        lines = "\n".join(lines) + "\n"

        # Write to the temporary folder and move into place afterwards, so
        # that the final output file only appears once it is complete
        temp_output_file = os.path.join(temp, os.path.basename(self._output_file))
        with open(temp_output_file, "w") as handle:
            handle.write(lines)

        move_file(temp_output_file, self._output_file)
+
+
+
def _read_tree_files(filenames):
    """Parses every line of every file as a Newick tree; the trees are
    returned in file, then line, order."""
    trees = []
    for fname in filenames:
        with open(fname) as source:
            trees.extend(Newick.from_string(entry) for entry in source)
    return trees
diff --git a/paleomix/nodes/phylip.py b/paleomix/nodes/phylip.py
new file mode 100644
index 0000000..d41b1e2
--- /dev/null
+++ b/paleomix/nodes/phylip.py
@@ -0,0 +1,188 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import re
+import random
+
+from paleomix.node import \
+ Node, \
+ NodeError
+from paleomix.common.fileutils import \
+ move_file, \
+ reroot_path
+
+
+
+
class PHYLIPBootstrapNode(Node):
    """Generates a bootstrap alignment for a partition PHYLIP file;

    Note that only the PHYLIP / partitions format produced by the Node
    FastaToPartitionedInterleavedPhyNode is supported, in addition to the
    formats produced by RAxMLReduceNode.

    Parameters:
      -- input_alignment  - The input alignment file in PHYLIP format
      -- input_partition  - The input partition file in RAxML format
      -- output_alignment - The output alignment file in PHYLIP format
                            The simple (RAxML like) sequential format is used.
      -- seed             - RNG seed for selecting alignment columns; if None,
                            a system-provided (non-reproducible) seed is used."""

    def __init__(self, input_alignment, input_partition, output_alignment,
                 seed = None, dependencies = ()):
        self._input_phy = input_alignment
        self._input_part = input_partition
        self._output_phy = output_alignment
        self._seed = seed

        Node.__init__(self,
                      description = "<PHYLIPBootstrap: %r -> %r>" \
                          % (input_alignment, output_alignment),
                      input_files = (input_alignment, input_partition),
                      output_files = (output_alignment,),
                      dependencies = dependencies)

    def _run(self, _config, temp):
        # BUGFIX: previously 'rng' was only assigned when a seed was given,
        # causing a NameError below for the default seed=None; Random(None)
        # simply seeds from a system source.
        rng = random.Random(self._seed)
        partitions = _read_partitions(self._input_part)
        header, names, sequences = _read_sequences(self._input_phy)
        bootstraps = self._bootstrap_sequences(sequences, partitions, rng)

        # Write the resampled alignment in simple sequential PHYLIP format
        temp_fpath = reroot_path(temp, self._output_phy)
        with open(temp_fpath, "w") as output_phy:
            output_phy.write(header)

            for (name, fragments) in zip(names, bootstraps):
                output_phy.write(name)
                output_phy.write(" ")
                for sequence in fragments:
                    output_phy.write(sequence)
                output_phy.write("\n")

        move_file(temp_fpath, self._output_phy)

    @classmethod
    def _bootstrap_sequences(cls, sequences, partitions, rng):
        """Resamples each partition independently: alignment columns within
        each (start, end) region are sampled with replacement; returns a
        list (one per sequence) of lists of resampled partition strings."""
        final_partitions = [[] for _ in sequences]
        for (start, end) in partitions:
            # Convert alignment to columns, and randomly select among those;
            # 'list' is required so the columns can be sampled repeatedly
            # (zip returns a one-shot iterator on Python 3)
            columns = list(zip(*(sequence[start:end] for sequence in sequences)))
            bootstrap_partition = (rng.choice(columns) for _ in columns)

            # Convert randomly selected columns back into sequences
            for (dest, partition) in zip(final_partitions,
                                         zip(*bootstrap_partition)):
                dest.append("".join(partition))

        return final_partitions
+
+
+
+_RE_PARTITION = re.compile(r"^[A-Z]+, [^ ]+ = (\d+)-(\d+)$")
+_RE_PARTITION_SINGLE = re.compile(r"^[A-Z]+, [^ ]+ = (\d+)$")
+
+def _read_partitions(filename):
+ """Read a partition file, as produced by the pipeline itself, and
+ returns a list of tuples containing the (start, end) coordinates;
+ each line is expected to follow the following format:
+
+ DNA, Name = Start-End
+
+ Multiple regions, or skips are not supported."""
+ partitions = []
+ with open(filename) as handle:
+ for (line_num, line) in enumerate(handle):
+ result = _RE_PARTITION.match(line.rstrip())
+ if result:
+ start, end = result.groups()
+ else:
+ result = _RE_PARTITION_SINGLE.match(line.rstrip())
+ if not result:
+ message = ("Line %i in partitions file does not follow "
+ "expected format:\n"
+ " Expected, either = 'DNA, Name = Start-End'\n"
+ " or = 'DNA, Name = Start'\n"
+ " Found = %r") % (line_num, line.rstrip())
+ raise NodeError(message)
+ start, = result.groups()
+ end = start
+
+ partitions.append((int(start) - 1, int(end)))
+ return partitions
+
+
+def _read_sequences(filename):
+ """Collects the sequences from a PHYLIP file, and returns the header,
+ the names of the sequences, and the sequences themselves. The parser
+ supports interleaved sequences (as produced by the pipeline), or simple
+ sequential (each paired name and sequence on a single line) as produced
+ by RAxML's reduce functionality. PHYLIP files containing multiple entries
+ are not supported."""
+ line, header = " ", None
+ with open(filename) as handle:
+ # Find header
+ num_sequences = num_bases = 0
+ while line:
+ line = handle.readline()
+ if line.strip():
+ header = line
+ num_sequences, num_bases = map(int, line.split())
+ break
+
+ names = [None for _ in xrange(num_sequences)]
+ sequences = [[] for _ in xrange(num_sequences)]
+
+ line_num = 0
+ while line:
+ line = handle.readline()
+ line_strip = line.strip()
+ if line_strip:
+ # The first N sequences are expected to contain sample names
+ index = line_num % num_sequences
+ if line_num < num_sequences:
+ name, line_strip = line_strip.split(None, 1)
+ names[index] = name
+
+ sequences[index].extend(line_strip.split())
+ line_num += 1
+
+ if len(sequences) != num_sequences:
+ message = ("Expected %i sequences, but found %i in PHYLIP file:\n"
+ " Filename = %r") % (num_sequences,
+ len(sequences),
+ filename)
+ raise NodeError(message)
+
+ for (index, fragments) in enumerate(sequences):
+ sequences[index] = "".join(fragments)
+ if len(sequences[index]) != num_bases:
+ message = ("Expected %ibp sequences, found %ibp sequence for %r\n"
+ " Filename = %r") % (num_bases,
+ len(sequences[index]),
+ names[index],
+ filename)
+ raise NodeError(message)
+
+ return header, names, sequences
diff --git a/paleomix/nodes/picard.py b/paleomix/nodes/picard.py
new file mode 100644
index 0000000..8641383
--- /dev/null
+++ b/paleomix/nodes/picard.py
@@ -0,0 +1,299 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import getpass
+
+from paleomix.node import CommandNode
+from paleomix.atomiccmd.builder import \
+ AtomicJavaCmdBuilder, \
+ create_customizable_cli_parameters, \
+ use_customizable_cli_parameters
+from paleomix.common.fileutils import \
+ swap_ext, \
+ try_rmtree, \
+ try_remove, \
+ reroot_path, \
+ describe_files
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+import paleomix.common.versions as versions
+import paleomix.common.system
+
+
class PicardNode(CommandNode):
    """Base class for nodes using Picard Tools; adds an additional cleanup
    step, in order to allow the jars to be run using the same temporary folder
    as any other commands associated with the node. This is necessary as some
    Picard tools create a large number of temporary files, leading to potential
    performance issues if these are located in the same folder.
    """

    def _teardown(self, config, temp):
        username = getpass.getuser()
        # Picard creates a folder named after the user in the temp-root, and
        # some JREs create a folder for temporary performance counters
        for subfolder in (username, "hsperfdata_" + username):
            try_rmtree(os.path.join(temp, subfolder))

        CommandNode._teardown(self, config, temp)
+
+
class ValidateBAMNode(PicardNode):
    """Runs Picard's ValidateSamFile on a BAM, capturing the report written
    to STDOUT in a log file (defaults to the input name + '.validated')."""

    @create_customizable_cli_parameters
    def customize(cls, config, input_bam, output_log=None, dependencies=()):
        params = picard_command(config, "ValidateSamFile")
        # Cap the number of temporary files Picard may keep open at once
        _set_max_open_files(params, "MAX_OPEN_TEMP_FILES")

        params.set_option("I", "%(IN_BAM)s", sep="=")

        # The validation report is written to STDOUT; capture it in the log
        output_log = output_log or swap_ext(input_bam, ".validated")
        params.set_kwargs(IN_BAM=input_bam,
                          OUT_STDOUT=output_log)

        return {"command": params,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        description = "<Validate BAM: '%s'>" % (parameters.input_bam,)
        PicardNode.__init__(self,
                            command=parameters.command.finalize(),
                            description=description,
                            dependencies=parameters.dependencies)
+
+
class BuildSequenceDictNode(PicardNode):
    """Runs Picard's CreateSequenceDictionary on a FASTA reference, writing
    the resulting .dict file next to the reference."""

    @create_customizable_cli_parameters
    def customize(cls, config, reference, dependencies=()):
        params = picard_command(config, "CreateSequenceDictionary")

        # The reference is accessed via a symlink created in the temporary
        # folder (see _setup); only the final .dict is written in place
        params.set_option("R", "%(TEMP_OUT_REF)s", sep="=")
        params.set_option("O", "%(OUT_DICT)s", sep="=")
        params.set_kwargs(IN_REF=reference,
                          TEMP_OUT_REF=os.path.basename(reference),
                          OUT_DICT=swap_ext(reference, ".dict"))

        return {"command": params,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # Absolute path is needed, since the symlink is created in a
        # different (temporary) directory
        self._in_reference = os.path.abspath(parameters.reference)
        description = "<SequenceDictionary: '%s'>" % (parameters.reference,)

        PicardNode.__init__(self,
                            command=parameters.command.finalize(),
                            description=description,
                            dependencies=parameters.dependencies)

    def _setup(self, _config, temp):
        # Expose the reference inside the temporary folder under its basename
        os.symlink(self._in_reference, reroot_path(temp, self._in_reference))
+
+
class MarkDuplicatesNode(PicardNode):
    """Runs Picard's MarkDuplicates on one or more BAMs, producing a single
    output BAM plus .bai index and a metrics file; duplicates are removed
    from the output unless 'keep_dupes' is set, in which case they are only
    flagged."""

    @create_customizable_cli_parameters
    def customize(cls, config, input_bams, output_bam, output_metrics=None,
                  keep_dupes=False, dependencies=()):
        params = picard_command(config, "MarkDuplicates")
        # Cap the number of file handles Picard may keep open at once
        _set_max_open_files(params, "MAX_FILE_HANDLES")

        # Create .bai index, since it is required by a lot of other programs
        params.set_option("CREATE_INDEX", "True", sep="=")

        params.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        params.set_option("METRICS_FILE", "%(OUT_METRICS)s", sep="=")
        params.add_multiple_options("I", input_bams, sep="=")

        if not keep_dupes:
            # Remove duplicates from output by default to save disk-space;
            # fixed=False leaves the option overridable by the user
            params.set_option("REMOVE_DUPLICATES", "True",
                              sep="=", fixed=False)

        output_metrics = output_metrics or swap_ext(output_bam, ".metrics")
        params.set_kwargs(OUT_BAM=output_bam,
                          OUT_BAI=swap_ext(output_bam, ".bai"),
                          OUT_METRICS=output_metrics)

        return {"command": params,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        description = "<MarkDuplicates: %s>" \
            % (describe_files(parameters.input_bams),)
        PicardNode.__init__(self,
                            command=parameters.command.finalize(),
                            description=description,
                            dependencies=parameters.dependencies)
+
+
class MergeSamFilesNode(PicardNode):
    """Runs Picard's MergeSamFiles, merging one or more BAMs into a single
    coordinate-sorted BAM with a .bai index."""

    @create_customizable_cli_parameters
    def customize(cls, config, input_bams, output_bam, dependencies=()):
        params = picard_command(config, "MergeSamFiles")

        params.set_option("OUTPUT", "%(OUT_BAM)s", sep="=")
        params.set_option("CREATE_INDEX", "True", sep="=")
        # Sort-order is overridable (fixed=False) but defaults to coordinate
        params.set_option("SO", "coordinate", sep="=", fixed=False)
        params.add_multiple_options("I", input_bams, sep="=")

        params.set_kwargs(OUT_BAM=output_bam,
                          OUT_BAI=swap_ext(output_bam, ".bai"))

        return {"command": params,
                "dependencies": dependencies}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        description = "<Merge BAMs: %i file(s) -> '%s'>" \
            % (len(parameters.input_bams), parameters.output_bam)
        PicardNode.__init__(self,
                            command=parameters.command.finalize(),
                            description=description,
                            dependencies=parameters.dependencies)
+
+
class MultiBAMInput(object):
    """Container used to ease processing of 1 or more BAM files; used in
    conjunction with MultiBAMInputNode.

    For multiple inputs, a Picard MergeSamFiles command is prepared which
    writes the merged BAM to the named pipe; for a single input, the file
    (and optionally its .bai index) is exposed via keyword arguments only.
    """

    def __init__(self, config, input_bams, pipename="input.bam", indexed=True):
        # pipe -- filename (relative to temp folder) of the merged BAM
        # indexed -- whether a .bai index accompanies a single input BAM
        self.pipe = pipename
        self.indexed = indexed
        self.files = safe_coerce_to_tuple(input_bams)

        self.commands = []
        self.kwargs = {"TEMP_IN_BAM": self.pipe}
        if len(self.files) > 1:
            params = picard_command(config, "MergeSamFiles")

            params.set_option("SO", "coordinate", sep="=", fixed=False)
            # No index is created and no compression is applied, since the
            # output goes to a pipe consumed immediately by another command
            params.set_option("CREATE_INDEX", "False", sep="=")
            params.set_option("COMPRESSION_LEVEL", 0, sep="=")
            params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
            params.add_multiple_options("I", input_bams, sep="=")

            params.set_kwargs(TEMP_OUT_BAM=self.pipe)

            self.commands = [params.finalize()]
        else:
            # Ensure that the actual command depends on the input
            self.kwargs["IN_FILE_00"] = self.files[0]

            if indexed:
                self.kwargs["IN_FILE_01"] = swap_ext(self.files[0], ".bai")

    def setup(self, command):
        """Registers the input-file keyword arguments with 'command'."""
        command.set_kwargs(**self.kwargs)
+
+
class MultiBAMInputNode(CommandNode):
    """Node which provides concatenation of input BAM files. Takes a
    MultiBAMInput object, and creates a pipe in the temporary folder which
    yields the concatenated BAM resulting from the concatenation of all input
    files. To avoid unnecessary overhead, a symbolic link is used in the case
    where there is only a single input file.

    Usage example:
      class ExampleNode(MultiBAMInputNode):
          def __init__(self, config, input_bams):
              bam_input = MultiBAMInput(config, input_bams)
              command = AtomicCmd(['analyse_bam', '%(TEMP_IN_BAM)s'],
                                  TEMP_IN_BAM=bam_input.pipe)
              commands = ParallelCmds(bam_input.commands + [command])
              MultiBAMInputNode.__init__(bam_input=bam_input,
                                         command=commands)
    """

    def __init__(self, bam_input, *args, **kwargs):
        self._bam_input = bam_input
        CommandNode.__init__(self, *args, **kwargs)

    def _setup(self, config, temp_root):
        CommandNode._setup(self, config, temp_root)
        dst_fname = os.path.join(temp_root, self._bam_input.pipe)
        if len(self._bam_input.files) > 1:
            # Multiple inputs are merged on the fly; the merge command (see
            # MultiBAMInput) writes the combined BAM to this named pipe
            os.mkfifo(dst_fname)
        else:
            # A single input is simply symlinked into place, together with
            # its .bai index when one is expected
            src_fname, = self._bam_input.files
            os.symlink(os.path.join(os.getcwd(), src_fname), dst_fname)

            if self._bam_input.indexed:
                src_fname = os.path.join(os.getcwd(), swap_ext(src_fname, ".bai"))
                os.symlink(src_fname, dst_fname + ".bai")

    def _teardown(self, config, temp_root):
        # Remove the pipe / symlinks, so they are not mistaken for output
        pipe_fname = os.path.join(temp_root, self._bam_input.pipe)
        os.remove(pipe_fname)
        try_remove(pipe_fname + ".bai")
        CommandNode._teardown(self, config, temp_root)
+
+
+###############################################################################
+
_PICARD_JAR = "picard.jar"

# Maps jar-path -> version-check Requirement, so that the version of a given
# Picard jar is only checked once per pipeline run
_PICARD_VERSION_CACHE = {}


def picard_command(config, command):
    """Returns basic AtomicJavaCmdBuilder for Picard tools commands.

    The returned builder invokes the given Picard sub-command using the
    'picard.jar' found in config.jar_root, and carries a CHECK_JAR
    requirement ensuring Picard tools >= 1.124."""
    jar_path = os.path.join(config.jar_root, _PICARD_JAR)

    if jar_path not in _PICARD_VERSION_CACHE:
        params = AtomicJavaCmdBuilder(jar_path,
                                      temp_root=config.temp_root,
                                      jre_options=config.jre_options)

        # Arbitrary command, since just '--version' does not work
        params.set_option("MarkDuplicates")
        params.set_option("--version")

        requirement = versions.Requirement(call=params.finalized_call,
                                           name="Picard tools",
                                           search=r"^(\d+)\.(\d+)",
                                           checks=versions.GE(1, 124))
        _PICARD_VERSION_CACHE[jar_path] = requirement

    version = _PICARD_VERSION_CACHE[jar_path]
    params = AtomicJavaCmdBuilder(jar_path,
                                  temp_root=config.temp_root,
                                  jre_options=config.jre_options,
                                  CHECK_JAR=version)
    params.set_option(command)

    return params
+
+
# Fraction of per-process max open files to use
_FRAC_MAX_OPEN_FILES = 0.95


def _set_max_open_files(params, key):
    """Sets 'key' to a value slightly below the process' open-files limit;
    if the limit cannot be determined, the Picard default is left as-is."""
    limit = paleomix.common.system.get_max_open_files()
    if limit is not None:
        params.set_option(key, int(limit * _FRAC_MAX_OPEN_FILES), sep="=")
diff --git a/paleomix/nodes/raxml.py b/paleomix/nodes/raxml.py
new file mode 100644
index 0000000..65bb8e8
--- /dev/null
+++ b/paleomix/nodes/raxml.py
@@ -0,0 +1,350 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import re
+import random
+
+import paleomix.common.fileutils as fileutils
+import paleomix.common.versions as versions
+
+from paleomix.node import CommandNode
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters
+
+
# Version requirements for the single-threaded and PTHREADS builds of
# RAxML; at least version 7.3.2 of either executable is required
RAXML_VERSION = versions.Requirement(call = ("raxmlHPC", "-version"),
                                     search = r"version (\d+)\.(\d+)\.(\d+)",
                                     checks = versions.GE(7, 3, 2))
RAXML_PTHREADS_VERSION = versions.Requirement(call = ("raxmlHPC-PTHREADS", "-version"),
                                              search = r"version (\d+)\.(\d+)\.(\d+)",
                                              checks = versions.GE(7, 3, 2))
+
+
class RAxMLReduceNode(CommandNode):
    """Runs 'raxmlHPC -f c' to check (and, where columns are empty, reduce)
    an alignment and its partitions; the checked files are saved under the
    requested output names whether or not RAxML produced .reduced copies."""

    @create_customizable_cli_parameters
    def customize(cls, input_alignment, input_partition, output_alignment, output_partition, dependencies = ()):
        command = AtomicCmdBuilder("raxmlHPC")

        # Read and (in the case of empty columns) reduce input
        command.set_option("-f", "c")
        # Output files are saved with a .Pypeline postfix, and subsequently renamed
        command.set_option("-n", "Pypeline")
        # Model required, but not used
        command.set_option("-m", "GTRGAMMA")
        # Ensures that output is saved to the temporary directory
        command.set_option("-w", "%(TEMP_DIR)s")

        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside temp folder; in addition, it may be
        # necessary to remove the .reduced files if created
        command.set_option("-s", "%(TEMP_IN_ALIGNMENT)s")
        command.set_option("-q", "%(TEMP_IN_PARTITION)s")

        command.set_kwargs(IN_ALIGNMENT = input_alignment,
                           IN_PARTITION = input_partition,

                           TEMP_IN_ALIGNMENT = "RAxML_alignment",
                           TEMP_IN_PARTITION = "RAxML_partitions",
                           TEMP_OUT_INFO = "RAxML_info.Pypeline",

                           OUT_ALIGNMENT = output_alignment,
                           OUT_PARTITION = output_partition,
                           CHECK_VERSION = RAXML_VERSION)

        return {"command" : command}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # The kwargs are kept, as _setup/_teardown need the file mappings
        self._kwargs = parameters.command.kwargs
        CommandNode.__init__(self,
                             command = parameters.command.finalize(),
                             description = "<RAxMLReduce: '%s' -> '%s'>" \
                                 % (parameters.input_alignment, parameters.output_alignment),
                             dependencies = parameters.dependencies)

    def _setup(self, config, temp):
        # Symlink the input files into the temp folder under fixed names
        for key in ("IN_ALIGNMENT", "IN_PARTITION"):
            source = os.path.abspath(self._kwargs[key])
            destination = os.path.join(temp, self._kwargs["TEMP_" + key])

            os.symlink(source, destination)

        CommandNode._setup(self, config, temp)

    def _teardown(self, config, temp):
        # For each of the alignment / partition files, chain-copy
        #   input -> input.reduced -> final output
        # skipping any step whose destination already exists; this guarantees
        # the final output exists whether or not RAxML wrote a .reduced file,
        # after which the intermediate files are removed
        for postfix in ("ALIGNMENT", "PARTITION"):
            filenames = [self._kwargs["TEMP_IN_" + postfix],
                         self._kwargs["TEMP_IN_" + postfix] + ".reduced",
                         self._kwargs["OUT_" + postfix]]

            for (source, destination) in zip(filenames, filenames[1:]):
                source = fileutils.reroot_path(temp, source)
                destination = fileutils.reroot_path(temp, destination)

                if not os.path.exists(destination):
                    fileutils.copy_file(source, destination)
                os.remove(source)

        CommandNode._teardown(self, config, temp)
+
+
class RAxMLBootstrapNode(CommandNode):
    """Runs 'raxmlHPC -f j' to generate bootstrap alignments from a
    partitioned alignment; each replicate is renamed during teardown
    according to the 'template' string (one '%i'-style placeholder)."""

    @create_customizable_cli_parameters
    def customize(cls, input_alignment, input_partition, template, start = 0, bootstraps = 50, dependencies = ()):
        command = AtomicCmdBuilder("raxmlHPC", set_cwd = True)

        # Generate bootstrapped alignments ('-f j')
        command.set_option("-f", "j")
        # Output files are saved with a .Pypeline postfix, and subsequently renamed
        command.set_option("-n", "Pypeline")
        # Model required, but not used
        command.set_option("-m", "GTRGAMMA")
        # Set random seed for bootstrap generation. May be set to a fixed value to allow replicability.
        command.set_option("-b", int(random.random() * 2**31 - 1), fixed = False)
        # Generate a single bootstrap alignment (makes growing the number of bootstraps easier).
        command.set_option("-N", int(bootstraps), fixed = False)

        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside temp folder; in addition, it may be
        # necessary to remove the .reduced files if created
        command.set_option("-s", "input.alignment")
        command.set_option("-q", "input.partition")

        bootstrap_files = {"IN_ALIGNMENT" : input_alignment,
                           "IN_PARTITION" : input_partition,
                           "TEMP_OUT_INF" : "RAxML_info.Pypeline",
                           "TEMP_OUT_ALN" : "input.alignment",
                           "TEMP_OUT_PAR" : "input.partition",
                           "CHECK_VERSION": RAXML_VERSION}

        # Register every final bootstrap file as an output of the command
        for (index, (_, filename)) in enumerate(cls._bootstraps(template, bootstraps, start)):
            bootstrap_files["OUT_BS_%03i" % index] = filename
        command.set_kwargs(**bootstrap_files)

        return {"command" : command}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        self._input_alignment = parameters.input_alignment
        self._input_partition = parameters.input_partition
        self._output_template = parameters.template
        self._bootstrap_num = parameters.bootstraps
        self._bootstrap_start = parameters.start

        CommandNode.__init__(self,
                             command = parameters.command.finalize(),
                             description = "<RAxMLBootstrap: '%s' -> '%s' (%i .. %i>" \
                                 % (parameters.input_alignment, parameters.template,
                                    parameters.start, parameters.start + parameters.bootstraps - 1),
                             dependencies = parameters.dependencies)

    def _setup(self, config, temp):
        # Expose the inputs inside the temp folder under the fixed names
        # passed via '-s' / '-q' above
        # NOTE(review): CommandNode._setup is not invoked here, unlike the
        # other nodes in this module -- confirm this is intentional
        os.symlink(os.path.realpath(self._input_alignment), os.path.join(temp, "input.alignment"))
        os.symlink(os.path.realpath(self._input_partition), os.path.join(temp, "input.partition"))

    def _teardown(self, config, temp):
        # Rename RAxML's 'input.alignment.BS<n>' replicates to the final
        # filenames produced from the output template
        template = self._output_template
        bootstraps = self._bootstrap_num
        start = self._bootstrap_start
        for (src_file, dst_file) in self._bootstraps(template, bootstraps, start):
            src_file = os.path.join(temp, src_file)
            dst_file = fileutils.reroot_path(temp, dst_file)
            fileutils.move_file(src_file, dst_file)
        CommandNode._teardown(self, config, temp)

    @classmethod
    def _bootstraps(cls, template, number, start):
        """Yields (raxml_filename, final_filename) for each replicate."""
        for bootstrap in range(number):
            src_file = "input.alignment.BS%i" % (bootstrap,)
            dst_file = template % (bootstrap + start,)
            yield (src_file, dst_file)
+
+
class RAxMLRapidBSNode(CommandNode):
    """Node running a RAxML rapid-bootstrap analysis ('-f a'), which performs
    bootstrapping and a best-tree (ML) search in a single RAxML run.
    """

    @create_customizable_cli_parameters
    def customize(cls, input_alignment, output_template, input_partition=None,
                  threads=1, dependencies=()):
        """
        Arguments:
        input_alignment  -- An alignment file in a format readable by RAxML.
        input_partition  -- A set of partitions in a format readable by RAxML.
        output_template  -- A template string used to construct final
                            filenames. Should consist of a full path,
                            including a single '%s', which is replaced with
                            the variable part of RAxML output files
                            (e.g. 'info', 'bestTree', ...).
                            Example destination: '/disk/project/SN013420.RAxML.%s'
                            Example output: '/disk/project/SN013420.RAxML.bestTree'
        """

        if threads > 1:
            # Use the PThreads build of RAxML when multiple threads are requested
            command = AtomicCmdBuilder("raxmlHPC-PTHREADS")
            command.set_option("-T", threads)
            version = RAXML_PTHREADS_VERSION
        else:
            command = AtomicCmdBuilder("raxmlHPC")
            version = RAXML_VERSION

        # Perform rapid bootstrapping
        command.set_option("-f", "a")
        # Output files are saved with a .PALEOMIX postfix, and subsequently renamed
        command.set_option("-n", "PALEOMIX")
        # Ensures that output is saved to the temporary directory
        command.set_option("-w", "%(TEMP_DIR)s")
        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside temp folder.
        # In addition, it may be necessary to remove the .reduced files if created
        command.set_option("-s", "%(TEMP_OUT_ALN)s")

        if input_partition is not None:
            command.set_option("-q", "%(TEMP_OUT_PART)s")
            command.set_kwargs(IN_PARTITION=input_partition,
                               TEMP_OUT_PART=os.path.basename(input_partition),
                               TEMP_OUT_PART_R=os.path.basename(input_partition) + ".reduced")

        command.set_kwargs(  # Auto-delete: Symlinks and .reduced files that RAxML may generate
            TEMP_OUT_ALN=os.path.basename(input_alignment),
            TEMP_OUT_ALN_R=os.path.basename(input_alignment) + ".reduced",

            # Input files, are not used directly (see below)
            IN_ALIGNMENT=input_alignment,

            # Final output files, are not created directly
            OUT_INFO=output_template % "info",
            OUT_BESTTREE=output_template % "bestTree",
            OUT_BOOTSTRAP=output_template % "bootstrap",
            OUT_BIPART=output_template % "bipartitions",
            OUT_BIPARTLABEL=output_template % "bipartitionsBranchLabels",

            CHECK_VERSION=version)

        # Use the GTRGAMMA model of NT substitution by default
        command.set_option("-m", "GTRGAMMAI", fixed=False)
        # Enable rapid bootstrapping and set random seed. May be set to a
        # fixed value to allow replicability.
        command.set_option("-x", int(random.random() * 2**31 - 1), fixed=False)
        # Set random seed for parsimony inference. May be set to a fixed
        # value to allow replicability.
        command.set_option("-p", int(random.random() * 2**31 - 1), fixed=False)
        # Terminate bootstrapping upon convergence, rather than after a
        # fixed number of repetitions
        command.set_option("-N", "autoMRE", fixed=False)

        return {"command": command}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        # Input files to be symlinked into the temp dir by _setup; may
        # contain None if no partition file was given.
        self._symlinks = [parameters.input_alignment,
                          parameters.input_partition]
        self._template = os.path.basename(parameters.output_template)

        CommandNode.__init__(self,
                             command=parameters.command.finalize(),
                             description="<RAxMLRapidBS: '%s' -> '%s'>"
                             % (parameters.input_alignment,
                                parameters.output_template % ("*",)),
                             threads=parameters.threads,
                             dependencies=parameters.dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)

        # Required to avoid the creation of files outside the temp folder
        for filename in self._symlinks:
            if filename is not None:
                source = os.path.abspath(filename)
                destination = os.path.join(temp, os.path.basename(filename))

                os.symlink(source, destination)

    def _teardown(self, config, temp):
        # Rename each "RAxML_<kind>.PALEOMIX" output to the user-supplied
        # template filled with <kind>, prior to the base-class teardown.
        for filename in os.listdir(temp):
            match = re.match("RAxML_(.*).PALEOMIX", filename)
            if match:
                source = os.path.join(temp, filename)
                destination = os.path.join(temp, self._template % match.groups())

                fileutils.move_file(source, destination)

        CommandNode._teardown(self, config, temp)
+
+
class RAxMLParsimonyTreeNode(CommandNode):
    """Node generating a single randomized parsimony starting tree ('-y')
    using RAxML; the resulting tree is written to 'output_tree'.
    """

    @create_customizable_cli_parameters
    def customize(cls, input_alignment, input_partitions, output_tree,
                  dependencies=()):
        command = AtomicCmdBuilder("raxmlHPC")

        # Compute a randomized parsimony starting tree
        command.set_option("-y")
        # Output files are saved with a .Pypeline postfix, and subsequently renamed
        command.set_option("-n", "Pypeline")
        # Model required, but not used
        command.set_option("-m", "GTRGAMMA")
        # Ensures that output is saved to the temporary directory
        command.set_option("-w", "%(TEMP_DIR)s")
        # Set random seed for bootstrap generation. May be set to a fixed
        # value to allow replicability.
        command.set_option("-p", int(random.random() * 2**31 - 1), fixed=False)

        # Symlink to sequence and partitions, to prevent the creation of
        # *.reduced files outside temp folder
        command.set_option("-s", "%(TEMP_OUT_ALIGNMENT)s")
        command.set_option("-q", "%(TEMP_OUT_PARTITION)s")

        command.set_kwargs(IN_ALIGNMENT=input_alignment,
                           IN_PARTITION=input_partitions,

                           # TEMP_OUT_ is used to automatically remove these files
                           TEMP_OUT_ALIGNMENT="RAxML_alignment",
                           TEMP_OUT_PARTITION="RAxML_partitions",
                           TEMP_OUT_INFO="RAxML_info.Pypeline",

                           OUT_TREE=output_tree,

                           CHECK_VERSION=RAXML_VERSION)

        return {"command": command}

    @use_customizable_cli_parameters
    def __init__(self, parameters):
        self._input_alignment = parameters.input_alignment
        self._input_partitions = parameters.input_partitions
        self._output_tree = parameters.output_tree

        CommandNode.__init__(self,
                             command=parameters.command.finalize(),
                             description="<RAxMLParsimonyTree: '%s' -> '%s'>"
                             % (parameters.input_alignment,
                                parameters.output_tree),
                             dependencies=parameters.dependencies)

    def _setup(self, config, temp):
        # Symlink the inputs into the temp dir under the fixed names used by
        # the command; prevents RAxML from writing *.reduced files next to
        # the real input files.
        os.symlink(os.path.abspath(self._input_alignment),
                   os.path.join(temp, "RAxML_alignment"))
        os.symlink(os.path.abspath(self._input_partitions),
                   os.path.join(temp, "RAxML_partitions"))
        CommandNode._setup(self, config, temp)

    def _teardown(self, config, temp):
        # Rename the fixed RAxML output filename to the requested filename,
        # so that the base-class teardown can move it into place.
        basename = os.path.basename(self._output_tree)
        os.rename(os.path.join(temp, "RAxML_parsimonyTree.Pypeline"),
                  os.path.join(temp, basename))

        CommandNode._teardown(self, config, temp)
diff --git a/paleomix/nodes/samtools.py b/paleomix/nodes/samtools.py
new file mode 100644
index 0000000..d9fa409
--- /dev/null
+++ b/paleomix/nodes/samtools.py
@@ -0,0 +1,203 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.node import CommandNode
+from paleomix.atomiccmd.command import AtomicCmd
+from paleomix.atomiccmd.sets import SequentialCmds
+
+from paleomix.common.fileutils import reroot_path, swap_ext
+import paleomix.common.versions as versions
+
+
# Extracts (major, minor, patch) from samtools/bcftools/tabix version
# strings; the patch component is optional.
_VERSION_REGEX = r"Version: (\d+)\.(\d+)(?:\.(\d+))?"

# v0.2.0 was the pre-release version of v1.0, and lacks required features
_COMMON_CHECK = versions.Or(versions.EQ(0, 1, 19),
                            versions.GE(1, 0, 0))

# Any supported samtools version (exactly 0.1.19, or 1.0+)
SAMTOOLS_VERSION = versions.Requirement(call=("samtools",),
                                        search=_VERSION_REGEX,
                                        checks=_COMMON_CHECK)

# Requirements for nodes that depend on the legacy 0.1.19 interface
SAMTOOLS_VERSION_0119 = versions.Requirement(call=("samtools",),
                                             search=_VERSION_REGEX,
                                             checks=versions.EQ(0, 1, 19))

BCFTOOLS_VERSION_0119 \
    = versions.Requirement(call=("bcftools",),
                           search=_VERSION_REGEX,
                           checks=versions.EQ(0, 1, 19))

TABIX_VERSION = versions.Requirement(call=("tabix",),
                                     search=_VERSION_REGEX,
                                     checks=versions.GE(0, 2, 5))
+
+
class TabixIndexNode(CommandNode):
    """Tabix indexes a BGZip compressed VCF or pileup file.

    The class currently supports the following presets:
        - vcf -- BGZipped VCF file.
        - pileup -- BGZipped pileup (non-binary) as produced by 'mpileup'.
    """

    def __init__(self, infile, preset="vcf", dependencies=()):
        assert infile.lower().endswith(".bgz")
        if preset == "pileup":
            # The pileup format has no tabix preset; specify the columns
            # manually: sequence name in col 1, position in col 2.
            call = ["tabix", "-s", 1, "-b", 2, "-e", 2]
        elif preset == "vcf":
            call = ["tabix", "-p", preset]
        else:
            # BUG FIX: corrected "Unxpected" typo in the assertion message
            assert False, "Unexpected preset: %r" % preset

        self._infile = infile
        # The input file is symlinked into the temp folder (see _setup), so
        # that the index is built there and moved into place on completion.
        cmd_tabix = AtomicCmd(call + ["%(TEMP_IN_VCFFILE)s"],
                              TEMP_IN_VCFFILE=os.path.basename(infile),
                              IN_VCFFILE=infile,
                              OUT_TBI=infile + ".tbi",
                              CHECK_TABIX=TABIX_VERSION)

        CommandNode.__init__(self,
                             description="<TabixIndex (%s): '%s'>" % (preset,
                                                                      infile,),
                             command=cmd_tabix,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        """See CommandNode._setup."""
        infile = os.path.abspath(self._infile)
        outfile = reroot_path(temp, self._infile)
        os.symlink(infile, outfile)

        CommandNode._setup(self, config, temp)

    def _teardown(self, config, temp):
        """See CommandNode._teardown."""
        # Remove the symlink created in _setup before the generic teardown
        os.remove(reroot_path(temp, self._infile))

        CommandNode._teardown(self, config, temp)
+
+
class FastaIndexNode(CommandNode):
    """Indexes a FASTA file using 'samtools faidx'."""

    def __init__(self, infile, dependencies=()):
        self._infile = infile
        # The FASTA file is symlinked into the temp folder (see _setup), so
        # that the .fai index is built there and moved into place on
        # completion. NOTE: the output kwarg was renamed from the misleading
        # OUT_TBI to OUT_FAI; the produced index is a .fai file, not a .tbi.
        cmd_faidx = AtomicCmd(["samtools", "faidx", "%(TEMP_IN_FASTA)s"],
                              TEMP_IN_FASTA=os.path.basename(infile),
                              IN_FASTA=infile,
                              OUT_FAI=infile + ".fai",
                              CHECK_SAM=SAMTOOLS_VERSION)

        CommandNode.__init__(self,
                             description="<FastaIndex: '%s'>" % (infile,),
                             command=cmd_faidx,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        """See CommandNode._setup."""
        infile = os.path.abspath(self._infile)
        outfile = reroot_path(temp, self._infile)
        os.symlink(infile, outfile)

        CommandNode._setup(self, config, temp)

    def _teardown(self, config, temp):
        """See CommandNode._teardown."""
        # Remove the symlink created in _setup before the generic teardown
        os.remove(reroot_path(temp, self._infile))

        CommandNode._teardown(self, config, temp)
+
+
class BAMIndexNode(CommandNode):
    """Indexes a BAM file using 'samtools index'."""

    def __init__(self, infile, dependencies=()):
        basename = os.path.basename(infile)

        # Link the BAM into the temp folder, so the index is built there
        cmd_link = AtomicCmd(["ln", "-s", "%(IN_BAM)s", "%(TEMP_OUT_BAM)s"],
                             IN_BAM=infile,
                             TEMP_OUT_BAM=basename,
                             set_cwd=True)

        cmd_index = AtomicCmd(["samtools", "index", "%(TEMP_IN_BAM)s"],
                              TEMP_IN_BAM=basename,
                              CHECK_SAM=SAMTOOLS_VERSION)

        # Register the resulting .bai as the output, placed next to 'infile'
        cmd_rename = AtomicCmd(["mv", "%(TEMP_IN_BAM)s", "%(OUT_BAM)s"],
                               TEMP_IN_BAM=basename + ".bai",
                               OUT_BAM=swap_ext(infile, ".bai"))

        # The three commands must run in order: link, index, rename
        commands = SequentialCmds((cmd_link, cmd_index, cmd_rename))

        CommandNode.__init__(self,
                             description="<BAMIndex: '%s'>" % (infile,),
                             command=commands,
                             dependencies=dependencies)
+
+
class RMDuplicatesNode(CommandNode):
    """Remove PCR duplicates from BAM file."""

    def __init__(self, input_bam, output_bam, se_reads=False, force_se=False,
                 dependencies=()):
        """Runs 'samtools rmdup' on 'input_bam', writing the result to
        'output_bam'; 'se_reads' enables removal for single-ended reads
        (-s), while 'force_se' treats paired-end reads as single-ended (-S).
        """
        flags = []
        if se_reads:
            flags.append("-s")
        if force_se:
            flags.append("-S")

        full_call = ["samtools", "rmdup"] + flags \
            + ["%(IN_BAM)s", "%(OUT_BAM)s"]
        command = AtomicCmd(full_call,
                            IN_BAM=input_bam,
                            OUT_BAM=output_bam,
                            CHECK_SAM=SAMTOOLS_VERSION)

        CommandNode.__init__(self,
                             description="<Samtools rmdup: %r -> %r>"
                             % (input_bam, output_bam),
                             command=command,
                             dependencies=dependencies)
+
+
class FilterBAMNode(CommandNode):
    """Filter BAM file using samtools view."""

    def __init__(self, input_bam, output_bam, require_flags=0, exclude_flags=0,
                 dependencies=()):
        """Writes reads from 'input_bam' to 'output_bam', keeping only reads
        with all bits of 'require_flags' set (-f) and no bits of
        'exclude_flags' set (-F).
        """
        call = ["samtools", "view", "-b"]
        for (option, flags) in (("-f", require_flags),
                                ("-F", exclude_flags)):
            if flags:
                call.extend((option, hex(flags)))

        command = AtomicCmd(call + ["%(IN_BAM)s"],
                            IN_BAM=input_bam,
                            OUT_STDOUT=output_bam,
                            CHECK_SAM=SAMTOOLS_VERSION)

        CommandNode.__init__(self,
                             description="<SAMTools view: %r -> %r>"
                             % (input_bam, output_bam),
                             command=command,
                             dependencies=dependencies)
diff --git a/paleomix/nodes/sequences.py b/paleomix/nodes/sequences.py
new file mode 100644
index 0000000..7023c16
--- /dev/null
+++ b/paleomix/nodes/sequences.py
@@ -0,0 +1,206 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import with_statement
+
+import os
+import copy
+import itertools
+import collections
+
+import pysam
+
+import paleomix.common.fileutils as fileutils
+import paleomix.common.utilities as utilities
+import paleomix.common.sequences as sequtils
+import paleomix.common.text as text
+
+from paleomix.common.formats.fasta import \
+ FASTA
+from paleomix.common.formats.msa import \
+ MSA
+from paleomix.node import \
+ NodeError, \
+ Node
+from paleomix.common.bedtools import \
+ BEDRecord
+
+
class CollectSequencesNode(Node):
    """Node collecting, for each named sequence, a multi-taxon FASTA file
    containing that sequence from every input (indexed) FASTA file."""

    def __init__(self, fasta_files, sequences, destination, dependencies=()):
        """
        fasta_files -- { taxon_name_1 : filename_1, ... }
        sequences -- { interval_name_1, ... }
        destination -- Directory in which one '<name>.fasta' file is written
                       per requested sequence name.
        """

        self._infiles = copy.deepcopy(fasta_files)
        self._sequences = utilities.safe_coerce_to_frozenset(sequences)
        self._destination = copy.copy(destination)
        self._outfiles = [os.path.join(destination, name + ".fasta")
                          for name in self._sequences]

        # Both the FASTA files and their FAI indexes are required inputs;
        # the indexes are checked in _setup and used for fetching in _run.
        input_files = list(self._infiles.itervalues())
        for filename in self._infiles.itervalues():
            input_files.append(filename + ".fai")

        desc = "<CollectSequences: %i sequences from %i files -> '%s'>" \
            % (len(self._sequences), len(self._infiles), self._destination)
        Node.__init__(self,
                      description=desc,
                      input_files=input_files,
                      output_files=self._outfiles,
                      dependencies=dependencies)

    def _setup(self, _config, _temp):
        # Verify up front (via the FAI indexes) that every input FASTA file
        # contains all of the requested sequences.
        for filename in self._infiles.itervalues():
            with open(filename + ".fai") as handle:
                sequences = set()
                for line in handle:
                    sequences.add(line.split("\t", 1)[0])

                missing_sequences = list(self._sequences - sequences)
                if missing_sequences:
                    # Report at most three missing names, to keep it readable
                    if len(missing_sequences) >= 4:
                        missing_sequences = missing_sequences[:3]
                        missing_sequences.append("...")

                    message = ("FASTA file does not contain expected "
                               "sequences:\n File = %r\n "
                               "Sequences = %s\n") \
                        % (filename, ", ".join(missing_sequences))
                    raise NodeError(message)

    def _run(self, _config, temp):
        fasta_files = []
        for (name, filename) in sorted(self._infiles.iteritems()):
            fasta_files.append((name, pysam.Fastafile(filename)))

        # One output file per sequence name; each record is named after the
        # taxon from which the sequence was fetched.
        for sequence_name in sorted(self._sequences):
            filename = os.path.join(temp, sequence_name + ".fasta")
            with open(filename, "w") as out_handle:
                for (sample, fasta_file) in fasta_files:
                    sequence = fasta_file.fetch(sequence_name)
                    fasta = FASTA(sample, sequence_name, sequence)
                    out_handle.write(str(fasta))

    def _teardown(self, _config, temp):
        # Move the completed files from the temp dir to their destinations
        for destination in sorted(self._outfiles):
            source = fileutils.reroot_path(temp, destination)
            fileutils.move_file(source, destination)
+
+
class FilterSingletonsNode(Node):
    """Node filtering singletons from an MSA, by calling
    MSA.filter_singletons for each configured taxon/group pair."""

    def __init__(self, input_file, output_file, filter_by, dependencies):
        # input_file  -- MSA file to be filtered
        # output_file -- destination of the filtered MSA
        # filter_by   -- dict mapping each taxon to filter to the group of
        #                taxa used to identify singletons in that taxon
        self._input_file = input_file
        self._output_file = output_file
        self._filter_by = dict(filter_by)
        for (to_filter, groups) in self._filter_by.items():
            # The taxa to be filtered is implied to be part of the group,
            # but is not needed when actually carrying out the filtering
            groups = utilities.safe_coerce_to_frozenset(groups) \
                - utilities.safe_coerce_to_frozenset(to_filter)

            if not groups:
                raise RuntimeError("Singleton filtering must involve at least "
                                   "one other taxa")
            self._filter_by[to_filter] = groups

        Node.__init__(self,
                      description="<FilterSingleton: '%s' -> '%s'>"
                      % (input_file, output_file),
                      input_files=[input_file],
                      output_files=[output_file],
                      dependencies=dependencies)

    def _run(self, _config, temp):
        alignment = MSA.from_file(self._input_file)
        # Apply each filter in turn to the (progressively filtered) MSA
        for (to_filter, groups) in self._filter_by.iteritems():
            alignment = alignment.filter_singletons(to_filter, groups)

        # Write to the temp dir and then move, so output appears atomically
        temp_filename = fileutils.reroot_path(temp, self._output_file)
        with open(temp_filename, "w") as handle:
            alignment.to_file(handle)
        fileutils.move_file(temp_filename, self._output_file)
+
+
class ExtractReferenceNode(Node):
    """Node extracting named regions (e.g. genes, possibly spanning multiple
    BED records) from an indexed reference FASTA into a single FASTA file."""

    def __init__(self, reference, bedfile, outfile, dependencies=()):
        # reference -- indexed FASTA file to extract sequences from
        # bedfile   -- BED file naming the regions to extract
        # outfile   -- destination FASTA file
        self._reference = reference
        self._bedfile = bedfile
        self._outfile = outfile

        description = "<ExtractReference: '%s' -> '%s'>" \
            % (reference, outfile)
        Node.__init__(self,
                      description=description,
                      input_files=[reference, bedfile],
                      output_files=[outfile],
                      dependencies=dependencies)

    def _run(self, _config, temp):
        def _by_name(bed):
            return bed.name

        fastafile = pysam.Fastafile(self._reference)
        seqs = collections.defaultdict(list)
        with open(self._bedfile) as bedfile:
            bedrecords = text.parse_lines_by_contig(bedfile, BEDRecord)
            for (contig, beds) in sorted(bedrecords.iteritems()):
                beds.sort(key=lambda bed: (bed.contig, bed.name, bed.start))

                # BED records sharing a name (e.g. exons of one gene) are
                # concatenated into a single sequence, in order of start
                for (gene, gene_beds) in itertools.groupby(beds, _by_name):
                    gene_beds = tuple(gene_beds)
                    sequence = self._collect_sequence(fastafile, gene_beds)
                    seqs[(contig, gene)] = sequence

        # Write everything to a temp file and move it into place on success
        temp_file = os.path.join(temp, "sequences.fasta")
        with open(temp_file, "w") as out_file:
            for ((_, gene), sequence) in sorted(seqs.items()):
                FASTA(gene, None, sequence).write(out_file)

        fileutils.move_file(temp_file, self._outfile)

    @classmethod
    def _collect_sequence(cls, fastafile, beds):
        """Returns the concatenated sequence of the given BED records,
        reverse-complemented if the records lie on the minus strand; mixing
        strands within one gene is not supported (asserts)."""
        sequence = []
        for bed in beds:
            fragment = fastafile.fetch(bed.contig, bed.start, bed.end)
            if len(fragment) != (bed.end - bed.start):
                # Short fetches indicate coordinates beyond the reference
                cls._report_failure(bed, fragment)

            sequence.append(fragment)
        sequence = "".join(sequence)

        if any((bed.strand == "-") for bed in beds):
            assert all((bed.strand == "-") for bed in beds)
            sequence = sequtils.reverse_complement(sequence)

        return sequence

    @classmethod
    def _report_failure(cls, bed, fragment):
        """Raises a NodeError describing a region that could not be fully
        extracted from the reference."""
        message = "Failed to extract region from " \
                  "reference sequence at %s:%i-%i; got " \
                  "%i bp, but expected %i bp." \
                  % (bed.contig, bed.start, bed.end,
                     len(fragment), (bed.end - bed.start))
        raise NodeError(message)
diff --git a/paleomix/nodes/validation.py b/paleomix/nodes/validation.py
new file mode 100644
index 0000000..d0989a5
--- /dev/null
+++ b/paleomix/nodes/validation.py
@@ -0,0 +1,401 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import collections
+import io
+import os
+import re
+
+import pysam
+
+from paleomix.node import \
+ Node, \
+ NodeError
+from paleomix.common.fileutils import \
+ describe_files, \
+ make_dirs
+from paleomix.common.utilities import \
+ chain_sorted
+from paleomix.common.sequences import \
+ reverse_complement
+
+import paleomix.common.formats.fastq as fastq
+import paleomix.common.procs as procs
+import paleomix.common.sampling as sampling
+import paleomix.tools.factory as factory
+
+
class DetectInputDuplicationNode(Node):
    """Attempts to detect reads included multiple times as input based on the
    presence of reads with identical names AND sequences. This is compromise
    between sensitivity, specificity, and running time.

    A possible refinement would be to consider reads with the same name where
    one read is the prefix of the other (due to different amounts of trimming
    or collapsing of reads).
    """

    def __init__(self, input_files, output_file, dependencies=()):
        Node.__init__(self,
                      description="<Detect Input Duplication: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)

    def run(self, _):
        # Streams all input BAMs merged by (tid, pos); reads sharing one
        # position are collected by name and compared in _process_reads.
        handles = []
        try:
            last_pos = None
            observed_reads = collections.defaultdict(list)
            for (record, filename) in self._open_samfiles(handles, self.input_files):
                curr_pos = (record.pos, record.tid)
                if curr_pos != last_pos:
                    # New position reached; check reads collected so far
                    self._process_reads(observed_reads, self.output_files)
                    observed_reads.clear()
                    last_pos = curr_pos

                # Stop once the trailing, unmapped reads are reached
                if record.tid == -1:
                    break

                observed_reads[record.qname].append((record, filename))
            self._process_reads(observed_reads, self.output_files)

            # Everything is ok, touch the output files
            for fpath in self.output_files:
                make_dirs(os.path.dirname(fpath))
                with open(fpath, "w"):
                    pass
        finally:
            for handle in handles:
                handle.close()

    @classmethod
    def _open_samfiles(cls, handles, filenames):
        """Opens each file, appending the handles to 'handles' (so that the
        caller can close them), and returns one iterable of
        (record, filename) tuples merged by (tid, pos)."""
        sequences = []
        for filename in filenames:
            handle = pysam.Samfile(filename)
            handles.append(handle)

            sequences.append(cls._read_samfile(handle, filename))

        return chain_sorted(*sequences, key=cls._key_by_tid_pos)

    @classmethod
    def _read_samfile(cls, handle, filename):
        """Yields (record, filename) for each record of interest."""
        for record in handle:
            if record.is_unmapped and (not record.pos or record.mate_is_unmapped):
                # Ignore unmapped reads except when these are sorted
                # according to the mate position (if mapped)
                continue
            elif record.flag & 0x900:
                # Ignore supplementary / secondary alignments
                continue

            yield (record, filename)

    @classmethod
    def _process_reads(cls, observed_reads, output_files):
        """Given reads collected at a single position, as a dict of
        {name: [(record, filename)]}, raises NodeError if any read occurs
        multiple times with identical orientation, sequence and qualities."""
        for records_and_filenames in observed_reads.itervalues():
            if len(records_and_filenames) == 1:
                # Most read-names should be observed at most once at a position
                continue

            result = collections.defaultdict(list)
            for record, filename in records_and_filenames:
                key = (record.is_reverse, record.qname, record.seq, record.qual)
                result[key].append(filename)

            for (is_reverse, name, seq, qual), filenames in result.iteritems():
                if len(filenames) == 1:
                    # Two reads had same name, but different characteristics
                    continue

                filename_counts = collections.defaultdict(int)
                for filename in filenames:
                    filename_counts[filename] += 1

                # Report the read in its original (pre-mapping) orientation
                if is_reverse:
                    seq = reverse_complement(seq)
                    qual = qual[::-1]

                message = ["The same read was found multiple times!",
                           " Name: %r" % (name,),
                           " Sequence: %r" % (seq,),
                           " Qualities: %r" % (qual,),
                           ""]

                message.append("Read was found")
                for filename, count in sorted(filename_counts.iteritems()):
                    message.append(" % 2ix in %r" % (count, filename))

                message.append("")
                message.append("This indicates that the same data files have "
                               "been included multiple times in the project. "
                               "Please review the input files used in this "
                               "project, to ensure that each set of data is "
                               "included only once!\n\n"

                               "If this is not the case, then execute the "
                               "following command(s) to mark this test as "
                               "having succeeded:")

                for fpath in output_files:
                    message.append("$ touch '%s'" % (fpath,))

                raise NodeError("\n".join(message))

    @classmethod
    def _key_by_tid_pos(cls, record):
        """Sort key for (record, filename) tuples: (tid, pos)."""
        return (record[0].tid, record[0].pos)
+
+
class ValidateFASTQFilesNode(Node):
    """Node validating that FASTQ files use the expected quality offset;
    the output file is touched on success."""

    def __init__(self, input_files, output_file, offset, dependencies=()):
        self._offset = offset
        Node.__init__(self,
                      description="<Validate FASTQ Files: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)

    def _run(self, _config, _temp):
        # Raises NodeError on malformed files; empty files are permitted
        check_fastq_files(self.input_files, self._offset, True)

        # Touch the output file to record that validation succeeded
        fpath = next(iter(self.output_files))
        make_dirs(os.path.dirname(fpath))
        with open(fpath, "w"):
            pass
+
+
class ValidateFASTAFilesNode(Node):
    """Node validating the format of one or more FASTA files; the single
    output file is touched on success."""

    def __init__(self, input_files, output_file, dependencies=()):
        Node.__init__(self,
                      description="<Validate FASTA Files: %s>"
                      % (describe_files(input_files)),
                      input_files=input_files,
                      output_files=output_file,
                      dependencies=dependencies)

        assert len(self.output_files) == 1, self.output_files

    def _run(self, _config, _temp):
        # Raises NodeError on the first malformed file
        for fpath in self.input_files:
            check_fasta_file(fpath)

        # Touch the output file to record that validation succeeded
        destination, = self.output_files
        make_dirs(os.path.dirname(destination))
        with open(destination, "w"):
            pass
+
+
def check_fastq_files(filenames, required_offset, allow_empty=False):
    """Validates the quality scores of the given FASTQ files.

    Raises NodeError if a file contains quality strings with mixed offsets,
    with an offset other than 'required_offset' (typically 33 or 64), or no
    quality strings at all; 'allow_empty' permits files without any reads.
    """
    for filename in filenames:
        qualities = _read_sequences(filename)
        offsets = fastq.classify_quality_strings(qualities)
        if offsets == fastq.OFFSET_BOTH:
            raise NodeError("FASTQ file contains quality scores with both "
                            "quality offsets (33 and 64); file may be "
                            "unexpected format or corrupt. Please ensure "
                            "that this file contains valid FASTQ reads from a "
                            "single source.\n Filename = %r" % (filename,))
        elif offsets == fastq.OFFSET_MISSING:
            # BUG FIX: previously this 'return'ed, silently skipping the
            # validation of any remaining files whenever an empty file was
            # encountered; continue with the next file instead.
            if allow_empty and not qualities:
                continue

            raise NodeError("FASTQ file did not contain quality scores; file "
                            "may be unexpected format or corrupt. Ensure that "
                            "the file is a FASTQ file.\n Filename = %r"
                            % (filename,))
        elif offsets not in (fastq.OFFSET_AMBIGIOUS, required_offset):
            raise NodeError("FASTQ file contains quality scores with wrong "
                            "quality score offset (%i); expected reads with "
                            "quality score offset %i. Ensure that the "
                            "'QualityOffset' specified in the makefile "
                            "corresponds to the input.\n Filename = %s"
                            % (offsets, required_offset, filename))
+
+
def _read_sequences(filename):
    """Returns a sample (up to 100k) of the quality strings in the given
    FASTQ file, which is read via the 'paleomix cat' subprocess; raises
    NodeError if the file is malformed or the subprocess fails."""
    cat_call = factory.new("cat")
    cat_call.add_multiple_values((filename,))
    cat_call = cat_call.finalized_call

    cat = None
    try:
        cat = procs.open_proc(cat_call,
                              bufsize=io.DEFAULT_BUFFER_SIZE,
                              stderr=procs.PIPE,
                              stdout=procs.PIPE)
        qualities = _collect_qualities(cat.stdout, filename)

        # Sample the quality strings to bound memory usage for large files
        return sampling.reservoir_sampling(qualities, 100000)
    except:
        # Terminate the subprocess before propagating the error; clearing
        # 'cat' prevents the finally-clause from raising a second error
        if cat:
            cat.kill()
            cat.wait()
            cat = None
        raise
    finally:
        # Report a non-zero exit code from 'paleomix cat' as a NodeError
        rc_cat = cat.wait() if cat else 0
        if rc_cat:
            message = "Error running 'paleomix cat':\n" \
                      " Unicat return-code = %i\n\n%s" \
                      % (rc_cat, cat.stderr.read())
            raise NodeError(message)
+
+
def _collect_qualities(handle, filename):
    """Yields the quality string (including trailing newline) of every FASTQ
    record read from 'handle'.

    Raises NodeError if the input is not in FASTQ format: FASTA-style
    headers, missing '@' headers, truncated (less than 4 line) records,
    missing '+' separators, or sequence/quality length mismatches.
    """
    header = handle.readline()
    while header:
        sequence = handle.readline()
        # BUG FIX: corrected "seperator" typos (local name and the error
        # messages below)
        separator = handle.readline()
        qualities = handle.readline()

        if not header.startswith("@"):
            if header.startswith(">"):
                raise NodeError("Input file appears to be in FASTA format "
                                "(header starts with '>', expected '@'), "
                                "but only FASTQ files are supported\n"
                                "Filename = %r" % (filename,))

            raise NodeError("Input file lacks FASTQ header (expected '@', "
                            "found %r), but only FASTQ files are supported\n"
                            " Filename = %r" % (header[:1], filename))
        elif not qualities:
            raise NodeError("Partial record found; is not 4 lines long:\n"
                            "Filename = %r\n Record = '%s'"
                            % (filename, header.rstrip()))
        elif not separator.startswith("+"):
            raise NodeError("Input file lacks FASTQ separator (expected '+', "
                            "found %r), but only FASTQ files are supported\n"
                            " Filename = %r" % (separator[:1], filename))
        elif len(sequence) != len(qualities):
            # Both strings include their newlines, so the comparison is fair
            raise NodeError("Input file contains malformed FASTQ records; "
                            "length of sequence / qualities are not the "
                            "same.\n Filename = %r\n Record = '%s'"
                            % (filename, header.rstrip()))

        yield qualities
        header = handle.readline()
+
+
def check_fasta_file(filename):
    """Validates the contents of a FASTA file, raising NodeError on: empty
    or duplicate/invalid sequence names, empty sequences, invalid sequence
    characters, blank lines inside a record, and sequence lines of varying
    length within a record (other than a shorter final line), as required
    by the FAI index format.
    """
    with open(filename) as handle:
        namecache = {}
        state, linelength, linelengthchanged = _NA, None, False
        for linenum, line in enumerate(handle, start=1):
            # Only \n is allowed as not all tools (e.g. GATK) handle \r
            line = line.rstrip('\n')

            if not line:
                if state in (_NA, _IN_WHITESPACE):
                    continue
                elif state == _IN_HEADER:
                    raise NodeError("Expected FASTA sequence, found empty line"
                                    "\n Filename = %r\n Line = %r"
                                    % (filename, linenum))
                elif state == _IN_SEQUENCE:
                    state = _IN_WHITESPACE
                else:
                    assert False
            elif line.startswith(">"):
                if state in (_NA, _IN_SEQUENCE, _IN_WHITESPACE):
                    _validate_fasta_header(filename, linenum, line, namecache)
                    state = _IN_HEADER
                    linelength = None
                    linelengthchanged = False
                elif state == _IN_HEADER:
                    raise NodeError("Empty sequences not allowed\n"
                                    " Filename = %r\n Line = %r"
                                    % (filename, linenum - 1))
                else:
                    assert False
            else:
                if state == _NA:
                    raise NodeError("Expected FASTA header, found %r\n"
                                    " Filename = %r\n Line = %r"
                                    % (line, filename, linenum))
                elif state == _IN_HEADER:
                    _validate_fasta_line(filename, linenum, line)
                    linelength = len(line)
                    state = _IN_SEQUENCE
                elif state == _IN_SEQUENCE:
                    _validate_fasta_line(filename, linenum, line)
                    # If the length has changed, then that line must be the
                    # last line in the record, which may be shorter due to the
                    # sequence length. This is because the FAI index format
                    # expects that each line has the same length.
                    if linelengthchanged or (linelength < len(line)):
                        # BUG FIX: message previously said "FASTQ files"
                        raise NodeError("Lines in FASTA files must be of same "
                                        "length\n Filename = %r\n"
                                        " Line = %r" % (filename, linenum))
                    elif linelength != len(line):
                        linelengthchanged = True
                elif state == _IN_WHITESPACE:
                    raise NodeError("Empty lines not allowed in sequences\n"
                                    " Filename = %r\n Line = %r"
                                    % (filename, linenum))
                else:
                    assert False

    if state in (_NA, _IN_HEADER):
        # BUG FIX: added the missing newline before "Filename"
        raise NodeError("File does not contain any sequences\n"
                        " Filename = %r" % (filename, ))
+
# Standard nucleotides + IUPAC ambiguity codes, accepted in either case
_VALID_CHARS_STR = "ACGTN" "RYSWKMBDHV"
_VALID_CHARS = frozenset(_VALID_CHARS_STR.upper() + _VALID_CHARS_STR.lower())
# Parser states used by 'check_fasta_file'
_NA, _IN_HEADER, _IN_SEQUENCE, _IN_WHITESPACE = range(4)
+
+
def _validate_fasta_header(filename, linenum, line, cache):
    """Validates a '>' header line and records the sequence name in 'cache'.

    Raises NodeError if the name is empty, contains invalid characters, or
    duplicates a previously seen name; 'cache' maps names to line numbers.
    """
    name = line.split(" ", 1)[0][1:]
    if not name:
        raise NodeError("FASTA sequence must have non-empty name\n"
                        " Filename = %r\n Line = %r\n"
                        % (filename, linenum))

    if not _RE_REF_NAME.match(name):
        raise NodeError("Invalid name for FASTA sequence: %r\n"
                        " Filename = %r\n Line = %r\n"
                        % (name, filename, linenum))

    if name in cache:
        raise NodeError("FASTA sequences have identical name\n"
                        " Filename = %r\n Name = %r\n"
                        " Line 1 = %r\n Line 2 = %r\n"
                        % (filename, name, linenum, cache[name]))

    cache[name] = linenum


# Names may contain any printable ASCII characters, except that the first
# character may not be '*' or '='
_RE_REF_NAME = re.compile("[!-()+-<>-~][!-~]*")
+
+
def _validate_fasta_line(filename, linenum, line):
    """Validates a line of FASTA sequence, raising NodeError if it contains
    characters outside of _VALID_CHARS; carriage-returns are reported with a
    dedicated message suggesting conversion to unix line-endings.
    """
    invalid_chars = frozenset(line) - _VALID_CHARS
    if not invalid_chars:
        return

    if invalid_chars == frozenset('\r'):
        raise NodeError("FASTA file contains carriage-returns ('\\r')!\n"
                        "Please convert file to unix format, using e.g. "
                        "dos2unix.\n Filename = %r\n" % (filename,))

    raise NodeError("FASTA sequence contains invalid characters\n"
                    " Filename = %r\n Line = %r\n"
                    " Invalid characters = %r"
                    % (filename, linenum, "".join(invalid_chars)))
diff --git a/paleomix/pipeline.py b/paleomix/pipeline.py
new file mode 100644
index 0000000..373b8d5
--- /dev/null
+++ b/paleomix/pipeline.py
@@ -0,0 +1,492 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import errno
+import logging
+import multiprocessing
+import os
+import pickle
+import Queue
+import signal
+import traceback
+
+import paleomix.ui
+import paleomix.logger
+
+from paleomix.node import \
+ Node, \
+ NodeError, \
+ NodeUnhandledException
+from paleomix.nodegraph import \
+ FileStatusCache, \
+ NodeGraph, \
+ NodeGraphError
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple, \
+ fast_pickle_test
+from paleomix.common.versions import \
+ VersionRequirementError
+
+
class Pypeline(object):
    """Pipeline of Node objects; executes tasks in dependency order using a
    multiprocessing worker-pool, and supports dry-runs, graceful handling of
    keyboard interrupts (SIGINT), and inspection of the pipeline (input /
    output files, required executables, and 'dot' dependency graphs).
    """

    def __init__(self, config):
        self._nodes = []
        # Config object forwarded to Node.run for every task
        self._config = config
        self._logger = logging.getLogger(__name__)
        # Set if a keyboard-interrupt (SIGINT) has been caught
        self._interrupted = False
        # Queue by which worker processes signal completed tasks; passed to
        # the workers via the pool initializer (see _init_worker)
        self._queue = multiprocessing.Queue()
        self._pool = multiprocessing.Pool(1, _init_worker, (self._queue,))

    def add_nodes(self, *nodes):
        """Adds one or more Node objects, or (nested) sequences of Node
        objects, to the pipeline; raises TypeError for anything else.
        """
        for subnodes in safe_coerce_to_tuple(nodes):
            for node in safe_coerce_to_tuple(subnodes):
                if not isinstance(node, Node):
                    raise TypeError("Node object expected, recieved %s"
                                    % repr(node))
                self._nodes.append(node)

    def run(self, max_threads=1, dry_run=False, progress_ui="verbose"):
        """Runs the pipeline; returns True on success and False otherwise.

        max_threads -- maximum number of threads used by tasks; must be >= 1
        dry_run     -- if true, only the state of tasks is reported
        progress_ui -- name of UI used to report progress (see paleomix.ui)
        """
        if max_threads < 1:
            raise ValueError("Max threads must be >= 1")
        _update_nprocesses(self._pool, max_threads)

        try:
            nodegraph = NodeGraph(self._nodes)
        except NodeGraphError, error:
            self._logger.error(error)
            return False

        # Warn (at most once) if any single node exceeds the thread limit
        for node in nodegraph.iterflat():
            if (node.threads > max_threads):
                message = "Node(s) use more threads than the max allowed; " \
                          "the pipeline may therefore use more than the " \
                          "expected number of threads.\n"
                paleomix.ui.print_warn(message)
                break

        if dry_run:
            progress_printer = paleomix.ui.QuietUI()
            nodegraph.add_state_observer(progress_printer)
            progress_printer.flush()
            progress_printer.finalize()
            self._logger.info("Dry run done ...")
            return True

        # Catch SIGINT so that running tasks may be allowed to finish; the
        # original handler is restored before returning
        old_handler = signal.signal(signal.SIGINT, self._sigint_handler)
        try:
            return self._run(nodegraph, max_threads, progress_ui)
        finally:
            signal.signal(signal.SIGINT, old_handler)

        return False

    def _run(self, nodegraph, max_threads, progress_ui):
        """Main scheduling loop; polls finished tasks and starts new ones
        until all tasks are done, or until the user has interrupted the run.
        Returns True if no task failed.
        """
        # Dictionary of nodes -> async-results
        running = {}
        # Set of remaining nodes to be run
        remaining = set(nodegraph.iterflat())

        is_ok = True
        progress_printer = paleomix.ui.get_ui(progress_ui)
        progress_printer.max_threads = max_threads
        nodegraph.add_state_observer(progress_printer)

        with paleomix.ui.CommandLine() as cli:
            while running or (remaining and not self._interrupted):
                is_ok &= self._poll_running_nodes(running,
                                                  nodegraph,
                                                  self._queue)

                if not self._interrupted:  # Prevent starting of new nodes
                    self._start_new_tasks(remaining, running, nodegraph,
                                          max_threads, self._pool)

                if running:
                    progress_printer.flush()

                # The user may change the number of threads at runtime
                max_threads = cli.process_key_presses(nodegraph,
                                                      max_threads,
                                                      progress_printer)
                progress_printer.max_threads = max_threads
                _update_nprocesses(self._pool, max_threads)

        self._pool.close()
        self._pool.join()

        progress_printer.flush()
        progress_printer.finalize()

        return is_ok

    def _start_new_tasks(self, remaining, running, nodegraph, max_threads,
                         pool):
        """Starts as many runable tasks from 'remaining' as fit within the
        current thread limit; started tasks are moved to 'running'. Returns
        False if no idle processes were available, and None otherwise.
        """
        started_nodes = []
        idle_processes = max_threads \
            - sum(node.threads for (node, _) in running.itervalues())

        if not idle_processes:
            return False

        for node in remaining:
            if not running or (idle_processes >= node.threads):
                state = nodegraph.get_node_state(node)
                if state == nodegraph.RUNABLE:
                    try:
                        # The multi-processing module relies on pickling
                        fast_pickle_test(node)
                    except pickle.PicklingError, error:
                        self._logger.error("Node cannot be pickled; please "
                                           "file a bug-report:\n"
                                           "\tNode: %s\n\tError: %s"
                                           % (self, error))
                        nodegraph.set_node_state(node, nodegraph.ERROR)
                        started_nodes.append(node)
                        continue

                    key = id(node)
                    proc_args = (key, node, self._config)
                    running[key] = (node, pool.apply_async(_call_run,
                                                           args=proc_args))
                    started_nodes.append(node)

                    nodegraph.set_node_state(node, nodegraph.RUNNING)
                    idle_processes -= node.threads
                elif state in (nodegraph.DONE, nodegraph.ERROR):
                    # Nothing to do; just remove from 'remaining' below
                    started_nodes.append(node)
            elif idle_processes <= 0:
                break

        for node in started_nodes:
            remaining.remove(node)

    def _poll_running_nodes(self, running, nodegraph, queue):
        """Collects tasks that have finished running; failed tasks are
        logged and marked ERROR, successful tasks are marked DONE. Returns
        True if no task failed.
        """
        errors = None
        blocking = False

        while running and not errors:
            node, proc = self._get_finished_node(queue, running, blocking)
            if not node:
                if blocking:
                    break

                # Nothing ready on the first (non-blocking) pass; retry
                # once with a timeout before giving up
                blocking = True
                continue

            try:
                # Re-raise exceptions from the node-process
                proc.get()
            except (KeyboardInterrupt, SystemExit):
                raise
            except Exception, errors:
                nodegraph.set_node_state(node, nodegraph.ERROR)

                message = [str(node),
                           " Error (%r) occurred running command:"
                           % (type(errors).__name__)]

                for line in str(errors).strip().split("\n"):
                    message.append(" %s" % (line,))
                message.append("")

                self._logger.error("\n".join(message))

            if not errors:
                nodegraph.set_node_state(node, nodegraph.DONE)

        return not errors

    @property
    def nodes(self):
        """Set of all Node objects added to the pipeline."""
        return set(self._nodes)

    def walk_nodes(self, func):
        """Calls 'func' for each of the pipeline's nodes and their
        dependencies, visiting each node at most once; traversal stops
        early if 'func' returns a false value.
        """
        skip_nodes = set()

        def _walk_nodes(nodes):
            for node in nodes:
                if node in skip_nodes:
                    continue
                elif not func(node):
                    return False

                skip_nodes.add(node)
                if not _walk_nodes(node.dependencies):
                    return False
            return True

        _walk_nodes(self._nodes)

    def list_input_files(self):
        """Returns a set containing the absolute path of all input files
        required by the current pipeline. These do not include any file
        generated by the pipeline itself (output files).
        """
        input_files = set()
        output_files = set()

        def collect_output_files(node):
            for filename in node.input_files:
                input_files.add(os.path.abspath(filename))

            for filename in node.output_files:
                output_files.add(os.path.abspath(filename))

            return True

        self.walk_nodes(collect_output_files)

        return input_files - output_files

    def list_output_files(self):
        """Returns a dict mapping the absolute path of every output file to
        the state of the node generating it (DONE, OUTDATED, or None for
        files that have yet to be generated).
        """
        cache = FileStatusCache()
        nodegraph = NodeGraph(self._nodes, lambda: cache)
        output_files = {}

        def collect_output_files(node):
            state = None
            if nodegraph.is_done(node, cache):
                state = nodegraph.DONE
                if nodegraph.is_outdated(node, cache):
                    state = nodegraph.OUTDATED

            for filename in node.output_files:
                output_files[os.path.abspath(filename)] = state

            return True

        self.walk_nodes(collect_output_files)

        return output_files

    def list_required_executables(self):
        """Returns a dict mapping executable names to sets of version
        requirements collected from every node; executables without any
        version requirements map to empty sets.
        """
        requirements = {}

        def collect_requirements(node):
            for executable in node.executables:
                if executable not in requirements:
                    requirements[executable] = set()

            for requirement in node.requirements:
                if requirement.name not in requirements:
                    requirements[requirement.name] = set()

                requirements[requirement.name].add(requirement)

                # Drop empty entries for executables that are also covered
                # by a version requirement
                executable = requirement.executable
                if not requirements.get(executable):
                    requirements.pop(executable, None)

            return True

        self.walk_nodes(collect_requirements)
        return requirements

    def print_output_files(self, print_func=print):
        """Prints one output file per line, prefixed by its state (Ready,
        Outdated, or Missing); see 'list_output_files'.
        """
        output_files = self.list_output_files()

        for filename, state in sorted(output_files.iteritems()):
            if state == NodeGraph.DONE:
                state = "Ready "
            elif state == NodeGraph.OUTDATED:
                state = "Outdated "
            else:
                state = "Missing "

            print_func("%s\t%s" % (state, filename))

    def print_input_files(self, print_func=print):
        """Prints the absolute path of all input files required by the current
        pipeline, excluding any file generated by the pipeline itself (output
        files). One file is printed per line.
        """
        input_files = self.list_input_files()
        for filename in sorted(input_files):
            print_func("%s" % (filename,))

    def print_required_executables(self, print_func=print):
        """Prints a table of executables required by the pipeline, with the
        detected version (if any) and the required version(s).
        """
        template = "{: <40s} {: <11s} {}"
        pipeline_executables = self.list_required_executables()
        print_func(template.format("Executable",
                                   "Version",
                                   "Required version"))

        for (name, requirements) in sorted(pipeline_executables.items()):
            if not requirements:
                print_func(template.format(name, "-", "any version"))
                continue

            for requirement in requirements:
                try:
                    if requirement.version:
                        version = "v" + ".".join(map(str, requirement.version))
                    else:
                        version = "NA"
                except VersionRequirementError:
                    # The version could not be determined for the executable
                    version = "UNKNOWN"

                print_func(template.format(name, version, requirement.checks))

    def _sigint_handler(self, signum, frame):
        """Signal handler; see signal.signal."""
        if not self._interrupted:
            # First CTRL-C; allow running tasks to complete
            self._interrupted = True
            self._logger.error("\nKeyboard interrupt detected, waiting for "
                               "running tasks to complete ... Press CTRL-C "
                               "again to force termination.\n")
        else:
            # Second CTRL-C; terminate immediately
            self._pool.terminate()
            raise signal.default_int_handler(signum, frame)

    def to_dot(self, destination):
        """Writes a simple dot file to the specified destination, representing
        the full dependency tree. Nodes are named by their class.
        """
        try:
            nodegraph = NodeGraph(self._nodes)
        except NodeGraphError, error:
            self._logger.error(error)
            return False

        # Dict recording all dependencies of nodes
        meta_dependencies = {}
        # Dict recording if anything depends on a specific node
        meta_rev_dependencies = {}
        for node in nodegraph.iterflat():
            selection = node.dependencies
            meta_dependencies[node] = selection
            for dep in selection:
                meta_rev_dependencies[dep] = True

        return self._write_dot(destination,
                               meta_dependencies,
                               meta_rev_dependencies)

    @classmethod
    def _write_dot(cls, destination, meta_dependencies, meta_rev_dependencies):
        """Writes simple dot file, in which each node is connected to their
        dependencies, using the object IDs as the node names. Labels are
        derived from the class names, excluding any "Node" postfix.
        """
        with open(destination, "w") as out:
            out.write("digraph G {\n")
            out.write(" graph [ dpi = 75 ];\n")
            out.write(" node [shape=record,width=.1,height=.1];\n")
            out.write(" splines=ortho;\n\n")

            for node, dependencies in meta_dependencies.iteritems():
                node_id = "Node_%i" % (id(node),)
                node_type = node.__class__.__name__
                if node_type.endswith("Node"):
                    node_type = node_type[:-4]

                rank = None
                color = "white"
                if not meta_dependencies.get(node):
                    # Node has no dependencies of its own
                    color = "red"
                elif not meta_rev_dependencies.get(node):
                    # Nothing depends on this node
                    color = "green"
                    rank = "sink"

                if rank is not None:
                    out.write(" {")
                    out.write(" rank = %s;\n " % (rank,))

                out.write(' %s [label="%s"; fillcolor=%s; style=filled]\n'
                          % (node_id, node_type, color))

                if rank is not None:
                    out.write(" }")

                for dependency in dependencies:
                    dep_id = "Node_%i" % (id(dependency),)
                    out.write(" %s -> %s\n" % (dep_id, node_id))
                out.write("\n")

            out.write("}\n")

        return True

    @classmethod
    def _get_finished_node(cls, queue, running, blocking):
        """Returns a tuple containing a node that has finished running
        and its async-result, or None for both if no such node could
        be found (and blocking is False), or if an interrupt occurred
        while waiting for a node to finish.

        If blocking is True, the function will timeout after 0.1s.
        """
        try:
            key = queue.get(blocking, 0.1)
            return running.pop(key)
        except IOError, error:
            # User pressed ctrl-c (SIGINT), or similar event ...
            if error.errno != errno.EINTR:
                raise
        except Queue.Empty:
            pass
        return None, None
+
+
def _init_worker(queue):
    """Init function for subprocesses created by multiprocessing.Pool: Ensures
    that KeyboardInterrupts only occur in the main process, allowing us to do
    proper cleanup.

    'queue' is stored on _call_run, through which workers report completed
    tasks back to the main process.
    """
    signal.signal(signal.SIGINT, signal.SIG_IGN)
    # This is a workaround to avoid having to use multiprocessing.Manager
    # to create the Queue objects; this is needed because the Manager class
    # creates its own process, which inherits the signal-handlers of the main
    # process, causing some rather odd behavior when the user causes a SIGINT.
    _call_run.queue = queue
+
+
def _call_run(key, node, config):
    """Wrapper function, required in order to call Node.run()
    in subprocesses, since it is not possible to pickle
    bound functions (e.g. self.run)

    'key' identifies the task in the Pypeline's 'running' dict, and is put
    on the result-queue once the node has finished (see _init_worker).
    """
    try:
        return node.run(config)
    except NodeError:
        # Expected errors are propagated as-is
        raise
    except Exception:
        # Unexpected errors are wrapped, preserving the full traceback
        message = "Unhandled error running Node:\n\n%s" \
                  % (traceback.format_exc(),)

        raise NodeUnhandledException(message)
    finally:
        # See comment in _init_worker
        _call_run.queue.put(key)
+
+
+def _update_nprocesses(pool, processes):
+ """multiprocessing.Pool does not expose calls to change number of active
+ processes, but does in fact support this for the 'maxtasksperchild' option.
+ This function calls the related private functions to increase the number
+ of available processes."""
+ # FIXME: Catch ERRNO 11:
+ # OSError: [Errno 11] Resource temporarily unavailable
+ if pool._processes < processes:
+ pool._processes = processes
+ pool._repopulate_pool()
diff --git a/paleomix/resources/__init__.py b/paleomix/resources/__init__.py
new file mode 100644
index 0000000..6a3cbf4
--- /dev/null
+++ b/paleomix/resources/__init__.py
@@ -0,0 +1,75 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import argparse
+import os
+import shutil
+import sys
+
+from pkg_resources import Requirement, resource_filename
+
+
+_REQUIREMENT = Requirement.parse("PALEOMIX")
+
+
def rscript(tool, script):
    """Returns the path to an Rscript belonging to the given tool."""
    return resource_filename(
        _REQUIREMENT,
        os.path.join("paleomix", "resources", "rscripts", tool, script))
+
+
def report(tool, filename):
    """Returns the path to a report-file belonging to the given tool."""
    return resource_filename(
        _REQUIREMENT,
        os.path.join("paleomix", "resources", "reports", tool, filename))
+
+
def copy_example(tool, argv):
    """Command-line interface for copying a folder of example data for a
    given tool (e.g. 'bam_pipeline') to a user-specified location; 'argv'
    contains the command-line arguments. Returns 0 on success, and 1 if
    the destination folder already exists.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('root', help="Destination folder for example data.")
    args = parser.parse_args(argv)

    destination = os.path.join(args.root, tool)
    if os.path.exists(destination):
        sys.stderr.write("Example folder already exists at destination, "
                         "cannot proceed:\n")
        sys.stderr.write(" - %r\n" % (destination,))
        return 1

    # Locate the bundled example data and copy it recursively
    source = resource_filename(
        _REQUIREMENT,
        os.path.join("paleomix", "resources", "examples", tool))
    shutil.copytree(source, destination)

    sys.stderr.write("Sucessfully saved example in %r\n" % (destination,))

    return 0
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_01.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_01.fastq.gz
new file mode 100644
index 0000000..f33dcb5
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_01.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_02.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_02.fastq.gz
new file mode 100644
index 0000000..a9bcdb5
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_02.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_03.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_03.fastq.gz
new file mode 100644
index 0000000..7e1ca3b
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_03.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_04.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_04.fastq.gz
new file mode 100644
index 0000000..1b9b08c
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R1_04.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_01.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_01.fastq.gz
new file mode 100644
index 0000000..c140771
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_01.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_02.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_02.fastq.gz
new file mode 100644
index 0000000..f3e329c
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_02.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_03.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_03.fastq.gz
new file mode 100644
index 0000000..aedcdbe
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_03.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_04.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_04.fastq.gz
new file mode 100644
index 0000000..c3285ad
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L1_R2_04.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.collapsed.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.collapsed.gz
new file mode 100644
index 0000000..51a7ef9
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.collapsed.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.collapsed.truncated.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.collapsed.truncated.gz
new file mode 100644
index 0000000..f212373
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.collapsed.truncated.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.singleton.truncated.gz b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.singleton.truncated.gz
new file mode 100644
index 0000000..f212373
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/ACGATA_L2/reads.singleton.truncated.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_01.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_01.fastq.gz
new file mode 100644
index 0000000..4322705
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_01.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_02.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_02.fastq.gz
new file mode 100644
index 0000000..c1051ce
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_02.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_03.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_03.fastq.gz
new file mode 100644
index 0000000..65a6977
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/GCTCTG_L1_R1_03.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_01.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_01.fastq.gz
new file mode 100644
index 0000000..c77bded
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_01.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_02.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_02.fastq.gz
new file mode 100644
index 0000000..1a2a396
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_02.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_03.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_03.fastq.gz
new file mode 100644
index 0000000..ef13d23
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L1_R1_03.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_01.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_01.fastq.gz
new file mode 100644
index 0000000..6b58ac3
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_01.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_02.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_02.fastq.gz
new file mode 100644
index 0000000..1111b12
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_02.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_03.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_03.fastq.gz
new file mode 100644
index 0000000..1dae41f
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R1_03.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_01.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_01.fastq.gz
new file mode 100644
index 0000000..596bace
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_01.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_02.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_02.fastq.gz
new file mode 100644
index 0000000..a77820a
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_02.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_03.fastq.gz b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_03.fastq.gz
new file mode 100644
index 0000000..145ca36
Binary files /dev/null and b/paleomix/resources/examples/bam_pipeline/000_data/TGCTCA_L2_R2_03.fastq.gz differ
diff --git a/paleomix/resources/examples/bam_pipeline/000_makefile.yaml b/paleomix/resources/examples/bam_pipeline/000_makefile.yaml
new file mode 100644
index 0000000..ce59c6c
--- /dev/null
+++ b/paleomix/resources/examples/bam_pipeline/000_makefile.yaml
@@ -0,0 +1,177 @@
+# -*- mode: Yaml; -*-
+# Timestamp: 2013-10-16T21:00:50.786708
+#
+# Default options.
+# Can also be specific for a set of samples, libraries, and lanes,
+# by including the "Options" hierarchy at the same level as those
+# samples, libraries, or lanes below. This does not include
+# "Features", which may only be specific globally.
+Options:
+ # Sequencing platform, see SAM/BAM reference for valid values
+ Platform: Illumina
+ # Quality offset for Phred scores, either 33 (Sanger/Illumina 1.8+)
+ # or 64 (Illumina 1.3+ / 1.5+). For Bowtie2 it is also possible to
+ # specify 'Solexa', to handle reads on the Solexa scale. This is
+ # used during adapter-trimming and sequence alignment
+ QualityOffset: 33
+ # Split a lane into multiple entries, one for each (pair of) file(s)
+ # found using the search-string specified for a given lane. Each
+ # lane is named by adding a number to the end of the given barcode.
+ SplitLanesByFilenames: yes
+ # Compression format for FASTQ reads; 'gz' for GZip, 'bz2' for BZip2
+ CompressionFormat: bz2
+
+ # Settings for trimming of reads, see AdapterRemoval man-page
+ AdapterRemoval:
+ # Adapter sequences, set and uncomment to override defaults
+# --adapter1: AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG
+# --adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+ # Some BAM pipeline defaults differ from AR defaults;
+ # To override, change these value(s):
+ --mm: 3
+ --minlength: 25
+ # Extra features enabled by default; change 'yes' to 'no' to disable
+ --collapse: yes
+ --trimns: yes
+ --trimqualities: yes
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 25
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # May be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+ # Additional command-line options may be specified for the "aln"
+ # call(s), as described below for Bowtie2 below.
+
+ # Settings for mappings performed using Bowtie2
+ Bowtie2:
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Examples of how to add additional command-line options
+# --trim5: 5
+# --trim3: 5
+ # Note that the colon is required, even if no value is specified
+ --very-sensitive:
+ # Example of how to specify multiple values for an option
+# --rg:
+# - CN:SequencingCenterNameHere
+# - DS:DescriptionOfReadGroup
+
+ # Mark / filter PCR duplicates. If set to 'filter', PCR duplicates are
+ # removed from the output files; if set to 'mark', PCR duplicates are
+ # flagged with bit 0x400, and not removed from the output files; if set to
+ # 'no', the reads are assumed to not have been amplified. Collapsed reads
+ # are filtered using the command 'paleomix rmdup_duplicates', while "normal"
+ # reads are filtered using Picard MarkDuplicates.
+ PCRDuplicates: filter
+
+ # Carry out quality base re-scaling of libraries using mapDamage
+ # This will be done using the options set for mapDamage below
+ RescaleQualities: yes
+
+ # Command-line options for mapDamage; note that the long-form
+ # options are expected; --length, not -l, etc. Uncomment the
+ # "mapDamage" line adding command-line options below.
+ mapDamage:
+ # By default, the pipeline will downsample the input to 100k hits
+ # when running mapDamage; remove to use all hits
+ --downsample: 100000
+
+ # Set to 'yes' exclude a type of trimmed reads from alignment / analysis;
+ # possible read-types reflect the output of AdapterRemoval
+ ExcludeReads:
+ Single: no # Single-ended reads / Orphaned paired-ended reads
+ Paired: no # Paired ended reads
+ Singleton: no # Paired reads for which the mate was discarded
+ Collapsed: no # Overlapping paired-ended reads collapsed into a
+ # single sequence by AdapterRemoval
+ CollapsedTruncated: no # Like 'Collapsed', except that the reads
+ # truncated due to the presence of ambiguous
+ # bases or low quality bases at read termini.
+
+ # Optional steps to perform during processing
+ Features:
+ RawBAM: no # Generate BAM from the raw libraries (no indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.bam
+ RealignedBAM: yes # Generate indel-realigned BAM using the GATK Indel realigner
+ # Location: {Destination}/{Target}.{Genome}.realigned.bam
+ mapDamage: yes # Generate mapDamage plot for each (unrealigned) library
+ # Location: {Destination}/{Target}.{Genome}.mapDamage/{Library}/
+ Coverage: yes # Generate coverage information for the raw BAM (wo/ indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.coverage
+ Depths: yes # Generate histogram of number of sites with a given read-depth
+ # Location: {Destination}/{Target}.{Genome}.depths
+ Summary: yes # Generate summary table for each target
+ # Location: {Destination}/{Target}.summary
+ DuplicateHist: no # Generate histogram of PCR duplicates, for use with PreSeq
+ # Location: {Destination}/{Target}.{Genome}.duphist/{Library}/
+
+
+# Map of prefixes by name, each having a Path key, which specifies the
+# location of the BWA/Bowtie2 index, and optional label, and an option
+# set of regions for which additional statistics are produced.
+Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ rCRS:
+
+ # Path to .fasta file containing a set of reference sequences.
+ Path: 000_prefixes/rCRS.fasta
+
+ # Label for prefix: One of nuclear, mitochondrial, chloroplast,
+ # plasmid, bacterial, or viral. Is used in the .summary files.
+ Label: "mitochondrial"
+
+ # Produce additional coverage / depth statistics for a set of
+ # regions defined in a BED file; if no names are specified for the
+ # BED records, results are named after the chromosome / contig.
+# RegionsOfInterest:
+# NAME: PATH_TO_BEDFILE
+
+
+# Targets are specified using the following structure:
+# The output-files will be prefixed with 'ExampleProject.'
+ExampleProject:
+ # The Sample tag will be set to 'Synthetic_Sample_1'
+ Synthetic_Sample_1:
+ # In this example, libraries are named using their DNA barcode:
+
+ ACGATA:
+ # Library contains a PE lane
+ Lane_1: 000_data/ACGATA_L1_R{Pair}_*.fastq.gz
+
+ # Library also contains a pre-trimmed PE lane
+ # As noted above, we only care about the collapsed reads
+ Lane_2:
+ Single: 000_data/ACGATA_L2/reads.singleton.truncated.gz
+ Collapsed: 000_data/ACGATA_L2/reads.collapsed.gz
+ CollapsedTruncated: 000_data/ACGATA_L2/reads.collapsed.truncated.gz
+
+ GCTCTG:
+ # Library contains a SE lane
+ Lane_1: 000_data/GCTCTG_L1_R1_*.fastq.gz
+
+ # Library containing both a PE and a SE lane
+ TGCTCA:
+ # Options can be overridden at any level up to this level
+ # Any option not specified here is inherited from the
+ # previous levels.
+ Options:
+ SplitLanesByFilenames: no
+
+ Lane_1: 000_data/TGCTCA_L1_R1_*.fastq.gz
+ Lane_2: 000_data/TGCTCA_L2_R{Pair}_*.fastq.gz
diff --git a/paleomix/resources/examples/bam_pipeline/000_prefixes/rCRS.fasta b/paleomix/resources/examples/bam_pipeline/000_prefixes/rCRS.fasta
new file mode 100644
index 0000000..a2d2531
--- /dev/null
+++ b/paleomix/resources/examples/bam_pipeline/000_prefixes/rCRS.fasta
@@ -0,0 +1,239 @@
+>NC_012920_1
+GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG
+GTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTC
+CTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTA
+ATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATC
+ATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA
+AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCGGTATGCAC
+TTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATA
+CAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCC
+AAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTC
+ACATCACCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA
+GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGC
+AATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAA
+ACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGA
+TTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACT
+CACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACAC
+ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATC
+AACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATC
+CCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATA
+CCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTC
+AAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT
+GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGA
+AGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCA
+TTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTA
+GCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGAGCTAAACCTA
+GCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG
+ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATA
+ATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCC
+AAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCA
+AAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGAT
+AGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC
+CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAG
+TAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAAC
+ATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAG
+TAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCC
+AATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA
+AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGC
+ATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAA
+AGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCT
+TACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTA
+TGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT
+AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAG
+TCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACA
+GCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCG
+ATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGG
+AGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT
+ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGA
+ACAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAAT
+TCCTCTTCTTAACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCA
+TTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATACAACTACGCAAAGGCCCCAACGTTGTAGGCC
+CCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC
+ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCC
+CTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAG
+CCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGC
+AGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGC
+TCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG
+CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTC
+CGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATAC
+ACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGACGCACTCTCCC
+CTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGAAC
+AGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA
+GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATAT
+GTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGCTTAAACCCCCTTATTTCTAGGACTATGA
+GAATCGAACCCATCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTC
+AGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCT
+GGCCCAACCCGTCATCTACTCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT
+TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCTC
+GTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTAT
+CCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTCATCATTAATA
+ATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCTGAGTCCCAGAGGTTACCCAAG
+GCACCCCTCTGACATCCGGCCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA
+AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGA
+GGTGGATTAAACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAA
+TAGCAGTTCTACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCCTAACTACTAC
+CGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTA
+ACATGACTAACACCCTTAATTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT
+TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATCATCCCCACCATCATAGCCAC
+CATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATA
+TCTAACAACGTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCCACACTCATCG
+CCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATAATCTTATAGAAATTTAGGTTAAATAC
+AGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA
+CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGACCAATGGGA
+CTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCC
+GCCGGGAAAAAAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGA
+AAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCA
+GCCATTTTACCTCACCCCCACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG
+AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTCTAAGCCTCCTTATTCGAGCC
+GAGCTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCACATCTACAACGTTATCGTCACAGCCCATGCAT
+TTGTAATAATCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAAT
+AATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTC
+CTACTCCTGCTCGCATCTGCTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG
+CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCCTTACACCTAGCAGGTGTCTC
+CTCTATCTTAGGGGCCATCAATTTCATCACAACAATTATCAATATAAAACCCCCTGCCATAACCCAATAC
+CAAACGCCCCTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAG
+CTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGG
+AGACCCCATTCTATACCAACACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA
+GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTA
+TGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGG
+AATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAA
+GTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGAAATGATCTGCTGCAGTGCTCTGAGCCCTAG
+GATTCATCTTTCTTTTCACCGTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT
+ACTACACGACACGTACTACGTTGTAGCCCACTTCCACTATGTCCTATCAATAGGAGCTGTATTTGCCATC
+ATAGGAGGCTTCATTCACTGATTTCCCCTATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCC
+ATTTCACTATCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCCTATCCGGAAT
+GCCCCGACGTTACTCGGACTACCCCGATGCATACACCACATGAAACATCCTATCATCTGTAGGCTCATTC
+ATTTCTCTAACAGCAGTAATATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC
+TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGA
+AGAACCCGTATACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAA
+CCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAAT
+TATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCT
+ATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC
+TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGA
+AACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTAC
+ATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACG
+AGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGA
+CCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA
+TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTC
+TAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGC
+AAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTT
+ACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAACCTTTTAAGTT
+AAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT
+AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTA
+CCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCT
+GTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCT
+CTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAAC
+TAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT
+AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTA
+TCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTA
+AAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGA
+AACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCAC
+CTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA
+TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCAC
+ACTTCTAGTAAGCCTCTACCTGCACGACAACACATAATGACCCACCAATCACATGCCTATCATATAGTAA
+AACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATT
+TCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACACTAACCATATACCAATGATGG
+CGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG
+GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAG
+CCTAGCCCCTACCCCCCAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAA
+GTCCCACTCCTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAA
+TAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACCCT
+CCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT
+GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTATCTGCTTCATCCGCC
+AACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGT
+AGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAGTATAAATAGT
+ACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGCCTTAATTTTAA
+TAATCAACACCCTCCTAGCCTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT
+AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCCGCGTCCCTTTCTCCATAAAA
+TTCTTCTTAGTAGCTATTACCTTCTTATTATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAG
+CCCTACAAACAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAG
+TCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAGTTTAAACAAAACGAAT
+GATTTCGACTCATTAAATTATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG
+CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGA
+AGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAAT
+ATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAA
+TCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAAAACTAATCGTCCCA
+ACAATTATATTACTACCACTGACATGACTTTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA
+GCCTAATTATTAGCATCATCCCTCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCC
+AACCTTTTCCTCCGACCCCCTAACAACCCCCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATC
+ATGGCAAGCCAACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCTATACTAATCT
+CCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAAC
+CACACTTATCCCCACCTTGGCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA
+TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACAACACCC
+TAGGCTCACTAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTT
+AATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCT
+AAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCT
+ATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT
+ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAGACCTAAAATCGCTCATTGCA
+TACTCTTCAATCAGCCACATAGCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCG
+GCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTA
+CGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTT
+TGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG
+TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGC
+CCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAA
+CCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACC
+CCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAA
+CAGAGGCTTACGACCCCTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC
+AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCA
+ACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCC
+ATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCAT
+CCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAA
+CTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA
+TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACC
+CAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAA
+CAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGA
+TACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTT
+TCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA
+CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGT
+CTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTA
+TAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAAC
+TCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATC
+AAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAAC
+CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTC
+CATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTC
+ACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCA
+AAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGC
+TACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCC
+ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCC
+TATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTA
+CCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACC
+AACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCA
+TCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT
+AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCA
+ACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAA
+TCACATAACCTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTTCAACCAGTAA
+CTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATCAACCCTGA
+CCCCTCTCCTTCATAAATTATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC
+TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAACACTCACCAAGACCTCAACCC
+CTGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCC
+CCCTAAATAAATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAATAACACACCCG
+ACCACACCGCTAACAATCAATACTAAACCCCCATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACC
+CCATTACTAAACCCACACTCAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC
+CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGACCCCAATACGCAAAACTAAC
+CCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAA
+ACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTA
+CTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATC
+ATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC
+TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGC
+AACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAAC
+TTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTACTCAGTAGACA
+GTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTTGCCCTTCATTATTGCAGCCCTAGCAACACT
+CCACCTCCTATTCTTGCACGAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC
+ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACAT
+TAACACTATTCTCACCAGACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCC
+TCCCCACATCAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCTAACAAACTA
+GGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCCCCATCCTCCATATATCCAAAC
+AACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT
+AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGACAAGTAGCATCCGTACTATAC
+TTCACAACAATCCTAATCCTAATACCAACTATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTC
+CTTGTAGTATAAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGGACAAATCAGA
+GAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTC
+ATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA
+TTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCA
+ATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCA
+ACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAG
+TACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCC
+TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG
+CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTC
+ATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATG
+
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_makefile.yaml b/paleomix/resources/examples/nature_protocols/alignment/000_makefile.yaml
new file mode 100644
index 0000000..7689a49
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_makefile.yaml
@@ -0,0 +1,337 @@
+# -*- mode: Yaml; -*-
+# Timestamp: 2013-10-16T20:55:09.754672
+#
+# Default options.
+# Can also be specific for a set of samples, libraries, and lanes,
+# by including the "Options" hierarchy at the same level as those
+# samples, libraries, or lanes below. This does not include
+# "Features", which may only be specified globally.
+Options:
+ # Sequencing platform, see SAM/BAM reference for valid values
+ Platform: Illumina
+ # Quality offset for Phred scores, either 33 (Sanger/Illumina 1.8+)
+ # or 64 (Illumina 1.3+ / 1.5+). For Bowtie2 it is also possible to
+ # specify 'Solexa', to handle reads on the Solexa scale. This is
+ # used during adapter-trimming and sequence alignment
+ QualityOffset: 33
+ # Split a lane into multiple entries, one for each (pair of) file(s)
+ # found using the search-string specified for a given lane. Each
+ # lane is named by adding a number to the end of the given barcode.
+ SplitLanesByFilenames: yes
+ # Compression format for FASTQ reads; 'gz' for GZip, 'bz2' for BZip2
+ CompressionFormat: bz2
+
+ # Settings for trimming of reads, see AdapterRemoval man-page
+# AdapterRemoval:
+ # Adapter sequences, set and uncomment to override defaults
+# --pcr1: ...
+# --pcr2: ...
+ # Pipeline defaults that differ from AR defaults;
+ # To override, change the value(s) and uncomment the line(s):
+# --mm: 3
+# --minlength: 25
+ # Features enabled by default; uncomment to disable:
+# --collapse: no
+# --trimns: no
+# --trimqualities: no
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # Filter hits with a mapping quality (Phred) below this value
+ MinQuality: 25
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Should be disabled ("no") for aDNA alignments, as post-mortem
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+ # Additional command-line options may be specified for the "aln"
+ # call(s), as described below for Bowtie2 below.
+
+ # Settings for mappings performed using Bowtie2
+ Bowtie2:
+ # Filter hits with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Examples of how to add additional command-line options
+# --trim5: 5
+# --trim3: 5
+ # Note that the colon is required, even if no value is specified
+ --very-sensitive:
+ # Example of how to specify multiple values for an option
+# --rg:
+# - CN:SequencingCenterNameHere
+# - DS:DescriptionOfReadGroup
+
+ # Mark / filter PCR duplicates. If set to 'filter', PCR duplicates
+ # are removed from the output files; if set to 'mark', these are
+ # flagged with bit 0x400; if set to 'no', the reads are assumed to
+ # not have been amplified. Collapsed reads are filtered using the
+ # command 'bam_rmdup_duplicates', while "normal" reads are filtered
+ # using Picard MarkDuplicates.
+ PCRDuplicates: filter
+
+ # Carry out quality base re-scaling of libraries using mapDamage
+ # This will be done using the options set for mapDamage below
+ RescaleQualities: no
+
+ # Command-line options for mapDamage; note that the long-form
+ # options are expected; --length, not -l, etc. Uncomment the
+ # "mapDamage" line adding command-line options below.
+ mapDamage:
+ # By default, the pipeline will downsample the input to 100k hits
+ # when running mapDamage; remove to use all hits
+ --downsample: 100000
+
+ # Exclude a type of trimmed reads from alignment/analysis; possible
+ # types reflect the output of AdapterRemoval
+# ExcludeReads:
+# - Single # Single-ended reads / Orphaned paired-ended reads
+# - Paired # Paired ended reads
+# - Collapsed # Overlapping paired-ended reads collapsed into a
+ # single sequence by AdapterRemoval
+#    - CollapsedTruncated  # Like 'Collapsed', except that the reads
+                           # were truncated due to the presence of
+                           # ambiguous or low quality bases at termini.
+
+ # Optional steps to perform during processing
+ # To disable all features, replace with line "Features: []"
+ Features:
+# - Raw BAM # Generate BAM from the raw libraries (no indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.bam
+ - Realigned BAM # Generate indel-realigned BAM using the GATK Indel realigner
+ # Location: {Destination}/{Target}.{Genome}.realigned.bam
+ - mapDamage # Generate mapDamage plot for each (unrealigned) library
+ # Location: {Destination}/{Target}.{Genome}.mapDamage/{Library}/
+ - Coverage # Generate coverage information for the raw BAM (wo/ indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.coverage
+ - Depths # Generate histogram of number of sites with a given read-depth
+ # Location: {Destination}/{Target}.{Genome}.depths
+ - Summary # Generate target summary (uses statistics from raw BAM)
+ # Location: {Destination}/{Target}.summary
+ - DuplicateHist # Generate histogram of PCR duplicates, for use with PreSeq
+ # Location: {Destination}/{Target}.{Genome}.duphist/{Library}/
+
+
+# Map of prefixes by name, each having a Path key, which specifies the
+# location of the BWA/Bowtie2 index, an optional label, and an optional
+# set of regions for which additional statistics are produced.
+Prefixes:
+ # Phytophthora infestans nuclear genome:
+ # Name of the prefix; is used as part of the output filenames
+ Pi_nucl:
+    # Path to .fasta file containing a set of reference sequences.
+ Path: 000_prefixes/Pi_nucl.fasta
+
+ # Label for prefix: One of nuclear, mitochondrial, chloroplast,
+ # plasmid, bacterial, or viral. Is used in the .summary files.
+ Label: nuclear
+
+ # Phytophthora infestans mitochondrial genome:
+ # Name of the prefix; is used as part of the output filenames
+ Pi_mito:
+    # Path to .fasta file containing a set of reference sequences.
+ Path: 000_prefixes/Pi_mito.fasta
+
+ # Label for prefix: One of nuclear, mitochondrial, chloroplast,
+ # plasmid, bacterial, or viral. Is used in the .summary files.
+ Label: mitochondrial
+
+
+################################################################################
+################################################################################
+## Yoshida et al. 2013
+
+# http://www.ebi.ac.uk/ena/data/view/ERS241538
+M-0182896:
+ # Options that apply to all data in this target
+ Options:
+ AdapterRemoval:
+ # Adapters that differ from AdapterRemoval defaults
+ --pcr1: "AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNNATCTCGTATGCCGTCTTCTGCTTG"
+ --pcr2: "AATGATACGGCGACCACCGAGATCTACACNNNNNNNACACTCTTTCCCTACACGACGCTCTTCCGATCT"
+
+ M-0182896:
+ M-0182896_NO_UDG:
+ # Options that apply only to library “M-0182896_NO_UDG”
+ # The remaining libraries have been treated with UDG, which removes the
+ # signature of post-mortem DNA damage.
+ Options:
+ # Rescale base qualities to account for post-mortem damage
+ RescaleQualities: yes
+
+ Aligners:
+ BWA:
+ # Disable seed for ancient DNA
+ UseSeed: no
+
+ ERR267888: "000_rawreads/M-0182896/ERR267888_{Pair}_*.fastq.gz"
+
+ M-0182896_UDG:
+ ERR267889: "000_rawreads/M-0182896/ERR267889_{Pair}_*.fastq.gz"
+
+ M-0182896_UDGa:
+ ERR267946: "000_rawreads/M-0182896/ERR267946_{Pair}_*.fastq.gz"
+
+ M-0182896_UDGb:
+ ERR267947: "000_rawreads/M-0182896/ERR267947_{Pair}_*.fastq.gz"
+
+ M-0182896_UDGc:
+ ERR267948: "000_rawreads/M-0182896/ERR267948_{Pair}_*.fastq.gz"
+
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226850
+06_3928A:
+ # Options that apply to all data in this target
+ Options:
+ AdapterRemoval:
+ # Adapters that differ from AdapterRemoval defaults
+ --pcr1: "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"
+
+ 06_3928A:
+ 06_3928A:
+ ERR248815: "000_rawreads/06_3928A/ERR248815_{Pair}_*.fastq.gz"
+ ERR248816: "000_rawreads/06_3928A/ERR248816_{Pair}_*.fastq.gz"
+ ERR248817: "000_rawreads/06_3928A/ERR248817_{Pair}_*.fastq.gz"
+ ERR248818: "000_rawreads/06_3928A/ERR248818_{Pair}_*.fastq.gz"
+ ERR248819: "000_rawreads/06_3928A/ERR248819_{Pair}_*.fastq.gz"
+ ERR248820: "000_rawreads/06_3928A/ERR248820_{Pair}_*.fastq.gz"
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226848
+DDR7602:
+ DDR7602:
+ DDR7602:
+ ERR248813: "000_rawreads/DDR7602/ERR248813_{Pair}_*.fastq.gz"
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226849
+LBUS5:
+ LBUS5:
+ LBUS5:
+ ERR248814: "000_rawreads/LBUS5/ERR248814_{Pair}_*.fastq.gz"
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226846
+NL07434:
+ # Options that apply to all data in this target
+ Options:
+ AdapterRemoval:
+ # Adapters that differ from AdapterRemoval defaults
+ --pcr1: "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"
+
+ NL07434:
+ NL07434:
+ ERR248806: "000_rawreads/NL07434/ERR248806_{Pair}_*.fastq.gz"
+ ERR248807: "000_rawreads/NL07434/ERR248807_{Pair}_*.fastq.gz"
+ ERR248808: "000_rawreads/NL07434/ERR248808_{Pair}_*.fastq.gz"
+ ERR248809: "000_rawreads/NL07434/ERR248809_{Pair}_*.fastq.gz"
+ ERR248810: "000_rawreads/NL07434/ERR248810_{Pair}_*.fastq.gz"
+ ERR248811: "000_rawreads/NL07434/ERR248811_{Pair}_*.fastq.gz"
+ ERR248812: "000_rawreads/NL07434/ERR248812_{Pair}_*.fastq.gz"
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226844
+P13527:
+ P13527:
+ P13527:
+ ERR248791: "000_rawreads/P13527/ERR248791_{Pair}_*.fastq.gz"
+ ERR248792: "000_rawreads/P13527/ERR248792_{Pair}_*.fastq.gz"
+ ERR248793: "000_rawreads/P13527/ERR248793_{Pair}_*.fastq.gz"
+ ERR248794: "000_rawreads/P13527/ERR248794_{Pair}_*.fastq.gz"
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226845
+P13626:
+ P13626:
+ P13626:
+ ERR248795: "000_rawreads/P13626/ERR248795_{Pair}_*.fastq.gz"
+ ERR248796: "000_rawreads/P13626/ERR248796_{Pair}_*.fastq.gz"
+ ERR248797: "000_rawreads/P13626/ERR248797_{Pair}_*.fastq.gz"
+ ERR248798: "000_rawreads/P13626/ERR248798_{Pair}_*.fastq.gz"
+
+# http://www.ebi.ac.uk/ena/data/view/ERS226847
+P17777:
+ # Options that apply to all data in this target
+ Options:
+ AdapterRemoval:
+ # Adapters that differ from AdapterRemoval defaults
+ --pcr1: "AGATCGGAAGAGCGGTTCAGCAGGAATGCCGAGACCGATCTCGTATGCCGTCTTCTGCTTG"
+
+ P17777:
+ P17777:
+ ERR248799: "000_rawreads/P17777/ERR248799_{Pair}_*.fastq.gz"
+ ERR248800: "000_rawreads/P17777/ERR248800_{Pair}_*.fastq.gz"
+ ERR248801: "000_rawreads/P17777/ERR248801_{Pair}_*.fastq.gz"
+ ERR248802: "000_rawreads/P17777/ERR248802_{Pair}_*.fastq.gz"
+ ERR248803: "000_rawreads/P17777/ERR248803_{Pair}_*.fastq.gz"
+ ERR248804: "000_rawreads/P17777/ERR248804_{Pair}_*.fastq.gz"
+ ERR248805: "000_rawreads/P17777/ERR248805_{Pair}_*.fastq.gz"
+
+
+################################################################################
+################################################################################
+## Martin et al. 2013
+
+# http://www.ebi.ac.uk/ena/data/view/ERS258003
+Pi1845A:
+ # Options that apply to all data in this target
+ Options:
+ # Rescale base qualities to account for post-mortem damage
+ RescaleQualities: yes
+
+ Aligners:
+ BWA:
+ # Disable seed for ancient DNA
+ UseSeed: no
+
+ Pi1845A:
+ Pi1845A_id_CGCTAT:
+ ERR299534: "000_rawreads/Pi1845A/ERR299534_*.fastq.gz"
+ ERR299565: "000_rawreads/Pi1845A/ERR299565_{Pair}_*.fastq.gz"
+ ERR299566: "000_rawreads/Pi1845A/ERR299566_{Pair}_*.fastq.gz"
+ ERR299567: "000_rawreads/Pi1845A/ERR299567_{Pair}_*.fastq.gz"
+ ERR299568: "000_rawreads/Pi1845A/ERR299568_{Pair}_*.fastq.gz"
+
+ Pi1845A_id_CATAGA:
+ ERR299535: "000_rawreads/Pi1845A/ERR299535_*.fastq.gz"
+ ERR299536: "000_rawreads/Pi1845A/ERR299536_*.fastq.gz"
+ ERR299537: "000_rawreads/Pi1845A/ERR299537_*.fastq.gz"
+ ERR299538: "000_rawreads/Pi1845A/ERR299538_*.fastq.gz"
+
+
+# http://www.ebi.ac.uk/ena/data/view/ERS258007
+Pi1889:
+ # Options that apply to all data in this target
+ Options:
+ # Rescale base qualities to account for post-mortem damage
+ RescaleQualities: yes
+
+ Aligners:
+ BWA:
+ # Disable seed for ancient DNA
+ UseSeed: no
+
+ Pi1889:
+ Pi1889_id_TAGCTT:
+ ERR299525: "000_rawreads/Pi1889/ERR299525_*.fastq.gz"
+ ERR299528: "000_rawreads/Pi1889/ERR299528_*.fastq.gz"
+ ERR299531: "000_rawreads/Pi1889/ERR299531_*.fastq.gz"
+ ERR299559: "000_rawreads/Pi1889/ERR299559_{Pair}_*.fastq.gz"
+ ERR299562: "000_rawreads/Pi1889/ERR299562_{Pair}_*.fastq.gz"
+
+ Pi1889_id_GGCTAC:
+ ERR299526: "000_rawreads/Pi1889/ERR299526_*.fastq.gz"
+ ERR299529: "000_rawreads/Pi1889/ERR299529_*.fastq.gz"
+ ERR299532: "000_rawreads/Pi1889/ERR299532_*.fastq.gz"
+ ERR299560: "000_rawreads/Pi1889/ERR299560_{Pair}_*.fastq.gz"
+ ERR299563: "000_rawreads/Pi1889/ERR299563_{Pair}_*.fastq.gz"
+
+ Pi1889_id_CTTGTA:
+ ERR299527: "000_rawreads/Pi1889/ERR299527_*.fastq.gz"
+ ERR299530: "000_rawreads/Pi1889/ERR299530_*.fastq.gz"
+ ERR299533: "000_rawreads/Pi1889/ERR299533_*.fastq.gz"
+ ERR299561: "000_rawreads/Pi1889/ERR299561_{Pair}_*.fastq.gz"
+ ERR299564: "000_rawreads/Pi1889/ERR299564_{Pair}_*.fastq.gz"
+
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_prefixes/README b/paleomix/resources/examples/nature_protocols/alignment/000_prefixes/README
new file mode 100644
index 0000000..694a628
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_prefixes/README
@@ -0,0 +1,6 @@
+Phytophthora infestans T30-4 nuclear genome (Pi_nucl)
+ http://protists.ensembl.org/Phytophthora_infestans/Info/Index
+ ftp://ftp.ensemblgenomes.org/pub/protists/release-19/fasta/phytophthora_infestans/dna/Phytophthora_infestans.ASM14294v1.19.dna.toplevel.fa.gz
+
+Phytophthora infestans Ia mitochondrial genome (Pi_mito)
+ http://www.ncbi.nlm.nih.gov/nuccore/AY894835.1
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_prefixes/setup.sh b/paleomix/resources/examples/nature_protocols/alignment/000_prefixes/setup.sh
new file mode 100755
index 0000000..5400149
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_prefixes/setup.sh
@@ -0,0 +1,29 @@
+#!/bin/bash
+
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+cd $(dirname $0)
+
+if [ ! -e "Pi_nucl.fasta" ];
+then
+ echo "Fetching P. infestans T30-4 nuclear genome"
+ echo " See http://protists.ensembl.org/Phytophthora_infestans/Info/Index"
+ curl "ftp://ftp.ensemblgenomes.org/pub/protists/release-20/fasta/phytophthora_infestans/dna/Phytophthora_infestans.ASM14294v1.20.dna.toplevel.fa.gz" \
+ -C - -o "Pi_nucl.fasta.gz"
+ gunzip "Pi_nucl.fasta.gz"
+else
+    echo "Pi_nucl.fasta already fetched; skipping ..."
+fi
+
+echo
+if [ ! -e "Pi_mito.fasta" ];
+then
+ echo "Fetching P. infestans mitochondrial genome"
+ echo " See http://www.ncbi.nlm.nih.gov/nuccore/AY894835.1"
+ curl "http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nuccore&id=AY894835.1&rettype=fasta&retmode=text" \
+ -C - -o "Pi_mito.fasta"
+else
+ echo "Pi_mito.fasta already fetched; skipping ..."
+fi
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/06_3928A/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/06_3928A/000_ENA
new file mode 100644
index 0000000..b977f88
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/06_3928A/000_ENA
@@ -0,0 +1,7 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1718 ERP002420 ERS226850 ERX223349 ERR248815 Phytophthora infestans Illumina Genome Analyzer IIx 06_3928A PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248815/ERR248815_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248815/ERR248815_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248815/ERR248815_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248815/ERR248815_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204661/fastq/06_3928A_ID101_lane8_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vo [...]
+PRJEB1718 ERP002420 ERS226850 ERX223350 ERR248816 Phytophthora infestans Illumina Genome Analyzer IIx 06_3928A PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248816/ERR248816_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248816/ERR248816_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248816/ERR248816_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248816/ERR248816_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204661/fastq/06_3928A_ID103_lane5_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vo [...]
+PRJEB1718 ERP002420 ERS226850 ERX223351 ERR248817 Phytophthora infestans Illumina Genome Analyzer IIx 06_3928A PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248817/ERR248817_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248817/ERR248817_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248817/ERR248817_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248817/ERR248817_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204661/fastq/06_3928A_ID103_lane6_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vo [...]
+PRJEB1718 ERP002420 ERS226850 ERX223352 ERR248818 Phytophthora infestans Illumina Genome Analyzer IIx 06_3928A PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248818/ERR248818_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248818/ERR248818_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248818/ERR248818_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248818/ERR248818_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204661/fastq/06_3928A_ID103_lane7_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vo [...]
+PRJEB1718 ERP002420 ERS226850 ERX223353 ERR248819 Phytophthora infestans Illumina Genome Analyzer IIx 06_3928A PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248819/ERR248819_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248819/ERR248819_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248819/ERR248819_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248819/ERR248819_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204661/fastq/06_3928A_ID103_lane8_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vo [...]
+PRJEB1718 ERP002420 ERS226850 ERX223354 ERR248820 Phytophthora infestans Illumina Genome Analyzer IIx 06_3928A PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248820/ERR248820_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248820/ERR248820_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248820/ERR248820_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248820/ERR248820_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204661/fastq/06_3928A_ID99_lane5_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/06_3928A/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/06_3928A/000_README
new file mode 100644
index 0000000..31a4a95
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/06_3928A/000_README
@@ -0,0 +1,2 @@
+Sample 06_3928A, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/PRJEB1718
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/DDR7602/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/DDR7602/000_ENA
new file mode 100644
index 0000000..8ee3710
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/DDR7602/000_ENA
@@ -0,0 +1,2 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1717 ERP002419 ERS226848 ERX223347 ERR248813 Phytophthora infestans Illumina Genome Analyzer IIx DDR7602 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248813/ERR248813_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248813/ERR248813_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248813/ERR248813_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248813/ERR248813_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/DDR7602_ID152_lane2_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/DDR7602/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/DDR7602/000_README
new file mode 100644
index 0000000..2978a33
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/DDR7602/000_README
@@ -0,0 +1,2 @@
+Sample DDR7602, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS226848
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/LBUS5/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/LBUS5/000_ENA
new file mode 100644
index 0000000..2c68e7b
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/LBUS5/000_ENA
@@ -0,0 +1,2 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1717 ERP002419 ERS226849 ERX223348 ERR248814 Phytophthora infestans Illumina Genome Analyzer IIx LBUS5 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248814/ERR248814_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248814/ERR248814_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248814/ERR248814_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248814/ERR248814_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/LBUS5_ID152_lane3_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/ERA [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/LBUS5/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/LBUS5/000_README
new file mode 100644
index 0000000..2ba0cee
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/LBUS5/000_README
@@ -0,0 +1,2 @@
+Sample LBUS5, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS226849
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/M-0182896/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/M-0182896/000_ENA
new file mode 100644
index 0000000..f3794e7
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/M-0182896/000_ENA
@@ -0,0 +1,6 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1877 ERP002550 ERS241539 ERX242384 ERR267888 Solanaceae Illumina HiSeq 2000 M-0182896_NO_UDG PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267888/ERR267888_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267888/ERR267888_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267888/ERR267888_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267888/ERR267888_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210142/fastq/s_4_s5_sequence_fwd.txt.gz;ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210142/fastq/ [...]
+PRJEB1877 ERP002550 ERS241539 ERX242385 ERR267889 Solanaceae Illumina HiSeq 2000 M-0182896_UDG PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267889/ERR267889_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267889/ERR267889_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267889/ERR267889_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267889/ERR267889_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210142/fastq/s_4_s15_sequence_fwd.txt.gz;ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210142/fastq/s_ [...]
+PRJEB1877 ERP002550 ERS241539 ERX242442 ERR267946 Solanaceae Illumina HiSeq 2000 M-0182896_UDGa PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267946/ERR267946_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267946/ERR267946_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267946/ERR267946_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267946/ERR267946_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210146/fastq/s_1_s15_sequence_fwd.txt.gz;ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210146/fastq/s [...]
+PRJEB1877 ERP002550 ERS241539 ERX242443 ERR267947 Solanaceae Illumina HiSeq 2000 M-0182896_UDGb PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267947/ERR267947_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267947/ERR267947_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267947/ERR267947_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267947/ERR267947_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210146/fastq/s_2_s15_sequence_fwd.txt.gz;ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210146/fastq/s [...]
+PRJEB1877 ERP002550 ERS241539 ERX242444 ERR267948 Solanaceae Illumina HiSeq 2000 M-0182896_UDGc PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267948/ERR267948_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267948/ERR267948_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267948/ERR267948_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR267/ERR267948/ERR267948_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210146/fastq/s_3_s15_sequence_fwd.txt.gz;ftp.sra.ebi.ac.uk/vol1/ERA210/ERA210146/fastq/s [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/M-0182896/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/M-0182896/000_README
new file mode 100644
index 0000000..894be5d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/M-0182896/000_README
@@ -0,0 +1,2 @@
+Sample M-0182896, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS241539
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/NL07434/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/NL07434/000_ENA
new file mode 100644
index 0000000..cf7fadb
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/NL07434/000_ENA
@@ -0,0 +1,8 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1717 ERP002419 ERS226846 ERX223340 ERR248806 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248806/ERR248806_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248806/ERR248806_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248806/ERR248806_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248806/ERR248806_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID108_lane4_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
+PRJEB1717 ERP002419 ERS226846 ERX223341 ERR248807 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248807/ERR248807_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248807/ERR248807_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248807/ERR248807_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248807/ERR248807_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID108_lane5_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
+PRJEB1717 ERP002419 ERS226846 ERX223342 ERR248808 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248808/ERR248808_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248808/ERR248808_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248808/ERR248808_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248808/ERR248808_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID108_lane6_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
+PRJEB1717 ERP002419 ERS226846 ERX223343 ERR248809 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248809/ERR248809_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248809/ERR248809_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248809/ERR248809_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248809/ERR248809_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID108_lane7_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
+PRJEB1717 ERP002419 ERS226846 ERX223344 ERR248810 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248810/ERR248810_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248810/ERR248810_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248810/ERR248810_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248810/ERR248810_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID108_lane8_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
+PRJEB1717 ERP002419 ERS226846 ERX223345 ERR248811 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248811/ERR248811_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248811/ERR248811_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248811/ERR248811_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248811/ERR248811_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID119_lane8_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1 [...]
+PRJEB1717 ERP002419 ERS226846 ERX223346 ERR248812 Phytophthora infestans Illumina Genome Analyzer IIx NL07434 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248812/ERR248812_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248812/ERR248812_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248812/ERR248812_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248812/ERR248812_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/NL07434_ID99_lane4_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/ [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/NL07434/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/NL07434/000_README
new file mode 100644
index 0000000..f0fa643
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/NL07434/000_README
@@ -0,0 +1,2 @@
+Sample NL07434, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS226846
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13527/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13527/000_ENA
new file mode 100644
index 0000000..a3d2804
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13527/000_ENA
@@ -0,0 +1,5 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1717 ERP002419 ERS226844 ERX223325 ERR248791 Phytophthora infestans Illumina Genome Analyzer IIx P13527 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248791/ERR248791_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248791/ERR248791_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248791/ERR248791_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248791/ERR248791_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13527_ID131_lane4_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226844 ERX223326 ERR248792 Phytophthora infestans Illumina Genome Analyzer IIx P13527 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248792/ERR248792_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248792/ERR248792_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248792/ERR248792_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248792/ERR248792_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13527_ID131_lane6_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226844 ERX223327 ERR248793 Phytophthora infestans Illumina Genome Analyzer IIx P13527 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248793/ERR248793_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248793/ERR248793_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248793/ERR248793_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248793/ERR248793_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13527_ID133_lane3_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226844 ERX223328 ERR248794 Phytophthora infestans Illumina Genome Analyzer IIx P13527 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248794/ERR248794_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248794/ERR248794_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248794/ERR248794_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248794/ERR248794_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13527_ID133_lane8_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13527/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13527/000_README
new file mode 100644
index 0000000..249db62
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13527/000_README
@@ -0,0 +1,2 @@
+Sample P13527, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS226844
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13626/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13626/000_ENA
new file mode 100644
index 0000000..08a52fc
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13626/000_ENA
@@ -0,0 +1,5 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1717 ERP002419 ERS226845 ERX223329 ERR248795 Phytophthora infestans Illumina Genome Analyzer IIx P13626 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248795/ERR248795_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248795/ERR248795_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248795/ERR248795_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248795/ERR248795_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13626_ID131_lane7_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226845 ERX223330 ERR248796 Phytophthora infestans Illumina Genome Analyzer IIx P13626 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248796/ERR248796_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248796/ERR248796_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248796/ERR248796_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248796/ERR248796_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13626_ID133_lane6_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226845 ERX223331 ERR248797 Phytophthora infestans Illumina Genome Analyzer IIx P13626 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248797/ERR248797_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248797/ERR248797_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248797/ERR248797_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248797/ERR248797_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13626_ID133_lane7_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226845 ERX223332 ERR248798 Phytophthora infestans Illumina Genome Analyzer IIx P13626 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248798/ERR248798_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248798/ERR248798_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248798/ERR248798_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248798/ERR248798_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P13626_ID133_lane8_NoIndex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13626/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13626/000_README
new file mode 100644
index 0000000..8fa4700
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P13626/000_README
@@ -0,0 +1,2 @@
+Sample P13626, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS226845
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P17777/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P17777/000_ENA
new file mode 100644
index 0000000..4c40baa
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P17777/000_ENA
@@ -0,0 +1,8 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB1717 ERP002419 ERS226847 ERX223333 ERR248799 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248799/ERR248799_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248799/ERR248799_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248799/ERR248799_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248799/ERR248799_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID109_lane1_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226847 ERX223334 ERR248800 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248800/ERR248800_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248800/ERR248800_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248800/ERR248800_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248800/ERR248800_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID112_lane6_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226847 ERX223335 ERR248801 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248801/ERR248801_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248801/ERR248801_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248801/ERR248801_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248801/ERR248801_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID112_lane7_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226847 ERX223336 ERR248802 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248802/ERR248802_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248802/ERR248802_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248802/ERR248802_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248802/ERR248802_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID112_lane8_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226847 ERX223337 ERR248803 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248803/ERR248803_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248803/ERR248803_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248803/ERR248803_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248803/ERR248803_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID114_lane7_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226847 ERX223338 ERR248804 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248804/ERR248804_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248804/ERR248804_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248804/ERR248804_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248804/ERR248804_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID114_lane8_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
+PRJEB1717 ERP002419 ERS226847 ERX223339 ERR248805 Phytophthora infestans Illumina Genome Analyzer IIx P17777 PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248805/ERR248805_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248805/ERR248805_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248805/ERR248805_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR248/ERR248805/ERR248805_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA204/ERA204660/fastq/P17777_ID119_lane7_Noindex_L.fastq.gz;ftp.sra.ebi.ac.uk/vol1/E [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P17777/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P17777/000_README
new file mode 100644
index 0000000..3d01fcc
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/P17777/000_README
@@ -0,0 +1,2 @@
+Sample P17777, from Yoshida et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS226847
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1845A/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1845A/000_ENA
new file mode 100644
index 0000000..2d223a4
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1845A/000_ENA
@@ -0,0 +1,10 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB4015 ERP003267 ERS258003 ERX272875 ERR299534 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CGCTAT SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299534/ERR299534.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299534/ERR299534.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_CGCTAT_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_CGCTAT_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258003 ERX272876 ERR299535 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CATAGA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299535/ERR299535.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299535/ERR299535.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run10_Kew91_ExpP38_sampleB_CATAGA_L002_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run10_Kew91_ExpP38_sampleB_CATAGA_L002_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258003 ERX272877 ERR299536 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CATAGA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299536/ERR299536.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299536/ERR299536.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_ExpP38_sampleB_CATAGA_L001_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_ExpP38_sampleB_CATAGA_L001_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258003 ERX272878 ERR299537 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CATAGA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299537/ERR299537.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299537/ERR299537.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_ExpP38_sampleB_CATAGA_L002_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_ExpP38_sampleB_CATAGA_L002_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258003 ERX272879 ERR299538 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CATAGA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299538/ERR299538.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299538/ERR299538.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_ExpP38_sampleB_CATAGA_L005_L006_L007_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/Kew91_ExpP38_sampleB_CATAGA_L005_L006_L007_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258003 ERX272906 ERR299565 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CGCTAT PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299565/ERR299565_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299565/ERR299565_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299565/ERR299565_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299565/ERR299565_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/run6_Kew91_CGCTAT_L008_R1_combined.fastq.gz;ftp.sra.ebi.ac.u [...]
+PRJEB4015 ERP003267 ERS258003 ERX272907 ERR299566 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CGCTAT PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299566/ERR299566_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299566/ERR299566_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299566/ERR299566_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299566/ERR299566_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/Kew91_CGCTAT_L006_R1_combined.fastq.gz;ftp.sra.ebi.ac.uk/vol [...]
+PRJEB4015 ERP003267 ERS258003 ERX272908 ERR299567 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CGCTAT PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299567/ERR299567_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299567/ERR299567_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299567/ERR299567_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299567/ERR299567_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/Kew91_CGCTAT_L007_R1_combined.fastq.gz;ftp.sra.ebi.ac.uk/vol [...]
+PRJEB4015 ERP003267 ERS258003 ERX272909 ERR299568 Phytophthora infestans Illumina HiSeq 2000 Pi1845A_id_CGCTAT PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299568/ERR299568_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299568/ERR299568_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299568/ERR299568_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299568/ERR299568_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/Kew91_CGCTAT_L008_R1_combined.fastq.gz;ftp.sra.ebi.ac.uk/vol [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1845A/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1845A/000_README
new file mode 100644
index 0000000..0b89384
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1845A/000_README
@@ -0,0 +1,2 @@
+Sample Pi1845A, from Martin et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS258003
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1889/000_ENA b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1889/000_ENA
new file mode 100644
index 0000000..5a9bb6d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1889/000_ENA
@@ -0,0 +1,16 @@
+study_accession secondary_study_accession sample_accession_list experiment_accession run_accession scientific_name instrument_model library_name library_layout fastq_ftp fastq_galaxy submitted_ftp submitted_galaxy col_taxonomy
+PRJEB4015 ERP003267 ERS258007 ERX272866 ERR299525 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_TAGCTT SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299525/ERR299525.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299525/ERR299525.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run8_K79_ExpP19_sample7_TAGCTT_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run8_K79_ExpP19_sample7_TAGCTT_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272867 ERR299526 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_GGCTAC SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299526/ERR299526.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299526/ERR299526.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run8_K79_ExpP19_sample8_GGCTAC_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run8_K79_ExpP19_sample8_GGCTAC_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272868 ERR299527 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_CTTGTA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299527/ERR299527.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299527/ERR299527.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run8_K79_ExpP19_sample9_CTTGTA_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/run8_K79_ExpP19_sample9_CTTGTA_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272869 ERR299528 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_TAGCTT SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299528/ERR299528.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299528/ERR299528.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample7_TAGCTT_L003_L004_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample7_TAGCTT_L003_L004_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272870 ERR299529 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_GGCTAC SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299529/ERR299529.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299529/ERR299529.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample8_GGCTAC_L003_L004_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample8_GGCTAC_L003_L004_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272871 ERR299530 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_CTTGTA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299530/ERR299530.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299530/ERR299530.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample9_CTTGTA_L003_L004_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample9_CTTGTA_L003_L004_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272872 ERR299531 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_TAGCTT SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299531/ERR299531.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299531/ERR299531.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample7_TAGCTT_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample7_TAGCTT_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272873 ERR299532 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_GGCTAC SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299532/ERR299532.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299532/ERR299532.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample8_GGCTAC_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample8_GGCTAC_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272874 ERR299533 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_CTTGTA SINGLE ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299533/ERR299533.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299533/ERR299533.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample9_CTTGTA_L006_R1_combined.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227639/fastq/K79_ExpP19_sample9_CTTGTA_L006_R1_combined.fastq.gz
+PRJEB4015 ERP003267 ERS258007 ERX272900 ERR299559 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_TAGCTT PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299559/ERR299559_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299559/ERR299559_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299559/ERR299559_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299559/ERR299559_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/ExpP19_sample7_TAGCTT_L003_R1_combined.fastq.gz;ftp.sra.ebi.a [...]
+PRJEB4015 ERP003267 ERS258007 ERX272901 ERR299560 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_GGCTAC PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299560/ERR299560_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299560/ERR299560_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299560/ERR299560_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299560/ERR299560_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/ExpP19_sample8_GGCTAC_L003_R1_combined.fastq.gz;ftp.sra.ebi.a [...]
+PRJEB4015 ERP003267 ERS258007 ERX272902 ERR299561 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_CTTGTA PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299561/ERR299561_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299561/ERR299561_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299561/ERR299561_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299561/ERR299561_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/ExpP19_sample9_CTTGTA_L003_R1_combined.fastq.gz;ftp.sra.ebi.a [...]
+PRJEB4015 ERP003267 ERS258007 ERX272903 ERR299562 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_TAGCTT PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299562/ERR299562_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299562/ERR299562_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299562/ERR299562_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299562/ERR299562_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/run4_ExpP19_sample7_TAGCTT_L003_R1_combined.fastq.gz;ftp.sra. [...]
+PRJEB4015 ERP003267 ERS258007 ERX272904 ERR299563 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_GGCTAC PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299563/ERR299563_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299563/ERR299563_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299563/ERR299563_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299563/ERR299563_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/run4_ExpP19_sample8_GGCTAC_L003_R1_combined.fastq.gz;ftp.sra. [...]
+PRJEB4015 ERP003267 ERS258007 ERX272905 ERR299564 Phytophthora infestans Illumina HiSeq 2000 Pi1889_id_CTTGTA PAIRED ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299564/ERR299564_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299564/ERR299564_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299564/ERR299564_1.fastq.gz;ftp.sra.ebi.ac.uk/vol1/fastq/ERR299/ERR299564/ERR299564_2.fastq.gz ftp.sra.ebi.ac.uk/vol1/ERA227/ERA227641/fastq/run4_ExpP19_sample9_CTTGTA_L003_R1_combined.fastq.gz;ftp.sra. [...]
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1889/000_README b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1889/000_README
new file mode 100644
index 0000000..ca57e73
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/Pi1889/000_README
@@ -0,0 +1,2 @@
+Sample Pi1889, from Martin et al, 2013:
+http://www.ebi.ac.uk/ena/data/view/ERS258007
diff --git a/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/setup.sh b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/setup.sh
new file mode 100755
index 0000000..6097b38
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/alignment/000_rawreads/setup.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+cd $(dirname $0)
+
+for ENA in `ls */000_ENA | sort`;
+do
+ SAMPLE=$(dirname ${ENA})
+ echo "Fetching FASTQ reads for sample '${SAMPLE}' ..."
+
+ tail -n +2 ${ENA} | cut -f10 | tr ";" "\n" |
+ while read URL;
+ do
+ FNAME=$(echo $URL | sed -e's#.*/##')
+ echo " - $FNAME"
+
+ if [ ! -e "${SAMPLE}/${FNAME}.DONE" ];
+ then
+ PREFIX=${FNAME/.fastq.gz/}
+ # Split into chunks of 10M reads
+ curl "${URL}" | gunzip | split -l 40000000 -a 3 - "${SAMPLE}/${PREFIX}_"
+
+ ls ${SAMPLE}/${PREFIX}_* |
+ while read FNAME;
+ do
+ mv ${FNAME} ${FNAME}.fastq
+ done
+
+ ls ${SAMPLE}/${PREFIX}_*.fastq | xargs -n 1 -P 8 gzip --verbose
+
+ touch "${SAMPLE}/${FNAME}.DONE";
+ fi
+ done
+done
+
+
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_mito.coverage b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_mito.coverage
new file mode 100644
index 0000000..989289f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_mito.coverage
@@ -0,0 +1,40 @@
+# Timestamp: 2013-10-22T17:26:38.684371
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+#                 name are combined into one row, with the size representing the
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# Hits: Sum of SE, PE_1, and PE_2 hits
+# SE, PE_1, PE_2: Number of Single Ended, and Pair Ended (mate 1 and 2) hits overlapping
+# the current contig or intervals. Note that a hit may be counted multiple
+# times if it overlaps multiple intervals
+# Collapsed: Number of hits for PE pair collapsed into a single read
+# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases relative to references
+#     Coverage:   Average number of bases covering each position in the contig(s)/interval(s).
+Name Sample Library Contig Size Hits SE PE_1 PE_2 Collapsed M I D Coverage
+M-0182896 * * * 37922 1868260 4642 29455 29734 1804429 157655063 13390 12420 4157.35095723
+M-0182896 * * gi|58012130|gb|AY894835.1| 37922 1868260 4642 29455 29734 1804429 157655063 13390 12420 4157.35095723
+#
+#
+M-0182896 M-0182896 * * 37922 1868260 4642 29455 29734 1804429 157655063 13390 12420 4157.35095723
+M-0182896 M-0182896 * gi|58012130|gb|AY894835.1| 37922 1868260 4642 29455 29734 1804429 157655063 13390 12420 4157.35095723
+#
+M-0182896 M-0182896 M-0182896_NO_UDG * 37922 6788 84 38 38 6628 548861 16 35 14.4734191235
+M-0182896 M-0182896 M-0182896_NO_UDG gi|58012130|gb|AY894835.1| 37922 6788 84 38 38 6628 548861 16 35 14.4734191235
+#
+M-0182896 M-0182896 M-0182896_UDG * 37922 67416 186 1110 1076 65044 5610565 499 447 147.950134487
+M-0182896 M-0182896 M-0182896_UDG gi|58012130|gb|AY894835.1| 37922 67416 186 1110 1076 65044 5610565 499 447 147.950134487
+#
+M-0182896 M-0182896 M-0182896_UDGa * 37922 598537 1503 9349 9430 578255 50450807 4287 4041 1330.38360319
+M-0182896 M-0182896 M-0182896_UDGa gi|58012130|gb|AY894835.1| 37922 598537 1503 9349 9430 578255 50450807 4287 4041 1330.38360319
+#
+M-0182896 M-0182896 M-0182896_UDGb * 37922 598195 1460 9375 9503 577857 50518911 4453 3937 1332.17950003
+M-0182896 M-0182896 M-0182896_UDGb gi|58012130|gb|AY894835.1| 37922 598195 1460 9375 9503 577857 50518911 4453 3937 1332.17950003
+#
+M-0182896 M-0182896 M-0182896_UDGc * 37922 597324 1409 9583 9687 576645 50525919 4135 3960 1332.36430041
+M-0182896 M-0182896 M-0182896_UDGc gi|58012130|gb|AY894835.1| 37922 597324 1409 9583 9687 576645 50525919 4135 3960 1332.36430041
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_mito.depths b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_mito.depths
new file mode 100644
index 0000000..30cde0b
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_mito.depths
@@ -0,0 +1,38 @@
+# Timestamp: 2013-10-22T17:27:06.139226
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+#                 name are combined into one row, with the size representing the
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#
+Name Sample Library Contig Size MaxDepth MD_001 MD_002 MD_003 MD_004 MD_005 MD_006 MD_007 MD_008 MD_009 MD_010 MD_011 MD_012 MD_013 MD_014 MD_015 MD_016 MD_017 MD_018 MD_019 MD_020 MD_021 MD_022 MD_023 MD_024 MD_025 MD_026 MD_027 MD_028 MD_029 MD_030 MD_031 MD_032 MD_033 MD_034 MD_035 MD_036 MD_037 MD_038 MD_039 MD_04 [...]
+M-0182896 * * * 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9998 0.9998 0.9998 0.9998 0.9997 0.9997 0.9997 0.999 [...]
+M-0182896 * * gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9998 0.9998 0.9998 0.9998 0.9997 0.9997 0.9997 0.999 [...]
+#
+#
+M-0182896 M-0182896 * * 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9998 0.9998 0.9998 0.9998 0.9997 0.9997 0.9997 0.999 [...]
+M-0182896 M-0182896 * gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9998 0.9998 0.9998 0.9998 0.9997 0.9997 0.9997 0.999 [...]
+#
+M-0182896 M-0182896 M-0182896_NO_UDG * 37922 43 0.9353 0.8682 0.8155 0.7693 0.7331 0.6947 0.6616 0.6330 0.6087 0.5823 0.5554 0.5304 0.5056 0.4816 0.4597 0.4369 0.4171 0.3945 0.3701 0.3476 0.3221 0.2985 0.2747 0.2523 0.2259 0.2006 0.1789 0.1577 0.1381 0.1173 0.1002 0.0842 0.0688 0.0562 0.0458 0.0355 0.0282 0.0223 0.0179 0.014 [...]
+M-0182896 M-0182896 M-0182896_NO_UDG gi|58012130|gb|AY894835.1| 37922 43 0.9353 0.8682 0.8155 0.7693 0.7331 0.6947 0.6616 0.6330 0.6087 0.5823 0.5554 0.5304 0.5056 0.4816 0.4597 0.4369 0.4171 0.3945 0.3701 0.3476 0.3221 0.2985 0.2747 0.2523 0.2259 0.2006 0.1789 0.1577 0.1381 0.1173 0.1002 0.0842 0.0688 0.0562 0.0458 0.0355 0.0282 0.0223 0.0179 0.014 [...]
+#
+M-0182896 M-0182896 M-0182896_UDG * 37922 NA 0.9999 0.9996 0.9994 0.9992 0.9984 0.9970 0.9953 0.9931 0.9907 0.9873 0.9821 0.9777 0.9733 0.9690 0.9648 0.9610 0.9585 0.9534 0.9486 0.9428 0.9376 0.9328 0.9272 0.9224 0.9173 0.9123 0.9076 0.9024 0.8983 0.8943 0.8891 0.8861 0.8821 0.8785 0.8753 0.8716 0.8685 0.8647 0.8612 0.857 [...]
+M-0182896 M-0182896 M-0182896_UDG gi|58012130|gb|AY894835.1| 37922 NA 0.9999 0.9996 0.9994 0.9992 0.9984 0.9970 0.9953 0.9931 0.9907 0.9873 0.9821 0.9777 0.9733 0.9690 0.9648 0.9610 0.9585 0.9534 0.9486 0.9428 0.9376 0.9328 0.9272 0.9224 0.9173 0.9123 0.9076 0.9024 0.8983 0.8943 0.8891 0.8861 0.8821 0.8785 0.8753 0.8716 0.8685 0.8647 0.8612 0.857 [...]
+#
+M-0182896 M-0182896 M-0182896_UDGa * 37922 NA 1.0000 1.0000 0.9999 0.9999 0.9999 0.9998 0.9998 0.9997 0.9997 0.9997 0.9997 0.9997 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9995 0.9995 0.9995 0.9995 0.9995 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.999 [...]
+M-0182896 M-0182896 M-0182896_UDGa gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 0.9999 0.9999 0.9999 0.9998 0.9998 0.9997 0.9997 0.9997 0.9997 0.9997 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9995 0.9995 0.9995 0.9995 0.9995 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.999 [...]
+#
+M-0182896 M-0182896 M-0182896_UDGb * 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9997 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.999 [...]
+M-0182896 M-0182896 M-0182896_UDGb gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9997 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.9995 0.999 [...]
+#
+M-0182896 M-0182896 M-0182896_UDGc * 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9997 0.9997 0.9997 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9995 0.9995 0.9995 0.9995 0.9995 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.999 [...]
+M-0182896 M-0182896 M-0182896_UDGc gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 1.0000 1.0000 1.0000 1.0000 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9999 0.9998 0.9998 0.9997 0.9997 0.9997 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9996 0.9995 0.9995 0.9995 0.9995 0.9995 0.9994 0.9994 0.9994 0.9994 0.9994 0.9994 0.999 [...]
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_nucl.coverage b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_nucl.coverage
new file mode 100644
index 0000000..92d0723
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_nucl.coverage
@@ -0,0 +1,40 @@
+# Timestamp: 2013-10-24T15:04:28.208831
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+#                 name are combined into one row, with the size representing the
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# Hits: Sum of SE, PE_1, and PE_2 hits
+# SE, PE_1, PE_2: Number of Single Ended, and Pair Ended (mate 1 and 2) hits overlapping
+# the current contig or intervals. Note that a hit may be counted multiple
+# times if it overlaps multiple intervals
+# Collapsed: Number of hits for PE pair collapsed into a single read
+# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases relative to references
+#     Coverage:   Average number of bases covering each position in the contig(s)/interval(s).
+Name Sample Library Contig Size Hits SE PE_1 PE_2 Collapsed M I D Coverage
+M-0182896 * * * 228543505 35730015 242414 2329344 2316559 30841698 3513022562 862978 923586 15.3713515595
+M-0182896 * * <Genome> 228543505 35730015 242414 2329344 2316559 30841698 3513022562 862978 923586 15.3713515595
+#
+#
+M-0182896 M-0182896 * * 228543505 35730015 242414 2329344 2316559 30841698 3513022562 862978 923586 15.3713515595
+M-0182896 M-0182896 * <Genome> 228543505 35730015 242414 2329344 2316559 30841698 3513022562 862978 923586 15.3713515595
+#
+M-0182896 M-0182896 M-0182896_NO_UDG * 228543505 172069 3332 3713 3741 161283 15345286 2752 3146 0.0671438289178
+M-0182896 M-0182896 M-0182896_NO_UDG <Genome> 228543505 172069 3332 3713 3741 161283 15345286 2752 3146 0.0671438289178
+#
+M-0182896 M-0182896 M-0182896_UDG * 228543505 1411559 7194 74595 74473 1255297 128517262 28244 30720 0.562331718856
+M-0182896 M-0182896 M-0182896_UDG <Genome> 228543505 1411559 7194 74595 74473 1255297 128517262 28244 30720 0.562331718856
+#
+M-0182896 M-0182896 M-0182896_UDGa * 228543505 11562603 81905 749543 745279 9985876 1135262865 280835 301328 4.96738187769
+M-0182896 M-0182896 M-0182896_UDGa <Genome> 228543505 11562603 81905 749543 745279 9985876 1135262865 280835 301328 4.96738187769
+#
+M-0182896 M-0182896 M-0182896_UDGb * 228543505 11383290 78055 751807 746484 9806944 1123721041 277276 298173 4.91688022812
+M-0182896 M-0182896 M-0182896_UDGb <Genome> 228543505 11383290 78055 751807 746484 9806944 1123721041 277276 298173 4.91688022812
+#
+M-0182896 M-0182896 M-0182896_UDGc * 228543505 11200494 71928 749686 746582 9632298 1110176108 273871 290219 4.85761390594
+M-0182896 M-0182896 M-0182896_UDGc <Genome> 228543505 11200494 71928 749686 746582 9632298 1110176108 273871 290219 4.85761390594
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_nucl.depths b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_nucl.depths
new file mode 100644
index 0000000..77fbbca
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.Pi_nucl.depths
@@ -0,0 +1,38 @@
+# Timestamp: 2013-10-24T15:19:32.822018
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+#                 name are combined into one row, with the size representing the
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#
+Name Sample Library Contig Size MaxDepth MD_001 MD_002 MD_003 MD_004 MD_005 MD_006 MD_007 MD_008 MD_009 MD_010 MD_011 MD_012 MD_013 MD_014 MD_015 MD_016 MD_017 MD_018 MD_019 MD_020 MD_021 MD_022 MD_023 MD_024 MD_025 MD_026 MD_027 MD_028 MD_029 MD_030 MD_031 MD_032 MD_033 MD_034 MD_035 MD_036 MD_037 MD_038 MD_039 MD_040 MD_041 MD_0 [...]
+M-0182896 * * * 0 92 0.5257 0.4887 0.4699 0.4566 0.4460 0.4369 0.4288 0.4213 0.4142 0.4075 0.4010 0.3947 0.3884 0.3823 0.3761 0.3699 0.3637 0.3574 0.3510 0.3445 0.3379 0.3310 0.3239 0.3166 0.3090 0.3011 0.2929 0.2843 0.2753 0.2660 0.2563 0.2464 0.2362 0.2257 0.2150 0.2042 0.1933 0.1824 0.1716 0.1609 0.1505 0.14 [...]
+M-0182896 * * <Genome> 0 92 0.5257 0.4887 0.4699 0.4566 0.4460 0.4369 0.4288 0.4213 0.4142 0.4075 0.4010 0.3947 0.3884 0.3823 0.3761 0.3699 0.3637 0.3574 0.3510 0.3445 0.3379 0.3310 0.3239 0.3166 0.3090 0.3011 0.2929 0.2843 0.2753 0.2660 0.2563 0.2464 0.2362 0.2257 0.2150 0.2042 0.1933 0.1824 0.1716 0.1609 0.1505 0.14 [...]
+#
+#
+M-0182896 M-0182896 * * 0 92 0.5257 0.4887 0.4699 0.4566 0.4460 0.4369 0.4288 0.4213 0.4142 0.4075 0.4010 0.3947 0.3884 0.3823 0.3761 0.3699 0.3637 0.3574 0.3510 0.3445 0.3379 0.3310 0.3239 0.3166 0.3090 0.3011 0.2929 0.2843 0.2753 0.2660 0.2563 0.2464 0.2362 0.2257 0.2150 0.2042 0.1933 0.1824 0.1716 0.1609 0.1505 0.14 [...]
+M-0182896 M-0182896 * <Genome> 0 92 0.5257 0.4887 0.4699 0.4566 0.4460 0.4369 0.4288 0.4213 0.4142 0.4075 0.4010 0.3947 0.3884 0.3823 0.3761 0.3699 0.3637 0.3574 0.3510 0.3445 0.3379 0.3310 0.3239 0.3166 0.3090 0.3011 0.2929 0.2843 0.2753 0.2660 0.2563 0.2464 0.2362 0.2257 0.2150 0.2042 0.1933 0.1824 0.1716 0.1609 0.1505 0.14 [...]
+#
+M-0182896 M-0182896 M-0182896_NO_UDG * 0 3 0.0608 0.0055 0.0005 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.00 [...]
+M-0182896 M-0182896 M-0182896_NO_UDG <Genome> 0 3 0.0608 0.0055 0.0005 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.00 [...]
+#
+M-0182896 M-0182896 M-0182896_UDG * 0 6 0.2978 0.1531 0.0670 0.0255 0.0088 0.0030 0.0012 0.0006 0.0003 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.00 [...]
+M-0182896 M-0182896 M-0182896_UDG <Genome> 0 6 0.2978 0.1531 0.0670 0.0255 0.0088 0.0030 0.0012 0.0006 0.0003 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.00 [...]
+#
+M-0182896 M-0182896 M-0182896_UDGa * 0 32 0.4814 0.4402 0.4145 0.3928 0.3721 0.3510 0.3286 0.3047 0.2790 0.2518 0.2236 0.1951 0.1672 0.1407 0.1163 0.0946 0.0758 0.0599 0.0467 0.0361 0.0277 0.0211 0.0161 0.0124 0.0096 0.0075 0.0060 0.0049 0.0041 0.0035 0.0030 0.0026 0.0023 0.0021 0.0019 0.0017 0.0016 0.0015 0.0013 0.0012 0.0011 0.00 [...]
+M-0182896 M-0182896 M-0182896_UDGa <Genome> 0 32 0.4814 0.4402 0.4145 0.3928 0.3721 0.3510 0.3286 0.3047 0.2790 0.2518 0.2236 0.1951 0.1672 0.1407 0.1163 0.0946 0.0758 0.0599 0.0467 0.0361 0.0277 0.0211 0.0161 0.0124 0.0096 0.0075 0.0060 0.0049 0.0041 0.0035 0.0030 0.0026 0.0023 0.0021 0.0019 0.0017 0.0016 0.0015 0.0013 0.0012 0.0011 0.00 [...]
+#
+M-0182896 M-0182896 M-0182896_UDGb * 0 32 0.4816 0.4402 0.4144 0.3925 0.3714 0.3499 0.3270 0.3024 0.2761 0.2483 0.2196 0.1909 0.1630 0.1367 0.1127 0.0914 0.0729 0.0574 0.0447 0.0345 0.0264 0.0202 0.0154 0.0119 0.0092 0.0073 0.0058 0.0048 0.0040 0.0034 0.0029 0.0025 0.0023 0.0020 0.0018 0.0017 0.0015 0.0014 0.0013 0.0012 0.0011 0.00 [...]
+M-0182896 M-0182896 M-0182896_UDGb <Genome> 0 32 0.4816 0.4402 0.4144 0.3925 0.3714 0.3499 0.3270 0.3024 0.2761 0.2483 0.2196 0.1909 0.1630 0.1367 0.1127 0.0914 0.0729 0.0574 0.0447 0.0345 0.0264 0.0202 0.0154 0.0119 0.0092 0.0073 0.0058 0.0048 0.0040 0.0034 0.0029 0.0025 0.0023 0.0020 0.0018 0.0017 0.0015 0.0014 0.0013 0.0012 0.0011 0.00 [...]
+#
+M-0182896 M-0182896 M-0182896_UDGc * 0 32 0.4828 0.4404 0.4141 0.3918 0.3703 0.3482 0.3247 0.2995 0.2725 0.2440 0.2150 0.1861 0.1583 0.1322 0.1085 0.0876 0.0698 0.0548 0.0426 0.0328 0.0251 0.0192 0.0147 0.0113 0.0088 0.0069 0.0055 0.0045 0.0038 0.0032 0.0028 0.0024 0.0022 0.0019 0.0017 0.0016 0.0014 0.0013 0.0012 0.0011 0.0010 0.00 [...]
+M-0182896 M-0182896 M-0182896_UDGc <Genome> 0 32 0.4828 0.4404 0.4141 0.3918 0.3703 0.3482 0.3247 0.2995 0.2725 0.2440 0.2150 0.1861 0.1583 0.1322 0.1085 0.0876 0.0698 0.0548 0.0426 0.0328 0.0251 0.0192 0.0147 0.0113 0.0088 0.0069 0.0055 0.0045 0.0038 0.0032 0.0028 0.0024 0.0022 0.0019 0.0017 0.0016 0.0014 0.0013 0.0012 0.0011 0.0010 0.00 [...]
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.summary b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.summary
new file mode 100644
index 0000000..ee87aaa
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/M-0182896.summary
@@ -0,0 +1,301 @@
+# Command:
+# /home/mischu/bin/pypeline/bin/bam_pipeline run 000_makefile.yaml --max-threads 16 --bwa-max-threads 4
+#
+# Directory:
+# /net/franklin/disk/franklin/data/mischu/projects/2013_09_nature_protocols/FINAL/alignment
+#
+# Makefile:
+# Filename: 000_makefile.yaml
+# SHA1Sum: ee7644dd0ecfee606a2873441020247def0e2355
+# MTime: 2013-10-21 17:37:32.317334
+#
+# Genomes:
+# Name Label Contigs Size Prefix
+# Pi_mito mitochondrial 1 37922 000_prefixes/Pi_mito.fasta
+# Pi_nucl nuclear 4921 228543505 000_prefixes/Pi_nucl.fasta
+#
+# Regions Of Interest:
+# Genome ROI Size NFeatures NIntervals Path
+#
+#
+Target Sample Library Measure Value # Description
+M-0182896 * * lib_type PE # SE, PE, or * (for both)
+M-0182896 * * seq_reads_pairs 577011167 # Total number of reads
+M-0182896 * * seq_trash_pe_1 5749672 # Total number of reads
+M-0182896 * * seq_trash_pe_1_frac 0.00996457664744 # Fraction of PE mate 1 reads trashed
+M-0182896 * * seq_trash_pe_2 8627683 # Total number of reads
+M-0182896 * * seq_trash_pe_2_frac 0.0149523674643 # Fraction of PE mate 2 reads trashed
+M-0182896 * * seq_collapsed 538271175 # Total number of pairs collapsed into one read
+M-0182896 * * seq_collapsed_frac 0.932860931962 # Fraction of PE pairs collapsed into one read
+M-0182896 * * seq_retained_reads 601373804 # Total number of retained reads
+M-0182896 * * seq_retained_nts 57554989302 # Total number of NTs in retained reads
+M-0182896 * * seq_retained_length 95.7058470442 # Average number of NTs in retained reads
+
+M-0182896 * * hits_raw(endogenous) 38165072 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 * * hits_raw_frac(endogenous) 0.0634631434661 # Total number of hits vs. total number of reads retained
+M-0182896 * * hits_clonality(endogenous) 0.0148511969269 # Fraction of hits that were PCR duplicates
+M-0182896 * * hits_unique(endogenous) 37598275 # Total number of unique reads (PCR duplicates removed)
+M-0182896 * * hits_unique_frac(endogenous) 0.0625206398249 # Total number of unique hits vs. total number of reads retained
+M-0182896 * * hits_coverage(endogenous) 16.0585121599 # Estimated coverage from unique hits
+M-0182896 * * hits_length(endogenous) 97.6288839049 # Average number of aligned bases per unique hit
+M-0182896 * * ratio_reads(nuc,mito) 19.1247551197 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 * * ratio_genome(mito,nuc) 540.921979584 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 * * ratio_genome(nuc,mito) 0.00184869544545 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 * * hits_raw(mitochondrial) 2155421 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 * * hits_raw_frac(mitochondrial) 0.0035841617737 # Total number of hits vs. total number of reads retained
+M-0182896 * * hits_clonality(mitochondrial) 0.133227337026 # Fraction of hits that were PCR duplicates
+M-0182896 * * hits_unique(mitochondrial) 1868260 # Total number of hits (excluding any PCR duplicates)
+M-0182896 * * hits_unique_frac(mitochondrial) 0.00310665344512 # Total number of unique hits vs. total number of reads retained
+M-0182896 * * hits_coverage(mitochondrial) 4157.35095723 # Estimated coverage from unique hits
+M-0182896 * * hits_length(mitochondrial) 84.3860399516 # Average number of aligned bases per unique hit
+
+M-0182896 * * hits_raw(nuclear) 36009651 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 * * hits_raw_frac(nuclear) 0.0598789816924 # Total number of hits vs. total number of reads retained
+M-0182896 * * hits_clonality(nuclear) 0.0077655848428 # Fraction of hits that were PCR duplicates
+M-0182896 * * hits_unique(nuclear) 35730015 # Total number of hits (excluding any PCR duplicates)
+M-0182896 * * hits_unique_frac(nuclear) 0.0594139863798 # Total number of unique hits vs. total number of reads retained
+M-0182896 * * hits_coverage(nuclear) 15.3713515595 # Estimated coverage from unique hits
+M-0182896 * * hits_length(nuclear) 98.3213290563 # Average number of aligned bases per unique hit
+
+
+M-0182896 M-0182896 * lib_type PE # SE, PE, or * (for both)
+M-0182896 M-0182896 * seq_reads_pairs 577011167 # Total number of reads
+M-0182896 M-0182896 * seq_trash_pe_1 5749672 # Total number of reads
+M-0182896 M-0182896 * seq_trash_pe_1_frac 0.00996457664744 # Fraction of PE mate 1 reads trashed
+M-0182896 M-0182896 * seq_trash_pe_2 8627683 # Total number of reads
+M-0182896 M-0182896 * seq_trash_pe_2_frac 0.0149523674643 # Fraction of PE mate 2 reads trashed
+M-0182896 M-0182896 * seq_collapsed 538271175 # Total number of pairs collapsed into one read
+M-0182896 M-0182896 * seq_collapsed_frac 0.932860931962 # Fraction of PE pairs collapsed into one read
+M-0182896 M-0182896 * seq_retained_reads 601373804 # Total number of retained reads
+M-0182896 M-0182896 * seq_retained_nts 57554989302 # Total number of NTs in retained reads
+M-0182896 M-0182896 * seq_retained_length 95.7058470442 # Average number of NTs in retained reads
+
+M-0182896 M-0182896 * hits_raw(endogenous) 38165072 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 M-0182896 * hits_raw_frac(endogenous) 0.0634631434661 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 * hits_clonality(endogenous) 0.0148511969269 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 * hits_unique(endogenous) 37598275 # Total number of unique reads (PCR duplicates removed)
+M-0182896 M-0182896 * hits_unique_frac(endogenous) 0.0625206398249 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 * hits_coverage(endogenous) 16.0585121599 # Estimated coverage from unique hits
+M-0182896 M-0182896 * hits_length(endogenous) 97.6288839049 # Average number of aligned bases per unique hit
+M-0182896 M-0182896 * ratio_reads(nuc,mito) 19.1247551197 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 M-0182896 * ratio_genome(mito,nuc) 540.921979584 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 M-0182896 * ratio_genome(nuc,mito) 0.00184869544545 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 M-0182896 * hits_raw(mitochondrial) 2155421 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 * hits_raw_frac(mitochondrial) 0.0035841617737 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 * hits_clonality(mitochondrial) 0.133227337026 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 * hits_unique(mitochondrial) 1868260 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 * hits_unique_frac(mitochondrial) 0.00310665344512 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 * hits_coverage(mitochondrial) 4157.35095723 # Estimated coverage from unique hits
+M-0182896 M-0182896 * hits_length(mitochondrial) 84.3860399516 # Average number of aligned bases per unique hit
+
+M-0182896 M-0182896 * hits_raw(nuclear) 36009651 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 * hits_raw_frac(nuclear) 0.0598789816924 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 * hits_clonality(nuclear) 0.0077655848428 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 * hits_unique(nuclear) 35730015 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 * hits_unique_frac(nuclear) 0.0594139863798 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 * hits_coverage(nuclear) 15.3713515595 # Estimated coverage from unique hits
+M-0182896 M-0182896 * hits_length(nuclear) 98.3213290563 # Average number of aligned bases per unique hit
+
+
+M-0182896 M-0182896 M-0182896_NO_UDG lib_type PE # SE, PE, or * (for both)
+M-0182896 M-0182896 M-0182896_NO_UDG seq_reads_pairs 2735619 # Total number of reads
+M-0182896 M-0182896 M-0182896_NO_UDG seq_trash_pe_1 46091 # Total number of reads
+M-0182896 M-0182896 M-0182896_NO_UDG seq_trash_pe_1_frac 0.0168484719546 # Fraction of PE mate 1 reads trashed
+M-0182896 M-0182896 M-0182896_NO_UDG seq_trash_pe_2 122300 # Total number of reads
+M-0182896 M-0182896 M-0182896_NO_UDG seq_trash_pe_2_frac 0.0447065179764 # Fraction of PE mate 2 reads trashed
+M-0182896 M-0182896 M-0182896_NO_UDG seq_collapsed 2540128 # Total number of pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_NO_UDG seq_collapsed_frac 0.928538659806 # Fraction of PE pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_NO_UDG seq_retained_reads 2762719 # Total number of retained reads
+M-0182896 M-0182896 M-0182896_NO_UDG seq_retained_nts 236889647 # Total number of NTs in retained reads
+M-0182896 M-0182896 M-0182896_NO_UDG seq_retained_length 85.7451108853 # Average number of NTs in retained reads
+
+M-0182896 M-0182896 M-0182896_NO_UDG hits_raw(endogenous) 179893 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 M-0182896 M-0182896_NO_UDG hits_raw_frac(endogenous) 0.0651144759927 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_NO_UDG hits_clonality(endogenous) 0.0057589789486 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_NO_UDG hits_unique(endogenous) 178857 # Total number of unique reads (PCR duplicates removed)
+M-0182896 M-0182896 M-0182896_NO_UDG hits_unique_frac(endogenous) 0.0647394830962 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_NO_UDG hits_coverage(endogenous) 0.069533851497 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_NO_UDG hits_length(endogenous) 88.8651101159 # Average number of aligned bases per unique hit
+M-0182896 M-0182896 M-0182896_NO_UDG ratio_reads(nuc,mito) 25.3489982322 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 M-0182896 M-0182896_NO_UDG ratio_genome(mito,nuc) 431.116883167 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 M-0182896 M-0182896_NO_UDG ratio_genome(nuc,mito) 0.00231955657281 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 M-0182896 M-0182896_NO_UDG hits_raw(mitochondrial) 6848 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_NO_UDG hits_raw_frac(mitochondrial) 0.00247871752429 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_NO_UDG hits_clonality(mitochondrial) 0.00876168224299 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_NO_UDG hits_unique(mitochondrial) 6788 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_NO_UDG hits_unique_frac(mitochondrial) 0.00245699978898 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_NO_UDG hits_coverage(mitochondrial) 14.4734191235 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_NO_UDG hits_length(mitochondrial) 80.8575427225 # Average number of aligned bases per unique hit
+
+M-0182896 M-0182896 M-0182896_NO_UDG hits_raw(nuclear) 173045 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_NO_UDG hits_raw_frac(nuclear) 0.0626357584684 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_NO_UDG hits_clonality(nuclear) 0.0056401514057 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_NO_UDG hits_unique(nuclear) 172069 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_NO_UDG hits_unique_frac(nuclear) 0.0622824833072 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_NO_UDG hits_coverage(nuclear) 0.0671438289178 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_NO_UDG hits_length(nuclear) 89.1810029697 # Average number of aligned bases per unique hit
+
+
+M-0182896 M-0182896 M-0182896_UDG lib_type PE # SE, PE, or * (for both)
+M-0182896 M-0182896 M-0182896_UDG seq_reads_pairs 21079271 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDG seq_trash_pe_1 164780 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDG seq_trash_pe_1_frac 0.00781715838275 # Fraction of PE mate 1 reads trashed
+M-0182896 M-0182896 M-0182896_UDG seq_trash_pe_2 251004 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDG seq_trash_pe_2_frac 0.011907622422 # Fraction of PE mate 2 reads trashed
+M-0182896 M-0182896 M-0182896_UDG seq_collapsed 19833336 # Total number of pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDG seq_collapsed_frac 0.940892880024 # Fraction of PE pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDG seq_retained_reads 21909422 # Total number of retained reads
+M-0182896 M-0182896 M-0182896_UDG seq_retained_nts 2000628209 # Total number of NTs in retained reads
+M-0182896 M-0182896 M-0182896_UDG seq_retained_length 91.3136005596 # Average number of NTs in retained reads
+
+M-0182896 M-0182896 M-0182896_UDG hits_raw(endogenous) 1487329 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 M-0182896 M-0182896_UDG hits_raw_frac(endogenous) 0.0678853600063 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDG hits_clonality(endogenous) 0.0056167801475 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDG hits_unique(endogenous) 1478975 # Total number of unique reads (PCR duplicates removed)
+M-0182896 M-0182896 M-0182896_UDG hits_unique_frac(endogenous) 0.0675040628639 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDG hits_coverage(endogenous) 0.586783575378 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDG hits_length(endogenous) 90.6897188932 # Average number of aligned bases per unique hit
+M-0182896 M-0182896 M-0182896_UDG ratio_reads(nuc,mito) 20.9380414145 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 M-0182896 M-0182896_UDG ratio_genome(mito,nuc) 526.202344721 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 M-0182896 M-0182896_UDG ratio_genome(nuc,mito) 0.00190040962385 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 M-0182896 M-0182896_UDG hits_raw(mitochondrial) 68828 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDG hits_raw_frac(mitochondrial) 0.00314147949681 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDG hits_clonality(mitochondrial) 0.020514906724 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDG hits_unique(mitochondrial) 67416 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDG hits_unique_frac(mitochondrial) 0.00307703233796 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDG hits_coverage(mitochondrial) 147.950134487 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDG hits_length(mitochondrial) 83.2230479411 # Average number of aligned bases per unique hit
+
+M-0182896 M-0182896 M-0182896_UDG hits_raw(nuclear) 1418501 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDG hits_raw_frac(nuclear) 0.0647438805095 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDG hits_clonality(nuclear) 0.00489389855911 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDG hits_unique(nuclear) 1411559 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDG hits_unique_frac(nuclear) 0.064427030526 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDG hits_coverage(nuclear) 0.562331718856 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDG hits_length(nuclear) 91.0463267919 # Average number of aligned bases per unique hit
+
+
+M-0182896 M-0182896 M-0182896_UDGa lib_type PE # SE, PE, or * (for both)
+M-0182896 M-0182896 M-0182896_UDGa seq_reads_pairs 185927902 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGa seq_trash_pe_1 1730901 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGa seq_trash_pe_1_frac 0.0093095279481 # Fraction of PE mate 1 reads trashed
+M-0182896 M-0182896 M-0182896_UDGa seq_trash_pe_2 2697738 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGa seq_trash_pe_2_frac 0.014509592003 # Fraction of PE mate 2 reads trashed
+M-0182896 M-0182896 M-0182896_UDGa seq_collapsed 173571472 # Total number of pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDGa seq_collapsed_frac 0.933541819882 # Fraction of PE pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDGa seq_retained_reads 193855693 # Total number of retained reads
+M-0182896 M-0182896 M-0182896_UDGa seq_retained_nts 18533873382 # Total number of NTs in retained reads
+M-0182896 M-0182896 M-0182896_UDGa seq_retained_length 95.6065467832 # Average number of NTs in retained reads
+
+M-0182896 M-0182896 M-0182896_UDGa hits_raw(endogenous) 12348320 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 M-0182896 M-0182896_UDGa hits_raw_frac(endogenous) 0.0636985161947 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGa hits_clonality(endogenous) 0.0151583373285 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGa hits_unique(endogenous) 12161140 # Total number of unique reads (PCR duplicates removed)
+M-0182896 M-0182896 M-0182896_UDGa hits_unique_frac(endogenous) 0.0627329525989 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGa hits_coverage(endogenous) 5.18727040758 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGa hits_length(endogenous) 97.5002073819 # Average number of aligned bases per unique hit
+M-0182896 M-0182896 M-0182896_UDGa ratio_reads(nuc,mito) 19.3181089891 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 M-0182896 M-0182896_UDGa ratio_genome(mito,nuc) 535.647806407 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 M-0182896 M-0182896_UDGa ratio_genome(nuc,mito) 0.00186689833887 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 M-0182896 M-0182896_UDGa hits_raw(mitochondrial) 694267 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDGa hits_raw_frac(mitochondrial) 0.00358135987268 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGa hits_clonality(mitochondrial) 0.137886432741 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGa hits_unique(mitochondrial) 598537 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDGa hits_unique_frac(mitochondrial) 0.00308753893547 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGa hits_coverage(mitochondrial) 1330.38360319 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGa hits_length(mitochondrial) 84.2902059522 # Average number of aligned bases per unique hit
+
+M-0182896 M-0182896 M-0182896_UDGa hits_raw(nuclear) 11654053 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDGa hits_raw_frac(nuclear) 0.060117156322 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGa hits_clonality(nuclear) 0.00784705544071 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGa hits_unique(nuclear) 11562603 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDGa hits_unique_frac(nuclear) 0.0596454136635 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGa hits_coverage(nuclear) 4.96738187769 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGa hits_length(nuclear) 98.1840217986 # Average number of aligned bases per unique hit
+
+
+M-0182896 M-0182896 M-0182896_UDGb lib_type PE # SE, PE, or * (for both)
+M-0182896 M-0182896 M-0182896_UDGb seq_reads_pairs 184471049 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGb seq_trash_pe_1 1910477 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGb seq_trash_pe_1_frac 0.0103565139915 # Fraction of PE mate 1 reads trashed
+M-0182896 M-0182896 M-0182896_UDGb seq_trash_pe_2 2831698 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGb seq_trash_pe_2_frac 0.0153503653573 # Fraction of PE mate 2 reads trashed
+M-0182896 M-0182896 M-0182896_UDGb seq_collapsed 171961917 # Total number of pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDGb seq_collapsed_frac 0.932189185957 # Fraction of PE pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDGb seq_retained_reads 192238006 # Total number of retained reads
+M-0182896 M-0182896 M-0182896_UDGb seq_retained_nts 18440675007 # Total number of NTs in retained reads
+M-0182896 M-0182896 M-0182896_UDGb seq_retained_length 95.926270724 # Average number of NTs in retained reads
+
+M-0182896 M-0182896 M-0182896_UDGb hits_raw(endogenous) 12166201 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 M-0182896 M-0182896_UDGb hits_raw_frac(endogenous) 0.0632871784989 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGb hits_clonality(endogenous) 0.0151827180892 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGb hits_unique(endogenous) 11981485 # Total number of unique reads (PCR duplicates removed)
+M-0182896 M-0182896 M-0182896_UDGb hits_unique_frac(endogenous) 0.0623263071091 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGb hits_coverage(endogenous) 5.13707507828 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGb hits_length(endogenous) 98.0045421749 # Average number of aligned bases per unique hit
+M-0182896 M-0182896 M-0182896_UDGb ratio_reads(nuc,mito) 19.0293967686 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 M-0182896 M-0182896_UDGb ratio_genome(mito,nuc) 541.879988212 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 M-0182896 M-0182896_UDGb ratio_genome(nuc,mito) 0.00184542707196 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 M-0182896 M-0182896_UDGb hits_raw(mitochondrial) 693559 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDGb hits_raw_frac(mitochondrial) 0.00360781415929 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGb hits_clonality(mitochondrial) 0.137499477334 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGb hits_unique(mitochondrial) 598195 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDGb hits_unique_frac(mitochondrial) 0.00311174159807 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGb hits_coverage(mitochondrial) 1332.17950003 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGb hits_length(mitochondrial) 84.4522455052 # Average number of aligned bases per unique hit
+
+M-0182896 M-0182896 M-0182896_UDGb hits_raw(nuclear) 11472642 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDGb hits_raw_frac(nuclear) 0.0596793643396 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGb hits_clonality(nuclear) 0.0077882670792 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGb hits_unique(nuclear) 11383290 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDGb hits_unique_frac(nuclear) 0.059214565511 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGb hits_coverage(nuclear) 4.91688022812 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGb hits_length(nuclear) 98.716719068 # Average number of aligned bases per unique hit
+
+
+M-0182896 M-0182896 M-0182896_UDGc lib_type PE # SE, PE, or * (for both)
+M-0182896 M-0182896 M-0182896_UDGc seq_reads_pairs 182797326 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGc seq_trash_pe_1 1897423 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGc seq_trash_pe_1_frac 0.0103799275488 # Fraction of PE mate 1 reads trashed
+M-0182896 M-0182896 M-0182896_UDGc seq_trash_pe_2 2724943 # Total number of reads
+M-0182896 M-0182896 M-0182896_UDGc seq_trash_pe_2_frac 0.0149069084304 # Fraction of PE mate 2 reads trashed
+M-0182896 M-0182896 M-0182896_UDGc seq_collapsed 170364322 # Total number of pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDGc seq_collapsed_frac 0.931984759996 # Fraction of PE pairs collapsed into one read
+M-0182896 M-0182896 M-0182896_UDGc seq_retained_reads 190607964 # Total number of retained reads
+M-0182896 M-0182896 M-0182896_UDGc seq_retained_nts 18342923057 # Total number of NTs in retained reads
+M-0182896 M-0182896 M-0182896_UDGc seq_retained_length 96.2337704683 # Average number of NTs in retained reads
+
+M-0182896 M-0182896 M-0182896_UDGc hits_raw(endogenous) 11983329 # Total number of hits against the nuclear and mitochondrial genome
+M-0182896 M-0182896 M-0182896_UDGc hits_raw_frac(endogenous) 0.0628689837955 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGc hits_clonality(endogenous) 0.0154807566412 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGc hits_unique(endogenous) 11797818 # Total number of unique reads (PCR duplicates removed)
+M-0182896 M-0182896 M-0182896_UDGc hits_unique_frac(endogenous) 0.061895724357 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGc hits_coverage(endogenous) 5.07784924713 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGc hits_length(endogenous) 98.3827710344 # Average number of aligned bases per unique hit
+M-0182896 M-0182896 M-0182896_UDGc ratio_reads(nuc,mito) 18.7511199952 # Ratio of unique hits: Hits(nuc) / H(mito)
+M-0182896 M-0182896 M-0182896_UDGc ratio_genome(mito,nuc) 548.567393871 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+M-0182896 M-0182896 M-0182896_UDGc ratio_genome(nuc,mito) 0.00182293007418 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+M-0182896 M-0182896 M-0182896_UDGc hits_raw(mitochondrial) 691919 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDGc hits_raw_frac(mitochondrial) 0.00363006343219 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGc hits_clonality(mitochondrial) 0.136713979527 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGc hits_unique(mitochondrial) 597324 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDGc hits_unique_frac(mitochondrial) 0.00313378301444 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGc hits_coverage(mitochondrial) 1332.36430041 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGc hits_length(mitochondrial) 84.5871235711 # Average number of aligned bases per unique hit
+
+M-0182896 M-0182896 M-0182896_UDGc hits_raw(nuclear) 11291410 # Total number of hits (prior to PCR duplicate filtering)
+M-0182896 M-0182896 M-0182896_UDGc hits_raw_frac(nuclear) 0.0592389203633 # Total number of hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGc hits_clonality(nuclear) 0.00805178449813 # Fraction of hits that were PCR duplicates
+M-0182896 M-0182896 M-0182896_UDGc hits_unique(nuclear) 11200494 # Total number of hits (excluding any PCR duplicates)
+M-0182896 M-0182896 M-0182896_UDGc hits_unique_frac(nuclear) 0.0587619413426 # Total number of unique hits vs. total number of reads retained
+M-0182896 M-0182896 M-0182896_UDGc hits_coverage(nuclear) 4.85761390594 # Estimated coverage from unique hits
+M-0182896 M-0182896 M-0182896_UDGc hits_length(nuclear) 99.1184949521 # Average number of aligned bases per unique hit
+
+
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.coverage b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.coverage
new file mode 100644
index 0000000..57c8353
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.coverage
@@ -0,0 +1,31 @@
+# Timestamp: 2013-10-22T17:45:16.276460
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# Hits: Sum of SE, PE_1, and PE_2 hits
+# SE, PE_1, PE_2: Number of Single Ended, and Pair Ended (mate 1 and 2) hits overlapping
+# the current contig or intervals. Note that a hit may be counted multiple
+# times if it overlaps multiple intervals
+# Collapsed: Number of hits for PE pair collapsed into a single read
+# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases relative to references
+# Coverage: Average number of bases covering each position in the contig(s)/intervals(s).
+Name Sample Library Contig Size Hits SE PE_1 PE_2 Collapsed M I D Coverage
+Pi1845A * * * 37922 25868 21065 66 498 4239 1330543 1149 1176 35.086308739
+Pi1845A * * gi|58012130|gb|AY894835.1| 37922 25868 21065 66 498 4239 1330543 1149 1176 35.086308739
+#
+#
+Pi1845A Pi1845A * * 37922 25868 21065 66 498 4239 1330543 1149 1176 35.086308739
+Pi1845A Pi1845A * gi|58012130|gb|AY894835.1| 37922 25868 21065 66 498 4239 1330543 1149 1176 35.086308739
+#
+Pi1845A Pi1845A Pi1845A_id_CATAGA * 37922 18717 18717 0 0 0 1026928 768 816 27.0800063288
+Pi1845A Pi1845A Pi1845A_id_CATAGA gi|58012130|gb|AY894835.1| 37922 18717 18717 0 0 0 1026928 768 816 27.0800063288
+#
+Pi1845A Pi1845A Pi1845A_id_CGCTAT * 37922 7151 2348 66 498 4239 303615 381 360 8.00630241021
+Pi1845A Pi1845A Pi1845A_id_CGCTAT gi|58012130|gb|AY894835.1| 37922 7151 2348 66 498 4239 303615 381 360 8.00630241021
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.depths b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.depths
new file mode 100644
index 0000000..b2e585f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.depths
@@ -0,0 +1,29 @@
+# Timestamp: 2013-10-22T17:45:17.920584
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing the
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#
+Name Sample Library Contig Size MaxDepth MD_001 MD_002 MD_003 MD_004 MD_005 MD_006 MD_007 MD_008 MD_009 MD_010 MD_011 MD_012 MD_013 MD_014 MD_015 MD_016 MD_017 MD_018 MD_019 MD_020 MD_021 MD_022 MD_023 MD_024 MD_025 MD_026 MD_027 MD_028 MD_029 MD_030 MD_031 MD_032 MD_033 MD_034 MD_035 MD_036 MD_037 MD_038 MD_039 MD_040 [...]
+Pi1845A * * * 37922 NA 1.0000 1.0000 0.9999 0.9999 0.9999 0.9992 0.9985 0.9952 0.9903 0.9824 0.9726 0.9599 0.9431 0.9229 0.8955 0.8621 0.8258 0.7871 0.7515 0.7131 0.6758 0.6377 0.5999 0.5608 0.5237 0.4914 0.4614 0.4343 0.4085 0.3860 0.3625 0.3415 0.3244 0.3081 0.2948 0.2822 0.2683 0.2571 0.2455 0.2344 [...]
+Pi1845A * * gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 0.9999 0.9999 0.9999 0.9992 0.9985 0.9952 0.9903 0.9824 0.9726 0.9599 0.9431 0.9229 0.8955 0.8621 0.8258 0.7871 0.7515 0.7131 0.6758 0.6377 0.5999 0.5608 0.5237 0.4914 0.4614 0.4343 0.4085 0.3860 0.3625 0.3415 0.3244 0.3081 0.2948 0.2822 0.2683 0.2571 0.2455 0.2344 [...]
+#
+#
+Pi1845A Pi1845A * * 37922 NA 1.0000 1.0000 0.9999 0.9999 0.9999 0.9992 0.9985 0.9952 0.9903 0.9824 0.9726 0.9599 0.9431 0.9229 0.8955 0.8621 0.8258 0.7871 0.7515 0.7131 0.6758 0.6377 0.5999 0.5608 0.5237 0.4914 0.4614 0.4343 0.4085 0.3860 0.3625 0.3415 0.3244 0.3081 0.2948 0.2822 0.2683 0.2571 0.2455 0.2344 [...]
+Pi1845A Pi1845A * gi|58012130|gb|AY894835.1| 37922 NA 1.0000 1.0000 0.9999 0.9999 0.9999 0.9992 0.9985 0.9952 0.9903 0.9824 0.9726 0.9599 0.9431 0.9229 0.8955 0.8621 0.8258 0.7871 0.7515 0.7131 0.6758 0.6377 0.5999 0.5608 0.5237 0.4914 0.4614 0.4343 0.4085 0.3860 0.3625 0.3415 0.3244 0.3081 0.2948 0.2822 0.2683 0.2571 0.2455 0.2344 [...]
+#
+Pi1845A Pi1845A Pi1845A_id_CATAGA * 37922 149 1.0000 1.0000 0.9999 0.9995 0.9993 0.9984 0.9965 0.9916 0.9834 0.9728 0.9554 0.9334 0.9021 0.8669 0.8260 0.7784 0.7277 0.6749 0.6236 0.5696 0.5167 0.4655 0.4232 0.3850 0.3500 0.3208 0.2940 0.2711 0.2500 0.2310 0.2155 0.2010 0.1883 0.1754 0.1634 0.1533 0.1434 0.1345 0.1270 0.1205 [...]
+Pi1845A Pi1845A Pi1845A_id_CATAGA gi|58012130|gb|AY894835.1| 37922 149 1.0000 1.0000 0.9999 0.9995 0.9993 0.9984 0.9965 0.9916 0.9834 0.9728 0.9554 0.9334 0.9021 0.8669 0.8260 0.7784 0.7277 0.6749 0.6236 0.5696 0.5167 0.4655 0.4232 0.3850 0.3500 0.3208 0.2940 0.2711 0.2500 0.2310 0.2155 0.2010 0.1883 0.1754 0.1634 0.1533 0.1434 0.1345 0.1270 0.1205 [...]
+#
+Pi1845A Pi1845A Pi1845A_id_CGCTAT * 37922 152 0.8414 0.7627 0.6529 0.5639 0.4829 0.4113 0.3626 0.3181 0.2772 0.2424 0.2164 0.1936 0.1712 0.1501 0.1314 0.1143 0.0994 0.0887 0.0795 0.0703 0.0633 0.0575 0.0526 0.0490 0.0441 0.0411 0.0378 0.0342 0.0317 0.0290 0.0269 0.0251 0.0238 0.0228 0.0219 0.0212 0.0209 0.0205 0.0199 0.0193 [...]
+Pi1845A Pi1845A Pi1845A_id_CGCTAT gi|58012130|gb|AY894835.1| 37922 152 0.8414 0.7627 0.6529 0.5639 0.4829 0.4113 0.3626 0.3181 0.2772 0.2424 0.2164 0.1936 0.1712 0.1501 0.1314 0.1143 0.0994 0.0887 0.0795 0.0703 0.0633 0.0575 0.0526 0.0490 0.0441 0.0411 0.0378 0.0342 0.0317 0.0290 0.0269 0.0251 0.0238 0.0228 0.0219 0.0212 0.0209 0.0205 0.0199 0.0193 [...]
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/3pGtoA_freq.txt
new file mode 100644
index 0000000..5760a5c
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0393622321873443
+2 0.0248756218905473
+3 0.0225840336134454
+4 0.010275824770146
+5 0.0105652403592182
+6 0.00973499188750676
+7 0.00478468899521531
+8 0.0095
+9 0.0065359477124183
+10 0.0081888246628131
+11 0.00792171481826654
+12 0.00430828147438966
+13 0.00411522633744856
+14 0.00546946216955333
+15 0.00323325635103926
+16 0.00394910048266784
+17 0.00401069518716578
+18 0.00456829602558246
+19 0.00584795321637427
+20 0.00517647058823529
+21 0.00435835351089588
+22 0.00464468183929401
+23 0.00429389312977099
+24 0.00591133004926108
+25 0.00872359963269054
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/5pCtoT_freq.txt
new file mode 100644
index 0000000..a1aca14
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.0681240467717336
+2 0.0587915078933043
+3 0.0422222222222222
+4 0.0306919642857143
+5 0.0225522552255226
+6 0.0103036876355748
+7 0.00461301896463352
+8 0.00556962025316456
+9 0.00884086444007859
+10 0.0066006600660066
+11 0.00604651162790698
+12 0.00333333333333333
+13 0.00814848347668628
+14 0.00276370336250576
+15 0.00602130616025938
+16 0.00475285171102662
+17 0.0042174320524836
+18 0.00625902744342802
+19 0.0055996266915539
+20 0.00562587904360056
+21 0.00611764705882353
+22 0.00378429517502365
+23 0.0028436018957346
+24 0.00292682926829268
+25 0.00191296030607365
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..913d982
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Length_plot.pdf
new file mode 100644
index 0000000..15f9cc9
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Runtime_log.txt
new file mode 100644
index 0000000..4ea92ab
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-22 15:36:51,992 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1845A_id_CATAGA' -i - -d /home/mischu/scratch/bam_pipeline/d7d39a51-0816-40cc-966e-2dbb47546fd0 -r 000_prefixes/Pi_mito.fasta --downsample 100000
+2013-10-22 15:36:55,325 DEBUG main: BAM read in 7.444609 seconds
+2013-10-22 15:36:56,206 INFO main: Successful run
+2013-10-22 15:36:56,206 DEBUG main: Run completed in 8.326392 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..ba6c5c5
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.959361367687374,0.95716840427911
+"2",2,0.940974886852389,0.929583046736047
+"3",3,0.913839141947123,0.889527518813315
+"4",4,0.875167993584308,0.836652068956465
+"5",5,0.820966370208883,0.777103375790119
+"6",6,0.746739446711773,0.721830091349808
+"7",7,0.664806254281007,0.666310979682412
+"8",8,0.607122731566499,0.596601446569972
+"9",9,0.577196655365773,0.517392983103309
+"10",10,0.547101385096657,0.461672566533469
+"11",11,0.502715375464721,0.448190226837224
+"12",12,0.431649996405164,0.473328543672538
+"13",-12,0.272869913627161,0.553817143338728
+"14",-11,0.342010891541965,0.55605457061967
+"15",-10,0.402207831136856,0.572372043280078
+"16",-9,0.460558425645814,0.605015408646168
+"17",-8,0.522591396248845,0.653256044860146
+"18",-7,0.571492599563121,0.720728019684961
+"19",-6,0.606640636145163,0.794214708583718
+"20",-5,0.66213571379121,0.853982574415537
+"21",-4,0.736974680400382,0.899084217313553
+"22",-3,0.797655934618941,0.933552952717182
+"23",-2,0.857188451945215,0.956798306129164
+"24",-1,0.909778317514779,0.971846117533259
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..e563ed8
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..98a162a
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0233879497748914,0.00384296191433956,0.385393824387922,0.379425864231239,1.53329380880119,-7724.11873389179
+"Std.",0.000610256006540237,0.000923301418586215,0.0238397294609559,0.0178869148095453,0.0438289657160833,1.56880185636383
+"Acceptance ratio",0.2471,0.19356,0.21472,0.28494,0.17168,0.71594
+"0%",0.0210080150753116,3.50421521183226e-05,0.291287688453625,0.291422985061004,1.37102915126088,-7736.75922406466
+"2.5%",0.0222255490697626,0.00199142186151454,0.340411042153135,0.343411693864587,1.44806033187435,-7727.98924708017
+"5%",0.0224024403928032,0.00227461937661671,0.347144794831181,0.349349570036093,1.46427783555798,-7727.07411869441
+"7.5%",0.0225218282494182,0.0024782593644658,0.351422069556535,0.353248179299323,1.47278463267993,-7726.55794876096
+"10%",0.0226099334029803,0.00263807037698638,0.354935820030159,0.356234245393017,1.47823293138977,-7726.17213719784
+"12.5%",0.0226929681390539,0.00277623998846736,0.358127597499125,0.359029704589812,1.48460482782598,-7725.8783616029
+"15%",0.0227525504821545,0.00288807076046571,0.360772517385298,0.361060553253254,1.48895732767101,-7725.62269075733
+"17.5%",0.0228142896229523,0.00300019312256911,0.362891679079081,0.362982981779668,1.49261338349655,-7725.41174279916
+"20%",0.0228733455205692,0.00308479470702851,0.365203996238892,0.364567456889457,1.49654418574092,-7725.22219101579
+"22.5%",0.022928650672039,0.00316727890260753,0.367438360864448,0.366076443177077,1.49988580489003,-7725.0527468289
+"25%",0.0229769401578356,0.00324104409583899,0.369149740171227,0.367552860496461,1.50320562576658,-7724.89461468848
+"27.5%",0.0230228749087454,0.00330695092326865,0.370789407371015,0.368922002342573,1.50667513577582,-7724.74545404769
+"30%",0.0230729876979759,0.00337223093934554,0.372509697088793,0.370242709774533,1.50980746264593,-7724.61444540831
+"32.5%",0.023113747726059,0.0034446120306388,0.374150380146292,0.371556103793614,1.51275539885365,-7724.49388096115
+"35%",0.0231547181239559,0.0035089623875028,0.375938686128653,0.372805448414993,1.51566534940719,-7724.37923525866
+"37.5%",0.023191614859881,0.0035715841879179,0.377470432150125,0.37392805195763,1.51816940704429,-7724.2617846847
+"40%",0.0232303003723684,0.00362450182095001,0.379049654641462,0.375157400637113,1.52052307504922,-7724.16793457961
+"42.5%",0.0232728202848384,0.00368587994958948,0.380494083557937,0.376364756558958,1.52331981037254,-7724.0700286283
+"45%",0.0233091833014146,0.00374544156038983,0.381931262359509,0.377510355598906,1.52578735284987,-7723.98088531527
+"47.5%",0.0233463159010742,0.00380076009832865,0.383448724927408,0.378603783784743,1.52854081972666,-7723.88868716761
+"50%",0.0233812562177226,0.00385396939164666,0.384966730536818,0.379842620950672,1.53111584206863,-7723.79817589455
+"52.5%",0.0234225239249504,0.00391619272464769,0.386599195387928,0.380914373530589,1.53412830142722,-7723.71165496721
+"55%",0.0234611138436922,0.00397880257904358,0.38795572739533,0.381863748605375,1.53743000854768,-7723.62809198772
+"57.5%",0.0235015141273147,0.00404031758539143,0.389508448338163,0.382987719568186,1.54073373372509,-7723.55097911208
+"60%",0.0235396376358954,0.00408817699240206,0.391155767130329,0.384045881311063,1.54420798567335,-7723.46795797591
+"62.5%",0.0235798558450135,0.00414504504030186,0.392654844627253,0.385192625647198,1.54724377562573,-7723.38128579153
+"65%",0.0236217148998982,0.00419735617721098,0.39437343860166,0.386489650210882,1.54946792289562,-7723.29865219971
+"67.5%",0.0236637269274995,0.00426323852548051,0.396023624165386,0.387737091264595,1.5527557439834,-7723.22065882423
+"70%",0.0237103757409195,0.00432787364746775,0.397697736280699,0.389087752374826,1.55611107001754,-7723.13896601108
+"72.5%",0.0237535305513657,0.00439415218947424,0.399437989551992,0.390342811506966,1.56004774359422,-7723.05740024616
+"75%",0.0237989101798572,0.00446934605149193,0.401296054326329,0.391744343506692,1.56334023523625,-7722.97852079677
+"77.5%",0.0238462269994488,0.00454544684014968,0.403185280836839,0.393267428383066,1.56724558481389,-7722.89726618717
+"80%",0.023893181358385,0.00463521809689652,0.405203821270682,0.394766769207822,1.57109551045536,-7722.81674653675
+"82.5%",0.0239546115062166,0.00470080409516955,0.407511049378907,0.396223255328585,1.57518601962289,-7722.73991090139
+"85%",0.0240128333296327,0.00478591103581871,0.409753581458439,0.398269268645473,1.57948204071339,-7722.65439236075
+"87.5%",0.0240803898896882,0.00487945857334979,0.412666155760764,0.400119919240341,1.58427795460359,-7722.56004533141
+"90%",0.0241480734812341,0.00500831494069869,0.416148472303591,0.402325159931634,1.58977954310943,-7722.46708527528
+"92.5%",0.0242358161824914,0.00516710438292316,0.420086217242946,0.4048325441756,1.59621792314947,-7722.35887634353
+"95%",0.0243745825787337,0.00533648927057912,0.42603254799372,0.40785392647622,1.60418476972676,-7722.23150016379
+"97.5%",0.0245999593221308,0.00560444211740944,0.433057519687509,0.412730685833178,1.61739786471995,-7722.07416855993
+"100%",0.0262243518215509,0.00730793077777768,0.503712874728317,0.451523004823385,1.7252144890952,-7721.67000834901
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..f647f21
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..5746e09
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/dnacomp.txt
new file mode 100644
index 0000000..7cf39f4
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 955 409 523 971 2858
+* 3p + -69 988 424 539 974 2925
+* 3p + -68 1019 413 525 1031 2988
+* 3p + -67 1021 434 564 1039 3058
+* 3p + -66 1089 427 564 1047 3127
+* 3p + -65 1082 445 581 1093 3201
+* 3p + -64 1117 466 586 1105 3274
+* 3p + -63 1166 463 591 1127 3347
+* 3p + -62 1153 463 604 1185 3405
+* 3p + -61 1214 487 601 1173 3475
+* 3p + -60 1221 481 610 1251 3563
+* 3p + -59 1249 497 632 1249 3627
+* 3p + -58 1221 548 632 1313 3714
+* 3p + -57 1332 550 614 1295 3791
+* 3p + -56 1376 500 633 1363 3872
+* 3p + -55 1383 563 679 1326 3951
+* 3p + -54 1425 515 651 1444 4035
+* 3p + -53 1481 499 663 1482 4125
+* 3p + -52 1474 608 664 1482 4228
+* 3p + -51 1511 598 687 1532 4328
+* 3p + -50 1528 585 728 1573 4414
+* 3p + -49 1572 656 732 1570 4530
+* 3p + -48 1675 606 758 1602 4641
+* 3p + -47 1689 576 752 1771 4788
+* 3p + -46 1776 630 793 1720 4919
+* 3p + -45 1811 630 796 1829 5066
+* 3p + -44 1845 686 803 1869 5203
+* 3p + -43 1966 678 808 1912 5364
+* 3p + -42 2000 703 818 1995 5516
+* 3p + -41 2036 719 885 2049 5689
+* 3p + -40 2124 730 857 2145 5856
+* 3p + -39 2204 718 868 2243 6033
+* 3p + -38 2323 716 888 2276 6203
+* 3p + -37 2334 741 909 2350 6334
+* 3p + -36 2396 802 874 2451 6523
+* 3p + -35 2445 776 908 2559 6688
+* 3p + -34 2546 782 963 2605 6896
+* 3p + -33 2604 864 911 2700 7079
+* 3p + -32 2761 787 1029 2709 7286
+* 3p + -31 2876 811 1079 2744 7510
+* 3p + -30 2907 852 1063 2909 7731
+* 3p + -29 3047 860 1054 3029 7990
+* 3p + -28 3107 947 1091 3139 8284
+* 3p + -27 3244 926 1080 3326 8576
+* 3p + -26 3426 886 1140 3480 8932
+* 3p + -25 3591 994 1167 3595 9347
+* 3p + -24 3513 909 1138 3787 9347
+* 3p + -23 3677 932 1169 3568 9346
+* 3p + -22 3610 1022 1158 3557 9347
+* 3p + -21 3619 1024 1101 3603 9347
+* 3p + -20 3605 980 1175 3587 9347
+* 3p + -19 3537 1062 1220 3528 9347
+* 3p + -18 3661 1015 1200 3471 9347
+* 3p + -17 3686 956 1183 3522 9347
+* 3p + -16 3479 1015 1213 3640 9347
+* 3p + -15 3617 1012 1155 3563 9347
+* 3p + -14 3793 991 1158 3405 9347
+* 3p + -13 3732 950 1196 3469 9347
+* 3p + -12 3684 919 1140 3604 9347
+* 3p + -11 3633 958 1179 3577 9347
+* 3p + -10 3693 986 1102 3566 9347
+* 3p + -9 3693 960 1069 3625 9347
+* 3p + -8 3786 921 1082 3558 9347
+* 3p + -7 3753 927 989 3678 9347
+* 3p + -6 3752 889 995 3711 9347
+* 3p + -5 3832 833 1038 3644 9347
+* 3p + -4 3785 873 1059 3630 9347
+* 3p + -3 3837 826 1057 3627 9347
+* 3p + -2 3928 893 1103 3423 9347
+* 3p + -1 3560 938 1105 3744 9347
+* 3p + 1 1839 1439 670 5399 9347
+* 3p + 2 3339 1036 1311 3659 9345
+* 3p + 3 3792 989 1243 3321 9345
+* 3p + 4 3662 962 1101 3620 9345
+* 3p + 5 3698 1053 1138 3456 9345
+* 3p + 6 3707 996 1270 3372 9345
+* 3p + 7 3597 1026 1154 3568 9345
+* 3p + 8 3708 975 1100 3562 9345
+* 3p + 9 3691 955 1167 3532 9345
+* 3p + 10 3613 991 1157 3583 9344
+* 3p - -70 902 515 398 983 2798
+* 3p - -69 952 524 418 966 2860
+* 3p - -68 962 514 444 1009 2929
+* 3p - -67 995 535 431 1018 2979
+* 3p - -66 1038 536 419 1040 3033
+* 3p - -65 1020 554 426 1098 3098
+* 3p - -64 1095 576 428 1074 3173
+* 3p - -63 1112 557 443 1121 3233
+* 3p - -62 1099 584 482 1130 3295
+* 3p - -61 1180 561 452 1182 3375
+* 3p - -60 1136 607 478 1218 3439
+* 3p - -59 1205 641 468 1198 3512
+* 3p - -58 1255 645 516 1173 3589
+* 3p - -57 1322 647 507 1201 3677
+* 3p - -56 1282 627 536 1305 3750
+* 3p - -55 1284 711 486 1334 3815
+* 3p - -54 1337 670 553 1346 3906
+* 3p - -53 1398 706 550 1333 3987
+* 3p - -52 1359 705 566 1440 4070
+* 3p - -51 1471 692 557 1474 4194
+* 3p - -50 1526 710 564 1490 4290
+* 3p - -49 1520 720 564 1599 4403
+* 3p - -48 1603 748 561 1611 4523
+* 3p - -47 1717 717 615 1596 4645
+* 3p - -46 1713 784 575 1702 4774
+* 3p - -45 1784 762 631 1739 4916
+* 3p - -44 1807 806 639 1789 5041
+* 3p - -43 1871 791 637 1907 5206
+* 3p - -42 1945 792 679 1955 5371
+* 3p - -41 2035 882 632 1994 5543
+* 3p - -40 2079 870 699 2065 5713
+* 3p - -39 2214 857 651 2194 5916
+* 3p - -38 2226 910 727 2246 6109
+* 3p - -37 2315 927 698 2308 6248
+* 3p - -36 2340 933 737 2429 6439
+* 3p - -35 2461 921 757 2488 6627
+* 3p - -34 2562 984 745 2551 6842
+* 3p - -33 2641 953 730 2730 7054
+* 3p - -32 2659 968 814 2819 7260
+* 3p - -31 2822 1017 850 2799 7488
+* 3p - -30 2926 1063 813 2938 7740
+* 3p - -29 3027 1048 823 3072 7970
+* 3p - -28 3149 1096 872 3143 8260
+* 3p - -27 3332 1060 887 3291 8570
+* 3p - -26 3441 1080 948 3469 8938
+* 3p - -25 3576 1083 1017 3693 9369
+* 3p - -24 3613 1148 897 3712 9370
+* 3p - -23 3751 1131 935 3552 9369
+* 3p - -22 3598 1162 996 3614 9370
+* 3p - -21 3660 1115 962 3633 9370
+* 3p - -20 3734 1156 956 3524 9370
+* 3p - -19 3662 1168 997 3543 9370
+* 3p - -18 3595 1166 992 3617 9370
+* 3p - -17 3654 1159 1060 3497 9370
+* 3p - -16 3592 1203 1074 3501 9370
+* 3p - -15 3561 1143 1018 3648 9370
+* 3p - -14 3725 1115 1038 3492 9370
+* 3p - -13 3675 1118 995 3582 9370
+* 3p - -12 3610 1142 952 3666 9370
+* 3p - -11 3652 1092 955 3671 9370
+* 3p - -10 3635 1077 969 3689 9370
+* 3p - -9 3700 1090 913 3667 9370
+* 3p - -8 3752 1052 907 3659 9370
+* 3p - -7 3740 1022 896 3712 9370
+* 3p - -6 3748 986 845 3791 9370
+* 3p - -5 3807 1032 852 3679 9370
+* 3p - -4 3836 1014 816 3704 9370
+* 3p - -3 3886 952 869 3663 9370
+* 3p - -2 3944 1036 933 3457 9370
+* 3p - -1 3514 1170 908 3778 9370
+* 3p - 1 1737 1709 560 5363 9369
+* 3p - 2 3240 1202 1177 3749 9368
+* 3p - 3 3701 1089 1113 3465 9368
+* 3p - 4 3578 1162 982 3646 9368
+* 3p - 5 3644 1119 1012 3593 9368
+* 3p - 6 3610 1123 1043 3592 9368
+* 3p - 7 3637 1107 998 3626 9368
+* 3p - 8 3626 1085 964 3693 9368
+* 3p - 9 3566 1121 967 3714 9368
+* 3p - 10 3683 1132 983 3569 9367
+* 5p + -10 3722 942 1060 3619 9343
+* 5p + -9 3755 916 1064 3608 9343
+* 5p + -8 3691 931 1083 3639 9344
+* 5p + -7 3753 949 1075 3568 9345
+* 5p + -6 3676 962 1020 3687 9345
+* 5p + -5 3700 898 1118 3630 9346
+* 5p + -4 3752 918 1076 3600 9346
+* 5p + -3 3699 980 1080 3587 9346
+* 5p + -2 3834 944 1194 3374 9346
+* 5p + -1 4684 564 1574 2524 9346
+* 5p + 1 3681 968 1072 3626 9347
+* 5p + 2 3493 896 978 3980 9347
+* 5p + 3 3769 887 983 3641 9280
+* 5p + 4 3722 853 945 3827 9347
+* 5p + 5 3779 829 948 3791 9347
+* 5p + 6 3768 875 976 3728 9347
+* 5p + 7 3796 905 985 3661 9347
+* 5p + 8 3770 912 1009 3656 9347
+* 5p + 9 3770 902 1073 3602 9347
+* 5p + 10 3686 963 1098 3600 9347
+* 5p + 11 3681 986 1081 3599 9347
+* 5p + 12 3621 977 1171 3578 9347
+* 5p + 13 3667 1042 1128 3510 9347
+* 5p + 14 3583 1007 1133 3624 9347
+* 5p + 15 3714 1021 1109 3503 9347
+* 5p + 16 3683 953 1175 3536 9347
+* 5p + 17 3650 982 1140 3575 9347
+* 5p + 18 3631 967 1135 3614 9347
+* 5p + 19 3566 957 1136 3688 9347
+* 5p + 20 3586 1006 1086 3669 9347
+* 5p + 21 3686 990 1122 3549 9347
+* 5p + 22 3682 961 1107 3597 9347
+* 5p + 23 3617 958 1132 3640 9347
+* 5p + 24 3662 967 1151 3567 9347
+* 5p + 25 3612 950 1136 3649 9347
+* 5p + 26 3506 940 1078 3408 8932
+* 5p + 27 3309 891 1099 3278 8577
+* 5p + 28 3162 890 1077 3155 8284
+* 5p + 29 3096 872 1007 3015 7990
+* 5p + 30 2952 850 1063 2866 7731
+* 5p + 31 2784 888 1013 2825 7510
+* 5p + 32 2785 846 986 2669 7286
+* 5p + 33 2656 788 1000 2637 7081
+* 5p + 34 2600 833 907 2558 6898
+* 5p + 35 2486 739 927 2536 6688
+* 5p + 36 2460 736 967 2361 6524
+* 5p + 37 2296 775 905 2358 6334
+* 5p + 38 2303 742 862 2297 6204
+* 5p + 39 2255 734 883 2162 6034
+* 5p + 40 2190 713 840 2115 5858
+* 5p + 41 2119 643 849 2081 5692
+* 5p + 42 2044 673 822 1977 5516
+* 5p + 43 1968 677 772 1948 5365
+* 5p + 44 1917 642 807 1837 5203
+* 5p + 45 1861 611 770 1824 5066
+* 5p + 46 1714 647 784 1775 4920
+* 5p + 47 1720 579 770 1721 4790
+* 5p + 48 1657 585 779 1620 4641
+* 5p + 49 1628 572 746 1584 4530
+* 5p + 50 1590 537 715 1573 4415
+* 5p + 51 1570 551 739 1468 4328
+* 5p + 52 1455 589 708 1477 4229
+* 5p + 53 1455 554 665 1452 4126
+* 5p + 54 1364 526 711 1434 4035
+* 5p + 55 1423 545 684 1299 3951
+* 5p + 56 1379 495 672 1326 3872
+* 5p + 57 1290 521 692 1288 3791
+* 5p + 58 1297 517 621 1279 3714
+* 5p + 59 1286 488 629 1225 3628
+* 5p + 60 1252 497 621 1193 3563
+* 5p + 61 1176 486 615 1198 3475
+* 5p + 62 1132 486 614 1175 3407
+* 5p + 63 1169 464 611 1104 3348
+* 5p + 64 1111 456 630 1077 3274
+* 5p + 65 1065 441 604 1092 3202
+* 5p + 66 1077 439 562 1049 3127
+* 5p + 67 1036 454 531 1037 3058
+* 5p + 68 1003 428 546 1013 2990
+* 5p + 69 1011 396 546 974 2927
+* 5p + 70 983 412 528 936 2859
+* 5p - -10 3640 1105 943 3675 9363
+* 5p - -9 3611 1133 896 3724 9364
+* 5p - -8 3624 1029 900 3812 9365
+* 5p - -7 3630 1071 969 3695 9365
+* 5p - -6 3580 1134 937 3715 9366
+* 5p - -5 3609 1054 891 3813 9367
+* 5p - -4 3622 1042 966 3737 9367
+* 5p - -3 3525 1139 972 3732 9368
+* 5p - -2 3702 1137 1001 3529 9369
+* 5p - -1 4657 678 1359 2676 9370
+* 5p - 1 3695 1107 901 3667 9370
+* 5p - 2 3477 1027 880 3985 9369
+* 5p - 3 3665 986 862 3786 9299
+* 5p - 4 3767 965 844 3794 9370
+* 5p - 5 3715 994 794 3867 9370
+* 5p - 6 3808 999 808 3755 9370
+* 5p - 7 3754 1072 852 3692 9370
+* 5p - 8 3779 1062 859 3670 9370
+* 5p - 9 3706 1118 900 3646 9370
+* 5p - 10 3714 1165 890 3600 9369
+* 5p - 11 3639 1171 907 3653 9370
+* 5p - 12 3677 1132 919 3642 9370
+* 5p - 13 3617 1155 979 3619 9370
+* 5p - 14 3573 1174 997 3626 9370
+* 5p - 15 3603 1158 997 3612 9370
+* 5p - 16 3641 1147 992 3590 9370
+* 5p - 17 3566 1153 986 3665 9370
+* 5p - 18 3583 1113 1015 3659 9370
+* 5p - 19 3560 1189 1002 3619 9370
+* 5p - 20 3596 1130 933 3711 9370
+* 5p - 21 3610 1137 974 3649 9370
+* 5p - 22 3642 1157 949 3622 9370
+* 5p - 23 3635 1151 920 3664 9370
+* 5p - 24 3689 1086 959 3636 9370
+* 5p - 25 3673 1147 953 3597 9370
+* 5p - 26 3511 1092 886 3450 8939
+* 5p - 27 3302 1056 916 3297 8571
+* 5p - 28 3129 1095 884 3156 8264
+* 5p - 29 3050 1010 847 3063 7970
+* 5p - 30 2891 1003 842 3004 7740
+* 5p - 31 2799 992 832 2865 7488
+* 5p - 32 2812 962 780 2708 7262
+* 5p - 33 2704 956 788 2607 7055
+* 5p - 34 2600 901 811 2530 6842
+* 5p - 35 2440 884 781 2522 6627
+* 5p - 36 2468 924 728 2321 6441
+* 5p - 37 2359 887 722 2279 6247
+* 5p - 38 2336 917 697 2159 6109
+* 5p - 39 2170 864 698 2185 5917
+* 5p - 40 2051 896 738 2030 5715
+* 5p - 41 2042 873 658 1971 5544
+* 5p - 42 1996 818 672 1888 5374
+* 5p - 43 1864 816 645 1882 5207
+* 5p - 44 1789 805 617 1832 5043
+* 5p - 45 1727 762 634 1795 4918
+* 5p - 46 1715 783 611 1665 4774
+* 5p - 47 1694 719 588 1645 4646
+* 5p - 48 1621 682 635 1587 4525
+* 5p - 49 1595 733 575 1500 4403
+* 5p - 50 1560 688 562 1481 4291
+* 5p - 51 1509 696 556 1433 4194
+* 5p - 52 1440 695 538 1398 4071
+* 5p - 53 1389 684 526 1389 3988
+* 5p - 54 1342 667 534 1365 3908
+* 5p - 55 1322 640 537 1317 3816
+* 5p - 56 1293 625 473 1360 3751
+* 5p - 57 1245 611 561 1261 3678
+* 5p - 58 1199 607 516 1268 3590
+* 5p - 59 1232 615 468 1197 3512
+* 5p - 60 1170 597 492 1181 3440
+* 5p - 61 1178 598 458 1141 3375
+* 5p - 62 1167 559 450 1120 3296
+* 5p - 63 1105 595 454 1079 3233
+* 5p - 64 1099 557 445 1072 3173
+* 5p - 65 1045 564 432 1058 3099
+* 5p - 66 1023 543 435 1034 3035
+* 5p - 67 984 560 394 1041 2979
+* 5p - 68 1022 510 431 966 2929
+* 5p - 69 1028 509 409 915 2861
+* 5p - 70 919 515 412 952 2798
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/dnacomp_genome.csv
new file mode 100644
index 0000000..94d15ba
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.388112441327,0.105690628131,0.117477981119,0.388718949422
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/lgdistribution.txt
new file mode 100644
index 0000000..e433f8f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/lgdistribution.txt
@@ -0,0 +1,152 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 23 6
++ 24 63
++ 25 357
++ 26 352
++ 27 293
++ 28 286
++ 29 253
++ 30 229
++ 31 218
++ 32 211
++ 33 184
++ 34 209
++ 35 161
++ 36 188
++ 37 138
++ 38 156
++ 39 179
++ 40 169
++ 41 176
++ 42 151
++ 43 163
++ 44 138
++ 45 149
++ 46 123
++ 47 150
++ 48 113
++ 49 114
++ 50 89
++ 51 102
++ 52 102
++ 53 85
++ 54 90
++ 55 78
++ 56 85
++ 57 71
++ 58 86
++ 59 65
++ 60 86
++ 61 72
++ 62 59
++ 63 71
++ 64 73
++ 65 73
++ 66 72
++ 67 68
++ 68 68
++ 69 64
++ 70 48
++ 71 73
++ 72 52
++ 73 51
++ 74 41
++ 75 52
++ 76 40
++ 77 52
++ 78 57
++ 79 37
++ 80 44
++ 81 43
++ 82 50
++ 83 39
++ 84 44
++ 85 50
++ 86 39
++ 87 46
++ 88 34
++ 89 28
++ 90 27
++ 91 57
++ 92 96
++ 93 214
++ 94 1520
++ 95 23
++ 96 2
+- 23 3
+- 24 60
+- 25 370
+- 26 366
+- 27 312
+- 28 283
+- 29 232
+- 30 253
+- 31 216
+- 32 220
+- 33 214
+- 34 209
+- 35 190
+- 36 190
+- 37 145
+- 38 196
+- 39 194
+- 40 174
+- 41 164
+- 42 163
+- 43 168
+- 44 130
+- 45 141
+- 46 121
+- 47 136
+- 48 115
+- 49 108
+- 50 100
+- 51 123
+- 52 85
+- 53 79
+- 54 89
+- 55 68
+- 56 77
+- 57 84
+- 58 80
+- 59 71
+- 60 64
+- 61 81
+- 62 63
+- 63 62
+- 64 72
+- 65 63
+- 66 54
+- 67 49
+- 68 69
+- 69 63
+- 70 64
+- 71 49
+- 72 58
+- 73 58
+- 74 56
+- 75 39
+- 76 48
+- 77 37
+- 78 56
+- 79 53
+- 80 53
+- 81 38
+- 82 47
+- 83 49
+- 84 36
+- 85 41
+- 86 40
+- 87 45
+- 88 41
+- 89 20
+- 90 38
+- 91 54
+- 92 83
+- 93 243
+- 94 1434
+- 95 19
+- 96 2
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/misincorporation.txt
new file mode 100644
index 0000000..2251259
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CATAGA/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 3513 943 1109 3782 9347 43 18 33 19 20 62 5 26 16 93 5 10 0 0 0 0 0 0 0 0 0
+* 3p + 2 3868 895 1100 3483 9346 32 6 23 16 7 25 4 16 14 67 1 5 0 0 0 0 0 1 0 0 0
+* 3p + 3 3818 830 1044 3654 9346 20 3 16 10 3 27 5 9 19 36 0 7 0 0 0 0 0 1 0 0 0
+* 3p + 4 3765 874 1045 3661 9345 12 2 13 9 2 14 3 8 10 29 0 0 0 0 0 0 0 1 1 0 0
+* 3p + 5 3805 841 1039 3660 9345 13 3 7 1 3 9 2 8 5 23 1 1 0 0 1 0 1 0 1 0 0
+* 3p + 6 3742 883 1004 3710 9339 11 1 2 6 1 8 0 3 1 6 0 1 1 0 0 0 2 3 3 0 0
+* 3p + 7 3751 921 990 3679 9341 4 1 3 3 3 7 0 0 1 6 0 0 1 3 1 1 2 2 2 0 0
+* 3p + 8 3776 910 1090 3562 9338 9 3 3 8 4 5 1 1 0 7 3 0 1 3 0 0 6 3 0 0 0
+* 3p + 9 3691 946 1074 3626 9337 9 2 3 10 5 7 0 0 0 6 0 0 0 0 0 0 5 4 1 0 0
+* 3p + 10 3678 988 1106 3570 9342 10 3 6 2 2 4 0 2 0 7 0 0 0 1 0 0 2 2 0 1 0
+* 3p + 11 3620 955 1183 3573 9331 9 3 2 4 3 6 0 2 3 4 0 1 0 1 1 1 4 8 2 2 0
+* 3p + 12 3691 914 1138 3594 9337 4 1 3 3 1 6 0 0 1 1 0 1 0 0 0 0 1 5 2 2 0
+* 3p + 13 3734 951 1192 3458 9335 3 4 3 4 1 7 0 2 2 2 0 1 0 0 1 0 4 4 1 3 0
+* 3p + 14 3775 984 1165 3402 9326 11 3 1 6 4 6 0 5 0 8 1 0 0 0 0 0 4 9 3 5 0
+* 3p + 15 3610 1014 1149 3567 9340 5 5 6 3 0 1 0 1 3 6 0 0 1 0 0 0 2 1 2 2 0
+* 3p + 16 3470 1016 1206 3642 9334 4 3 6 0 0 4 0 0 5 9 1 1 0 0 0 0 6 4 1 2 0
+* 3p + 17 3687 952 1182 3518 9339 6 3 4 3 4 7 1 0 1 5 0 1 1 1 0 0 3 4 0 1 0
+* 3p + 18 3645 1016 1206 3468 9335 5 4 0 3 2 8 0 2 0 8 0 2 0 4 0 1 5 6 0 1 0
+* 3p + 19 3535 1056 1225 3524 9340 7 1 3 7 1 4 0 1 1 5 0 1 2 1 0 0 2 4 1 0 0
+* 3p + 20 3600 970 1174 3598 9342 6 3 7 9 2 4 0 0 0 13 0 1 1 3 1 1 2 2 1 0 0
+* 3p + 21 3615 1019 1108 3593 9335 6 5 2 7 2 5 0 2 1 2 0 1 1 0 0 1 3 5 3 1 0
+* 3p + 22 3603 1023 1149 3568 9343 4 6 10 7 2 8 1 3 3 10 0 1 2 4 0 0 1 2 1 0 0
+* 3p + 23 3696 929 1165 3550 9340 5 8 5 7 2 18 0 1 5 7 1 3 5 3 0 0 3 2 0 1 0
+* 3p + 24 3527 889 1138 3785 9339 9 4 11 13 6 35 0 1 4 24 1 8 7 4 2 0 2 2 3 1 0
+* 3p + 25 3612 993 1170 3562 9337 10 15 16 15 8 59 1 8 6 40 5 11 2 3 1 0 5 3 0 2 0
+* 3p + 26 3473 866 1148 3461 8948 15 7 13 19 6 69 1 5 6 36 5 11 8 2 0 0 5 5 2 0 0
+* 3p + 27 3257 910 1102 3313 8582 19 11 12 16 8 54 1 6 8 41 7 14 7 5 0 3 1 3 3 1 0
+* 3p + 28 3124 942 1088 3134 8288 6 13 11 11 12 57 0 4 10 50 3 10 2 7 1 1 1 1 0 1 0
+* 3p + 29 3074 847 1056 3021 7998 12 8 14 14 8 57 0 5 9 37 4 8 5 5 0 2 1 2 2 0 0
+* 3p + 30 2935 841 1061 2902 7739 8 11 14 13 7 38 0 3 10 34 2 10 8 2 0 0 2 4 1 0 0
+* 3p + 31 2887 793 1077 2756 7513 6 9 12 18 7 46 2 1 5 43 4 7 5 2 0 0 3 3 1 1 0
+* 3p + 32 2778 788 1030 2696 7292 6 8 7 11 9 47 4 8 4 34 2 8 6 4 0 1 1 3 0 0 0
+* 3p + 33 2611 847 928 2687 7073 10 8 10 11 8 38 0 4 1 36 6 8 4 4 1 1 3 6 3 0 0
+* 3p + 34 2574 768 961 2585 6888 9 2 12 10 8 44 2 5 5 24 2 6 4 5 2 1 2 6 2 1 0
+* 3p + 35 2453 782 912 2540 6687 13 13 8 6 6 44 1 4 7 27 1 10 8 6 1 3 1 1 0 2 0
+* 3p + 36 2392 796 889 2446 6523 14 6 8 11 8 41 1 6 4 43 2 12 1 2 0 0 3 3 0 1 0
+* 3p + 37 2348 736 918 2336 6338 10 8 6 6 10 45 2 4 5 31 3 8 1 2 3 1 2 2 1 0 0
+* 3p + 38 2333 705 890 2275 6203 7 5 9 12 5 47 0 5 3 42 3 5 6 6 0 1 4 0 0 1 0
+* 3p + 39 2223 723 870 2225 6041 9 9 8 5 4 41 2 3 7 24 3 6 5 6 1 1 2 5 1 0 0
+* 3p + 40 2147 726 866 2126 5865 9 9 7 4 7 39 1 3 2 32 3 9 2 3 1 0 0 4 1 1 0
+* 3p + 41 2055 717 905 2018 5695 12 10 7 5 8 43 1 2 1 26 6 10 5 3 0 0 2 3 0 0 0
+* 3p + 42 2028 699 805 1988 5520 6 4 15 2 5 40 1 4 5 27 1 2 4 2 0 0 1 1 0 3 0
+* 3p + 43 1978 680 812 1895 5365 7 9 7 7 2 41 0 1 5 26 3 4 3 0 0 4 4 2 1 2 0
+* 3p + 44 1849 683 810 1865 5207 8 10 3 6 8 26 0 3 6 29 1 4 4 2 1 2 1 0 2 0 0
+* 3p + 45 1832 616 804 1817 5069 12 5 6 9 8 34 0 1 1 14 0 1 5 0 0 3 1 1 0 1 0
+* 3p + 46 1780 618 794 1728 4920 10 4 8 7 7 15 0 2 3 19 1 2 2 0 1 1 2 2 1 0 0
+* 3p + 47 1682 583 757 1777 4799 6 5 5 4 3 17 1 4 1 21 3 4 3 3 1 1 0 1 0 0 0
+* 3p + 48 1679 597 760 1610 4646 5 6 4 6 6 12 0 2 5 16 0 3 2 3 1 1 4 2 0 0 0
+* 3p + 49 1576 653 735 1567 4531 6 6 7 6 6 16 0 2 2 15 2 5 2 1 0 1 1 1 2 0 0
+* 3p + 50 1525 588 733 1571 4417 5 6 4 4 3 13 0 3 2 13 1 2 2 2 1 1 1 2 0 0 0
+* 3p + 51 1526 596 684 1522 4328 2 10 4 6 5 15 0 3 4 9 2 2 1 1 0 1 0 2 1 0 0
+* 3p + 52 1480 609 667 1474 4230 3 6 3 3 4 15 0 6 2 12 1 3 1 4 0 1 1 1 0 0 0
+* 3p + 53 1487 500 662 1478 4127 1 5 4 6 4 14 1 1 0 8 2 1 2 2 1 0 1 0 0 0 0
+* 3p + 54 1432 510 650 1450 4042 1 4 3 6 3 9 0 0 1 10 0 0 3 2 1 1 1 0 0 1 0
+* 3p + 55 1384 560 686 1321 3951 5 2 0 3 4 8 1 2 0 6 0 4 3 0 0 0 1 1 0 1 0
+* 3p + 56 1393 502 637 1340 3872 2 6 1 5 0 19 0 1 2 7 0 3 3 0 1 0 1 2 0 0 0
+* 3p + 57 1314 545 616 1314 3789 3 4 1 4 3 6 1 1 2 13 0 3 1 1 0 0 0 0 2 2 0
+* 3p + 58 1237 547 627 1305 3716 3 1 4 1 1 5 1 0 0 7 0 2 2 0 0 0 0 1 1 0 0
+* 3p + 59 1244 498 631 1257 3630 4 3 1 1 3 6 0 1 1 7 0 0 2 2 0 0 1 1 0 0 0
+* 3p + 60 1226 481 610 1248 3565 1 2 5 2 3 7 0 2 1 4 0 0 1 3 0 2 0 1 0 2 0
+* 3p + 61 1221 488 597 1176 3482 1 4 3 4 2 5 1 1 2 3 0 1 3 0 1 0 0 0 0 0 0
+* 3p + 62 1158 463 606 1179 3406 2 2 1 2 3 7 0 2 0 6 1 0 1 0 1 0 2 0 0 0 0
+* 3p + 63 1170 466 586 1125 3347 1 5 4 3 2 11 1 1 1 7 0 1 1 1 1 0 0 1 0 0 0
+* 3p + 64 1118 461 592 1102 3273 3 5 3 2 3 5 0 1 1 5 0 1 0 0 0 2 0 2 2 1 0
+* 3p + 65 1090 447 577 1088 3202 1 8 0 3 2 5 0 1 0 6 1 0 1 0 0 0 0 1 0 1 0
+* 3p + 66 1089 430 569 1044 3132 2 4 2 2 3 2 0 1 0 4 1 1 0 1 1 0 0 1 0 0 0
+* 3p + 67 1017 436 564 1044 3061 5 2 3 3 1 3 0 1 1 7 0 2 1 0 0 0 0 0 0 1 0
+* 3p + 68 1016 414 528 1034 2992 3 4 1 4 0 2 0 2 0 4 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 69 989 428 534 973 2924 1 5 1 1 2 3 0 1 2 5 1 0 1 1 0 0 0 0 0 0 0
+* 3p + 70 952 403 521 984 2860 1 2 1 3 1 3 0 1 3 5 0 2 1 2 0 0 0 0 0 0 0
+* 3p - 1 3457 1153 898 3862 9370 36 7 23 31 14 45 11 14 23 89 4 7 0 0 0 0 0 0 0 0 1
+* 3p - 2 3901 1047 910 3508 9366 18 4 17 13 6 32 10 17 21 60 0 7 0 0 0 0 3 0 1 0 1
+* 3p - 3 3862 949 860 3697 9368 23 4 13 15 7 27 6 9 14 39 0 1 0 0 0 0 0 2 0 0 0
+* 3p - 4 3816 1035 804 3706 9361 7 8 12 5 2 13 7 13 4 25 0 5 1 1 0 0 3 5 0 1 0
+* 3p - 5 3784 1028 854 3694 9360 7 1 6 4 8 13 2 6 1 29 1 3 2 0 0 0 7 3 0 0 0
+* 3p - 6 3742 981 845 3793 9361 7 3 6 6 5 8 0 4 1 11 0 3 3 4 1 0 2 5 1 1 0
+* 3p - 7 3733 1017 891 3715 9356 5 5 6 10 1 4 0 3 3 9 0 2 1 3 0 0 4 5 2 3 0
+* 3p - 8 3737 1057 910 3657 9361 10 9 7 1 4 3 1 2 3 8 0 2 1 3 0 0 5 1 3 0 0
+* 3p - 9 3695 1087 915 3666 9363 4 3 2 6 2 10 0 3 0 12 0 0 2 1 0 0 2 5 0 0 0
+* 3p - 10 3631 1077 970 3690 9368 7 5 1 4 2 4 0 1 2 6 0 0 1 1 0 0 0 0 0 2 0
+* 3p - 11 3642 1091 963 3665 9361 8 4 2 2 2 9 0 0 0 7 1 1 1 1 0 0 4 4 0 1 0
+* 3p - 12 3611 1132 951 3667 9361 5 1 3 11 1 7 1 1 2 6 0 1 1 1 0 0 3 5 1 0 0
+* 3p - 13 3665 1117 995 3583 9360 6 6 6 5 1 2 0 2 2 4 0 1 2 4 0 1 4 1 4 1 0
+* 3p - 14 3733 1108 1029 3488 9358 1 2 6 5 4 2 1 1 5 1 0 4 2 3 0 0 3 6 2 1 0
+* 3p - 15 3554 1142 1016 3648 9360 2 3 3 3 1 3 0 3 2 2 0 2 2 2 0 0 3 4 2 1 0
+* 3p - 16 3589 1204 1073 3489 9355 5 4 3 3 3 7 0 1 1 5 0 1 0 1 2 0 3 9 1 2 0
+* 3p - 17 3653 1161 1062 3486 9362 3 6 5 2 1 2 0 0 1 5 0 4 1 1 1 0 2 5 1 0 0
+* 3p - 18 3597 1165 983 3617 9362 5 5 14 6 0 9 0 3 0 8 1 0 2 2 0 0 3 4 0 1 0
+* 3p - 19 3668 1162 998 3539 9367 6 3 6 6 3 10 1 0 1 4 0 2 2 2 0 1 1 1 1 0 0
+* 3p - 20 3734 1159 951 3518 9362 5 3 6 3 1 2 0 3 2 5 0 0 4 3 0 0 2 5 0 1 0
+* 3p - 21 3669 1111 957 3627 9364 3 3 7 5 1 11 1 0 1 5 0 0 1 4 0 2 1 4 1 0 0
+* 3p - 22 3598 1159 1004 3603 9364 6 7 4 4 4 12 0 0 2 5 0 5 2 1 1 1 2 2 1 1 0
+* 3p - 23 3761 1132 931 3543 9367 4 6 6 7 3 16 1 1 2 11 0 3 1 5 0 2 1 1 0 0 0
+* 3p - 24 3620 1148 892 3705 9365 3 10 11 11 6 35 0 8 4 25 1 7 5 7 1 0 1 3 1 0 0
+* 3p - 25 3597 1072 1008 3688 9365 9 13 12 13 13 48 1 4 11 49 1 9 4 3 0 1 1 1 2 1 0
+* 3p - 26 3450 1063 943 3504 8960 17 4 16 13 8 47 0 7 15 54 0 9 5 4 0 1 2 2 1 2 0
+* 3p - 27 3307 1063 899 3311 8580 16 9 13 9 8 35 2 9 10 56 3 12 3 8 0 2 3 2 2 0 0
+* 3p - 28 3142 1076 877 3170 8265 19 7 11 15 12 46 2 3 10 55 2 11 3 4 0 0 4 0 1 0 0
+* 3p - 29 3014 1043 835 3086 7978 15 7 10 11 13 39 0 8 7 51 0 8 4 2 1 3 3 2 0 0 0
+* 3p - 30 2925 1055 821 2940 7741 11 9 8 14 6 53 3 5 4 49 1 9 4 4 0 0 4 3 4 1 0
+* 3p - 31 2858 1002 853 2779 7492 9 6 14 12 9 52 2 5 6 33 6 10 7 6 0 1 4 2 1 1 0
+* 3p - 32 2667 961 831 2810 7269 21 9 9 12 8 53 2 3 2 38 2 10 7 4 0 0 1 4 0 1 0
+* 3p - 33 2652 944 728 2725 7049 6 7 8 10 9 38 3 5 3 40 1 9 7 2 0 1 5 1 2 3 0
+* 3p - 34 2573 979 749 2533 6834 7 8 12 6 10 40 2 7 5 36 3 10 4 6 2 3 3 3 5 2 0
+* 3p - 35 2460 916 757 2489 6622 12 10 15 12 7 35 1 4 7 33 0 9 7 6 3 2 7 3 1 2 0
+* 3p - 36 2346 926 730 2435 6437 6 9 8 15 10 38 3 6 7 46 0 8 4 4 3 2 4 3 2 2 0
+* 3p - 37 2310 917 711 2318 6256 10 6 6 12 9 45 1 5 5 46 1 15 9 3 0 4 0 3 0 2 0
+* 3p - 38 2242 903 730 2239 6114 11 8 13 13 9 47 2 8 5 43 3 7 6 4 0 0 0 4 0 1 0
+* 3p - 39 2226 846 650 2198 5920 11 6 13 4 13 41 0 5 8 41 4 8 3 4 2 1 1 1 1 1 0
+* 3p - 40 2096 862 688 2072 5718 10 3 9 9 7 37 1 3 8 33 2 6 5 6 1 3 2 3 2 3 0
+* 3p - 41 2037 884 640 1980 5541 10 3 11 6 2 33 1 7 6 26 0 8 6 7 0 1 2 6 1 2 0
+* 3p - 42 1960 786 673 1962 5381 7 8 9 8 5 31 2 1 7 26 0 5 4 5 0 0 1 4 1 1 0
+* 3p - 43 1891 789 630 1908 5218 3 7 11 10 6 29 1 5 6 29 0 3 2 4 2 1 2 0 2 0 0
+* 3p - 44 1830 803 630 1791 5054 5 4 7 5 3 23 1 2 8 12 2 2 2 4 1 1 1 1 0 1 0
+* 3p - 45 1796 760 626 1737 4919 6 10 8 7 6 21 0 1 5 17 0 4 3 3 1 1 0 3 0 2 0
+* 3p - 46 1714 792 572 1701 4779 7 7 6 6 2 23 3 3 3 7 0 3 3 4 1 0 2 1 0 1 0
+* 3p - 47 1734 710 619 1589 4652 8 3 4 4 6 17 1 2 5 16 0 3 3 2 1 0 1 1 0 3 0
+* 3p - 48 1603 754 555 1615 4527 4 7 6 4 4 20 0 1 5 18 1 4 1 5 0 0 1 1 0 0 0
+* 3p - 49 1543 714 557 1594 4408 4 7 8 5 5 15 0 0 5 8 1 2 0 0 1 0 0 1 1 1 0
+* 3p - 50 1520 711 569 1498 4298 2 4 6 3 5 6 1 3 2 15 0 9 4 0 1 0 0 0 1 0 0
+* 3p - 51 1483 684 552 1481 4200 3 1 6 4 0 12 0 0 2 8 0 3 2 2 0 0 1 0 1 0 0
+* 3p - 52 1364 720 564 1427 4075 5 9 5 3 2 10 1 4 3 10 0 2 1 0 1 0 0 1 0 0 0
+* 3p - 53 1396 707 548 1339 3990 2 6 5 3 2 11 0 2 5 12 0 3 1 1 0 0 1 1 0 0 0
+* 3p - 54 1335 670 554 1349 3908 8 0 3 5 2 7 1 6 3 9 0 3 1 0 0 0 1 2 0 0 0
+* 3p - 55 1286 709 487 1340 3822 3 5 5 2 4 6 0 3 1 9 2 1 1 2 0 1 1 1 0 0 0
+* 3p - 56 1286 628 539 1300 3753 1 1 3 0 1 7 0 0 0 6 1 1 0 1 0 0 0 1 0 0 0
+* 3p - 57 1317 650 504 1203 3674 5 5 3 2 0 7 1 0 2 4 1 1 1 2 0 1 3 0 0 0 0
+* 3p - 58 1252 645 520 1173 3590 2 3 1 3 1 8 0 1 2 9 2 1 0 0 0 2 0 2 0 0 0
+* 3p - 59 1199 638 476 1198 3511 4 4 3 2 4 3 0 2 2 7 1 1 0 3 0 2 1 1 1 0 0
+* 3p - 60 1139 608 481 1213 3441 2 4 1 0 0 6 0 0 0 8 0 3 0 2 0 2 0 0 0 0 0
+* 3p - 61 1184 569 456 1170 3379 2 7 1 1 1 8 0 1 0 2 1 0 2 1 2 1 0 0 0 0 0
+* 3p - 62 1086 595 481 1136 3298 4 5 3 1 0 2 2 0 0 6 1 0 1 3 1 1 0 0 0 0 0
+* 3p - 63 1118 550 443 1124 3235 1 3 3 3 2 4 0 0 0 4 1 0 1 2 0 0 0 0 0 0 0
+* 3p - 64 1097 571 426 1080 3174 2 1 3 4 1 4 0 1 1 7 1 2 0 3 0 0 0 0 0 0 0
+* 3p - 65 1022 551 428 1099 3100 2 2 2 3 2 7 0 1 2 3 1 1 0 0 0 0 1 0 0 0 0
+* 3p - 66 1050 534 411 1039 3034 0 3 5 1 1 5 0 0 2 5 2 0 0 1 0 0 0 0 0 0 0
+* 3p - 67 1005 535 429 1012 2981 1 3 3 1 2 5 0 0 1 2 1 1 1 0 0 0 0 1 0 0 0
+* 3p - 68 963 519 453 999 2934 1 7 3 0 2 5 0 1 2 3 2 1 0 1 0 0 0 0 0 0 0
+* 3p - 69 969 519 413 961 2862 1 5 4 1 3 7 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0
+* 3p - 70 898 524 393 988 2803 0 5 4 1 1 1 0 1 0 6 0 0 1 1 0 1 0 0 0 0 0
+* 5p + 1 3729 908 1106 3604 9347 78 63 84 85 79 412 10 56 52 393 25 77 0 0 0 0 0 0 0 0 15
+* 5p + 2 3666 853 1015 3803 9337 43 54 58 50 54 376 6 22 33 246 23 69 12 10 4 6 2 3 0 5 1
+* 5p + 3 3823 845 993 3594 9255 43 43 42 45 39 183 3 17 34 142 14 31 26 18 6 9 6 11 6 3 0
+* 5p + 4 3767 840 964 3750 9321 27 28 26 20 23 113 1 14 20 70 10 24 30 12 5 6 8 8 6 3 0
+* 5p + 5 3804 825 955 3720 9304 14 24 9 19 14 73 2 9 9 51 7 9 26 12 6 8 13 17 8 5 0
+* 5p + 6 3771 852 975 3701 9299 10 9 13 10 7 46 2 3 4 32 2 8 13 11 1 9 12 19 10 7 0
+* 5p + 7 3788 893 992 3646 9319 10 3 7 9 6 31 1 3 3 30 2 4 15 14 2 1 9 13 2 4 0
+* 5p + 8 3771 908 1008 3633 9320 5 3 8 6 5 22 2 4 2 15 0 3 8 12 1 3 8 11 6 2 0
+* 5p + 9 3757 906 1079 3589 9331 6 8 3 4 6 10 0 3 1 14 1 3 6 4 0 0 4 10 1 1 0
+* 5p + 10 3678 963 1088 3606 9335 1 9 4 7 2 8 0 0 3 13 0 2 5 3 1 1 4 5 1 2 0
+* 5p + 11 3676 978 1080 3602 9336 3 5 3 4 1 5 0 1 0 6 0 0 1 3 2 0 4 3 2 2 0
+* 5p + 12 3618 971 1164 3580 9333 4 2 5 5 2 6 0 3 5 6 2 0 3 0 0 0 2 5 3 4 0
+* 5p + 13 3685 1040 1117 3494 9336 7 7 4 4 4 3 0 3 3 2 0 0 0 0 0 0 4 5 0 2 0
+* 5p + 14 3560 1010 1126 3634 9330 6 2 5 4 3 4 0 2 3 5 0 2 1 1 0 0 6 4 3 4 0
+* 5p + 15 3711 1006 1106 3508 9331 4 3 6 5 3 5 0 0 0 5 2 0 1 0 1 0 4 8 1 3 0
+* 5p + 16 3672 957 1178 3527 9334 8 7 2 5 4 6 0 3 1 8 0 0 0 3 1 0 4 7 1 1 0
+* 5p + 17 3641 982 1142 3571 9336 9 4 4 6 0 3 0 1 1 8 0 0 1 0 0 0 4 5 2 0 0
+* 5p + 18 3620 961 1133 3624 9338 2 4 8 4 4 3 0 0 2 5 0 0 0 0 0 0 5 4 0 0 0
+* 5p + 19 3570 949 1137 3686 9342 2 7 2 9 2 4 0 1 0 5 0 0 0 1 0 0 1 3 0 1 0
+* 5p + 20 3581 1000 1092 3670 9343 9 4 5 4 1 7 2 0 0 2 1 1 0 2 0 1 2 1 1 0 0
+* 5p + 21 3680 989 1122 3552 9343 7 4 5 5 2 4 1 1 2 6 0 0 1 2 0 1 3 0 1 0 0
+* 5p + 22 3693 957 1099 3594 9343 2 4 3 6 1 6 0 0 2 5 1 0 1 3 0 0 3 1 0 0 0
+* 5p + 23 3603 961 1127 3654 9345 6 3 6 6 1 2 0 2 1 6 0 1 3 0 0 0 1 1 0 0 0
+* 5p + 24 3652 972 1151 3571 9346 11 3 6 4 0 3 1 3 0 6 1 1 1 4 0 0 1 0 0 0 0
+* 5p + 25 3616 940 1143 3648 9347 9 0 5 4 1 17 0 1 1 5 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 26 3523 951 1084 3399 8957 9 4 6 3 4 5 2 3 2 11 0 0 1 1 0 0 0 2 1 0 0
+* 5p + 27 3310 899 1112 3270 8591 13 9 1 4 3 4 0 1 3 7 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 28 3164 884 1070 3172 8290 5 6 11 3 2 6 0 0 1 8 0 3 0 0 0 0 1 0 0 0 0
+* 5p + 29 3099 868 1011 3023 8001 3 7 2 5 2 10 1 1 1 4 1 1 1 0 0 0 0 2 0 0 0
+* 5p + 30 2959 855 1058 2873 7745 3 4 7 3 0 6 0 0 3 5 0 1 0 0 0 0 0 1 0 0 0
+* 5p + 31 2796 890 1014 2819 7519 1 3 1 3 0 7 0 1 1 4 1 1 1 1 0 0 1 0 1 0 0
+* 5p + 32 2773 846 996 2680 7295 6 4 2 3 0 2 0 2 3 8 0 1 0 1 0 0 0 0 1 0 0
+* 5p + 33 2662 787 996 2642 7087 3 5 3 5 1 5 0 0 1 3 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 34 2585 831 911 2573 6900 9 5 3 5 1 1 0 1 4 4 0 3 1 0 0 0 0 1 0 0 0
+* 5p + 35 2504 740 925 2520 6689 4 4 5 1 2 6 0 1 2 7 0 1 0 2 0 0 2 0 0 0 0
+* 5p + 36 2452 737 965 2375 6529 2 4 5 6 1 5 0 1 1 5 0 2 0 1 1 0 0 1 0 0 0
+* 5p + 37 2307 772 906 2358 6343 4 3 1 8 1 6 2 1 0 6 0 0 1 1 0 1 0 1 0 0 0
+* 5p + 38 2295 748 861 2304 6208 7 2 5 4 0 5 0 4 1 5 0 2 1 0 0 0 0 1 0 0 0
+* 5p + 39 2239 736 894 2179 6048 3 4 1 4 1 4 0 1 1 4 0 2 1 0 0 0 0 2 0 0 0
+* 5p + 40 2204 720 837 2111 5872 5 3 3 1 1 1 1 3 2 3 0 0 2 0 0 0 0 0 1 0 0
+* 5p + 41 2121 640 852 2089 5702 4 3 2 5 1 1 0 0 0 11 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 42 2051 675 827 1972 5525 4 6 3 4 2 7 0 1 0 2 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 43 1967 667 775 1966 5375 4 0 1 3 0 3 0 3 0 5 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 44 1913 650 807 1840 5210 3 5 2 2 1 2 1 2 2 8 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 45 1863 620 765 1824 5072 3 3 4 2 0 1 0 3 1 5 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 46 1709 643 788 1786 4926 4 3 0 4 0 2 1 0 0 5 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 47 1716 582 773 1731 4802 5 4 1 4 0 1 0 0 0 2 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 48 1654 590 782 1626 4652 5 4 2 4 0 0 0 0 1 1 0 0 2 0 0 0 0 0 0 0 0
+* 5p + 49 1630 567 755 1583 4535 8 1 2 2 0 2 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 50 1578 541 727 1574 4420 9 3 0 2 1 0 0 1 0 1 0 0 0 1 1 0 1 0 0 0 0
+* 5p + 51 1574 547 736 1473 4330 3 0 3 3 0 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0
+* 5p + 52 1457 588 705 1482 4232 2 1 2 4 0 3 0 0 1 3 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 53 1450 557 663 1458 4128 1 6 1 0 0 2 0 0 0 4 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 54 1376 520 720 1427 4043 3 2 0 1 1 2 0 1 0 3 0 0 1 1 0 0 1 0 0 0 0
+* 5p + 55 1424 542 677 1310 3953 2 1 3 4 2 0 0 0 0 3 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 56 1366 508 675 1326 3875 1 3 1 0 0 2 0 0 0 1 0 1 1 0 0 0 0 0 0 0 0
+* 5p + 57 1303 519 694 1277 3793 3 3 0 2 1 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 58 1292 513 626 1287 3718 1 1 1 4 1 1 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 59 1295 491 626 1221 3633 3 1 3 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 60 1249 497 621 1201 3568 1 5 1 2 0 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 61 1176 483 624 1198 3481 6 0 0 2 1 0 0 0 0 2 0 0 0 1 0 0 0 1 0 0 0
+* 5p + 62 1125 482 619 1182 3408 5 0 3 3 0 0 0 1 0 1 0 0 0 2 0 1 1 1 0 0 0
+* 5p + 63 1177 466 609 1097 3349 1 0 1 1 1 1 0 2 1 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 64 1108 457 629 1083 3277 3 0 2 3 0 0 1 0 0 0 0 2 1 0 0 0 1 0 0 0 0
+* 5p + 65 1069 440 602 1094 3205 3 2 3 1 1 0 0 0 0 1 0 2 0 1 0 0 0 0 0 0 0
+* 5p + 66 1072 439 574 1048 3133 7 3 1 0 4 2 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 67 1043 451 534 1034 3062 1 1 1 1 2 0 0 1 1 0 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 68 1002 425 540 1026 2993 1 1 1 3 3 2 0 0 0 3 1 0 0 1 0 0 0 0 0 0 0
+* 5p + 69 1007 387 551 982 2927 3 0 0 4 1 4 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 70 979 411 533 938 2861 3 3 1 4 1 2 0 1 0 3 0 0 0 1 0 0 0 0 0 0 0
+* 5p - 1 3697 1059 897 3717 9370 76 71 88 91 90 382 18 63 65 419 19 72 0 0 0 0 0 0 0 0 20
+* 5p - 2 3597 984 877 3908 9366 50 54 82 62 60 324 4 35 57 259 16 73 6 5 2 3 1 0 0 2 9
+* 5p - 3 3722 955 855 3761 9293 27 33 36 41 43 199 8 20 43 172 6 45 8 8 3 3 2 0 4 0 1
+* 5p - 4 3770 952 842 3779 9343 20 27 23 26 26 106 5 11 17 97 5 19 9 16 5 8 11 5 4 7 0
+* 5p - 5 3720 993 787 3827 9327 12 17 15 15 9 66 3 9 9 55 3 8 21 20 7 8 10 15 9 9 0
+* 5p - 6 3803 992 809 3725 9329 6 10 7 5 9 32 1 4 8 33 1 14 27 18 2 5 10 18 7 6 0
+* 5p - 7 3747 1058 851 3684 9340 3 6 6 7 4 18 2 1 3 19 0 2 24 16 1 7 13 6 5 6 0
+* 5p - 8 3769 1067 857 3655 9348 11 8 8 5 0 20 1 5 2 20 3 1 13 9 3 3 8 7 4 3 0
+* 5p - 9 3692 1130 894 3636 9352 5 10 11 5 1 14 2 4 3 15 1 4 11 9 3 1 6 7 2 3 0
+* 5p - 10 3682 1158 893 3613 9346 10 5 6 5 1 4 0 1 2 9 0 0 6 8 0 4 7 12 2 2 0
+* 5p - 11 3635 1172 904 3641 9352 3 8 4 4 1 5 0 1 2 4 0 1 3 8 1 2 4 12 1 1 0
+* 5p - 12 3667 1129 919 3637 9352 10 5 4 5 2 2 1 2 2 5 0 1 2 4 0 1 3 11 2 2 0
+* 5p - 13 3605 1169 975 3614 9363 2 11 4 3 0 3 0 3 2 3 0 0 1 4 0 0 4 2 0 1 0
+* 5p - 14 3603 1161 995 3598 9357 5 4 6 9 2 4 0 0 1 6 0 0 0 2 0 1 3 5 5 0 0
+* 5p - 15 3621 1153 987 3597 9358 4 10 8 3 2 5 0 0 3 8 0 0 3 1 1 0 5 6 1 0 0
+* 5p - 16 3608 1147 984 3622 9361 5 3 1 3 1 4 0 0 1 6 0 1 2 0 0 0 5 1 0 3 0
+* 5p - 17 3558 1152 989 3667 9366 7 5 3 5 2 9 0 2 0 4 0 0 1 0 0 0 1 3 0 0 0
+* 5p - 18 3579 1116 1015 3653 9363 6 9 4 3 1 6 0 0 0 13 0 0 5 2 0 0 1 2 4 0 0
+* 5p - 19 3547 1194 1004 3617 9362 7 5 7 5 1 6 0 1 1 6 0 1 0 2 0 0 3 3 0 2 0
+* 5p - 20 3595 1133 936 3699 9363 8 8 4 2 1 3 0 0 1 1 0 1 2 2 0 0 0 5 2 0 0
+* 5p - 21 3625 1136 973 3633 9367 4 9 5 4 3 4 0 0 3 3 0 3 2 1 1 0 1 2 0 0 0
+* 5p - 22 3637 1157 959 3614 9367 5 4 5 3 3 9 1 4 0 4 1 4 2 3 0 0 1 2 0 0 0
+* 5p - 23 3636 1149 918 3665 9368 5 3 4 3 3 7 3 2 0 6 0 3 1 4 0 0 0 2 0 0 0
+* 5p - 24 3701 1078 968 3620 9367 7 3 4 7 1 7 1 1 1 13 1 3 0 4 0 0 1 2 0 0 0
+* 5p - 25 3659 1151 956 3600 9366 8 4 6 6 1 8 0 2 1 14 0 1 0 0 0 0 0 3 1 0 0
+* 5p - 26 3495 1084 889 3499 8967 6 2 3 4 2 6 0 2 1 8 0 0 1 2 0 0 1 1 0 0 0
+* 5p - 27 3303 1066 910 3308 8587 3 6 6 4 0 5 2 3 2 12 0 1 2 1 0 0 1 0 0 0 0
+* 5p - 28 3142 1091 877 3163 8273 1 2 6 12 1 6 3 2 1 5 1 1 0 1 0 0 0 0 0 0 0
+* 5p - 29 3050 1016 844 3073 7983 6 6 2 6 0 4 1 0 0 7 0 0 1 1 0 0 0 1 0 0 0
+* 5p - 30 2927 1007 839 2978 7751 2 3 7 6 0 6 0 0 1 3 1 0 1 1 0 0 1 1 0 0 0
+* 5p - 31 2798 989 838 2872 7497 5 0 5 4 0 7 1 0 1 7 0 3 0 2 0 0 1 2 0 0 0
+* 5p - 32 2805 958 790 2723 7276 8 3 2 4 2 7 0 2 2 5 0 0 0 1 0 0 0 1 0 0 0
+* 5p - 33 2703 964 781 2613 7061 4 2 4 3 1 2 1 2 2 3 0 1 1 0 0 0 0 0 0 0 0
+* 5p - 34 2599 898 812 2537 6846 7 4 2 4 0 4 0 1 1 5 0 0 1 2 0 0 0 0 0 1 0
+* 5p - 35 2443 885 782 2523 6633 2 3 3 2 0 1 0 0 0 8 0 0 2 2 0 0 1 1 0 0 0
+* 5p - 36 2473 925 716 2336 6450 2 4 6 5 2 1 0 1 1 3 0 0 1 3 0 0 0 0 0 0 0
+* 5p - 37 2358 890 734 2278 6260 5 1 4 3 1 5 0 1 1 5 0 1 0 1 1 0 0 0 0 0 0
+* 5p - 38 2334 903 689 2193 6119 3 2 5 7 0 5 1 1 4 7 0 0 1 1 0 0 0 0 0 0 0
+* 5p - 39 2180 863 706 2176 5925 6 0 1 3 1 4 0 2 0 6 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 40 2047 900 730 2053 5730 3 2 3 3 0 4 0 2 2 4 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 41 2022 870 663 1994 5549 6 2 4 4 1 1 0 1 1 7 0 0 0 1 0 0 3 1 0 0 0
+* 5p - 42 2011 814 673 1891 5389 4 1 2 1 1 3 2 1 1 4 0 1 0 1 1 0 1 0 1 0 0
+* 5p - 43 1856 825 652 1890 5223 4 5 1 3 0 2 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 44 1795 806 622 1835 5058 5 0 1 3 0 5 2 0 1 6 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 45 1736 761 632 1796 4925 3 3 2 1 1 8 0 0 1 3 0 0 0 1 0 0 0 1 0 0 0
+* 5p - 46 1726 788 606 1662 4782 3 2 2 4 1 0 0 1 1 6 0 0 0 0 0 0 0 0 1 0 0
+* 5p - 47 1690 722 589 1656 4657 3 1 2 1 0 3 1 2 0 3 0 0 1 0 0 0 1 0 0 1 0
+* 5p - 48 1614 685 643 1589 4531 5 3 2 1 0 3 0 3 1 2 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 49 1590 732 576 1511 4409 3 4 1 3 1 2 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0
+* 5p - 50 1564 691 562 1483 4300 2 8 1 5 2 0 1 1 0 1 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 51 1509 691 561 1441 4202 6 1 3 1 0 1 0 0 0 3 0 0 0 1 0 0 0 0 0 0 0
+* 5p - 52 1443 702 536 1394 4075 4 2 4 2 2 1 0 0 0 2 0 0 0 1 0 0 2 0 0 0 0
+* 5p - 53 1384 691 528 1390 3993 4 3 0 1 0 0 0 0 0 3 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 54 1348 667 538 1359 3912 3 4 1 3 0 2 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0
+* 5p - 55 1320 637 543 1324 3824 4 0 2 1 1 0 0 2 2 2 0 0 0 0 0 0 0 1 0 0 0
+* 5p - 56 1287 627 468 1373 3755 2 0 1 0 2 0 1 0 0 2 0 1 0 1 0 0 0 0 0 0 0
+* 5p - 57 1246 607 563 1262 3678 1 2 1 2 0 0 0 0 1 3 0 1 0 1 0 0 0 0 0 0 0
+* 5p - 58 1202 609 524 1257 3592 5 1 1 2 0 0 0 1 0 1 0 3 0 0 0 0 1 0 0 0 0
+* 5p - 59 1229 611 469 1204 3513 1 1 3 3 1 0 0 0 0 3 0 1 0 2 0 0 1 0 0 0 0
+* 5p - 60 1169 601 489 1183 3442 4 1 5 1 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 61 1169 597 463 1150 3379 7 2 1 2 0 1 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 62 1172 560 449 1118 3299 0 0 1 1 0 1 0 1 0 1 0 0 0 1 0 0 0 0 0 0 0
+* 5p - 63 1109 586 460 1078 3233 5 0 4 1 1 1 0 1 0 1 0 1 0 1 0 0 2 0 0 0 0
+* 5p - 64 1100 557 446 1069 3172 2 1 2 2 1 2 0 0 0 0 0 0 0 2 0 0 1 1 0 0 0
+* 5p - 65 1044 568 427 1062 3101 5 3 3 1 0 0 0 0 0 2 0 0 3 0 0 0 0 0 0 0 0
+* 5p - 66 1022 542 434 1039 3037 1 1 2 0 1 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0
+* 5p - 67 990 557 399 1036 2982 1 2 2 2 2 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 68 1025 509 428 972 2934 1 0 2 0 1 2 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 69 1030 516 405 913 2864 2 2 2 0 1 0 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 70 921 514 418 950 2803 5 1 2 2 0 1 0 0 3 0 0 1 0 0 0 0 0 0 0 0 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/3pGtoA_freq.txt
new file mode 100644
index 0000000..4690ba3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0530401034928849
+2 0.033003300330033
+3 0.0268741159830269
+4 0.0197568389057751
+5 0.0292758089368259
+6 0.0154559505409583
+7 0.0231481481481481
+8 0.00937081659973226
+9 0.0299319727891156
+10 0.0104031209362809
+11 0.0126742712294043
+12 0.00599520383693046
+13 0.00446428571428571
+14 0.00469483568075117
+15 0.00562429696287964
+16 0.00550660792951542
+17 0.00236406619385343
+18 0.0102960102960103
+19 0.0048661800486618
+20 0.00737100737100737
+21 0.00516129032258065
+22 0.00372208436724566
+23 0.00938337801608579
+24 0.00834492350486787
+25 0.0199146514935989
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/5pCtoT_freq.txt
new file mode 100644
index 0000000..b7f1f4f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.0717299578059072
+2 0.0852017937219731
+3 0.0558912386706949
+4 0.0344827586206897
+5 0.0140625
+6 0.0120845921450151
+7 0.0166898470097357
+8 0.0101781170483461
+9 0.0105820105820106
+10 0.00992555831265509
+11 0.00571428571428571
+12 0.00225988700564972
+13 0.00405268490374873
+14 0.0115911485774499
+15 0.00560538116591928
+16 0.0087527352297593
+17 0.00783874580067189
+18 0.0139416983523447
+19 0.00355450236966825
+20 0.0102432778489117
+21 0.00254452926208651
+22 0.00621890547263682
+23 0.0103761348897536
+24 0.0121951219512195
+25 0.0167095115681234
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..6a76c60
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Length_plot.pdf
new file mode 100644
index 0000000..e209198
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Runtime_log.txt
new file mode 100644
index 0000000..20d9cce
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-22 17:28:02,579 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1845A_id_CGCTAT' -i - -d /home/mischu/scratch/bam_pipeline/e8964051-f1bf-4723-b608-e1e631c3cd2c -r 000_prefixes/Pi_mito.fasta --downsample 100000
+2013-10-22 17:28:04,349 DEBUG main: BAM read in 4.034586 seconds
+2013-10-22 17:28:05,105 INFO main: Successful run
+2013-10-22 17:28:05,105 DEBUG main: Run completed in 4.791274 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..4462cf0
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.950501201871091,0.966499901205436
+"2",2,0.935518558020883,0.93096091657042
+"3",3,0.910645970879661,0.86929123723773
+"4",4,0.876763947186254,0.78111095025384
+"5",5,0.840580109760929,0.669803029072403
+"6",6,0.808238518904477,0.55020194611055
+"7",7,0.774311693121228,0.488700315477924
+"8",8,0.732385398508833,0.509416907283791
+"9",9,0.691711837148632,0.540796730547099
+"10",10,0.648329260124314,0.577398562211059
+"11",11,0.571312188579435,0.633563472626428
+"12",12,0.409665419122892,0.692022036622844
+"13",-12,0.644516124356173,0.553857814900273
+"14",-11,0.626432942565481,0.586642059437981
+"15",-10,0.595228177497798,0.629934408501337
+"16",-9,0.573196933417823,0.665434344351903
+"17",-8,0.591568279032739,0.687444660946995
+"18",-7,0.615824024646365,0.722512178467703
+"19",-6,0.643775143879731,0.772353380571888
+"20",-5,0.682659889818568,0.828739294308882
+"21",-4,0.749734947062913,0.879163965609719
+"22",-3,0.819876683147329,0.921424564569947
+"23",-2,0.878614099528805,0.952445572930539
+"24",-1,0.91979245579429,0.972835879351585
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..a7bbe7f
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..fe98434
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0353247262810716,0.0120056631352868,0.68830957920536,0.450531698607925,1.35530165011763,-3588.39463960137
+"Std.",0.00123264859239703,0.00178777749242283,0.0671725262456258,0.0286440689602494,0.0519794670889325,1.59529854648417
+"Acceptance ratio",0.25838,0.20898,0.28128,0.24034,0.20626,0.74362
+"0%",0.029888147224448,0.00488609937116802,0.481461863173078,0.342249232681536,1.15856818551238,-3599.88386940088
+"2.5%",0.0329502426202131,0.00852040724856606,0.564020806511152,0.393870552293629,1.25546506861429,-3592.33322126135
+"5%",0.0333174893910754,0.00907681641020517,0.583523944476592,0.403248502456898,1.27307869018027,-3591.50129561233
+"7.5%",0.0335548237315506,0.0094169459910274,0.596021185874996,0.409341274553994,1.28195867876141,-3590.97529431902
+"10%",0.0337521437413037,0.0096672968176017,0.604334875846859,0.413997549786242,1.28985172782242,-3590.57009093198
+"12.5%",0.0339311684297594,0.00994299969309089,0.612084785192734,0.417492439493557,1.29588183333274,-3590.25259235704
+"15%",0.0340659159382996,0.010157678469132,0.618839958754248,0.420948673033576,1.30121297177646,-3590.00407421107
+"17.5%",0.0341864504612079,0.0103506691526436,0.625336439912267,0.423647351721345,1.30591847135777,-3589.77222778895
+"20%",0.0342948302107096,0.0105284487400531,0.631310685912784,0.426333670808574,1.31083145380692,-3589.56805682028
+"22.5%",0.0343949241393155,0.010687005024058,0.636701105161997,0.429327242602347,1.31507138791644,-3589.39212769839
+"25%",0.0344931365806977,0.0108273933443365,0.641899097720293,0.43163593515298,1.31927898414344,-3589.23069583357
+"27.5%",0.0345920644481567,0.0109558154084488,0.64731896873798,0.433639568236988,1.32327308736114,-3589.07800452081
+"30%",0.0346718449453998,0.0110928187237064,0.651754657709076,0.435600998450303,1.3273401286916,-3588.93559312256
+"32.5%",0.0347579230376398,0.0112184502583839,0.656261192319392,0.437684650642377,1.33079460831211,-3588.80709055579
+"35%",0.0348304098648816,0.0113416259336938,0.6600479589172,0.439769704632638,1.33412413390854,-3588.68482553192
+"37.5%",0.0349078389708162,0.011467737618298,0.66458375602569,0.441634200654078,1.33745587125008,-3588.57203317072
+"40%",0.0349900068418811,0.0115739088775452,0.668871538304025,0.443502284210756,1.34075601639796,-3588.46156569467
+"42.5%",0.035069799668674,0.0116795727685291,0.672787648238453,0.445300751268841,1.34432528976673,-3588.36024826855
+"45%",0.0351468379329859,0.0117871815290473,0.676568257580942,0.44696987523199,1.34752196509927,-3588.26308770728
+"47.5%",0.0352246848374282,0.0118999843403286,0.680665832039293,0.448789793864726,1.35170297047418,-3588.16048011103
+"50%",0.0352996750341185,0.0120100804405603,0.684805897336962,0.450369295745996,1.35525869044293,-3588.06576365923
+"52.5%",0.0353864015489717,0.012115908126892,0.689056390312562,0.452167893576017,1.35829725358393,-3587.97664163814
+"55%",0.0354485459788375,0.0122103584072614,0.693977259475581,0.454038702187109,1.36120027096822,-3587.89062245982
+"57.5%",0.035527496370983,0.0123260149227918,0.698488720940029,0.455891244925344,1.36435776211979,-3587.80203640706
+"60%",0.0356160963300237,0.0124350316878456,0.702694821168892,0.457607991307771,1.36768535515704,-3587.71439470118
+"62.5%",0.0356919133057753,0.0125455154454831,0.706551246398867,0.45940586426271,1.37084481250756,-3587.62824810683
+"65%",0.0357795768077512,0.0126634784572666,0.711315224733285,0.4612585142347,1.37425411748014,-3587.54357181772
+"67.5%",0.0358658995026783,0.0127781280392962,0.715661542922403,0.463078031939274,1.37787881071443,-3587.46033932866
+"70%",0.0359547629512711,0.0129071943161274,0.720313974755412,0.465331621094115,1.38152767176453,-3587.37646748431
+"72.5%",0.0360509986431579,0.013030450077465,0.725549549426627,0.467318173697497,1.38563849656139,-3587.29562985739
+"75%",0.0361407957654486,0.013168405658164,0.731256297034345,0.469729787729288,1.38920292707737,-3587.20948273977
+"77.5%",0.0362324652307982,0.0133378621646039,0.737517326486231,0.472166085501515,1.39349834646824,-3587.12766057701
+"80%",0.036354002010535,0.0135110752482325,0.743550987268797,0.474601907446987,1.39811975158303,-3587.0413784765
+"82.5%",0.0364839704227105,0.0136571660573498,0.750749441121369,0.477636713534998,1.40353147977674,-3586.9533381586
+"85%",0.0366123242144472,0.0138470168706744,0.757609498451292,0.480351134103696,1.40892624577859,-3586.8584273364
+"87.5%",0.0367508799707016,0.0140612885181516,0.765880817608254,0.483473332846736,1.41471400273166,-3586.76698930675
+"90%",0.0369208189809193,0.0143213230995774,0.776298015325283,0.487678970055504,1.42160945477572,-3586.66852869822
+"92.5%",0.0371177432000424,0.0146094648645714,0.787650434212691,0.492109353883736,1.43051176599258,-3586.5607064473
+"95%",0.0373536626623941,0.0149801193246185,0.802733949273462,0.49760826052228,1.44327357214819,-3586.42996262641
+"97.5%",0.0377828946837914,0.0154927199649422,0.827090069375237,0.505756270812937,1.46238185517316,-3586.28169372246
+"100%",0.0404558096993849,0.018923158260011,0.991874636882701,0.55928319829257,1.62679161539921,-3585.90246411386
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..074c3ad
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..1731a7c
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/dnacomp.txt
new file mode 100644
index 0000000..7476ee5
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 139 78 110 146 473
+* 3p + -69 131 109 113 146 499
+* 3p + -68 144 98 130 149 521
+* 3p + -67 158 98 125 163 544
+* 3p + -66 192 90 115 169 566
+* 3p + -65 172 87 148 187 594
+* 3p + -64 181 127 127 181 616
+* 3p + -63 180 122 147 187 636
+* 3p + -62 201 128 139 186 654
+* 3p + -61 227 115 159 172 673
+* 3p + -60 196 115 166 230 707
+* 3p + -59 219 119 173 227 738
+* 3p + -58 218 160 145 234 757
+* 3p + -57 233 122 182 244 781
+* 3p + -56 244 132 202 230 808
+* 3p + -55 259 150 187 242 838
+* 3p + -54 249 173 169 289 880
+* 3p + -53 276 165 222 252 915
+* 3p + -52 296 186 190 281 953
+* 3p + -51 325 201 169 290 985
+* 3p + -50 312 170 217 312 1011
+* 3p + -49 322 192 222 312 1048
+* 3p + -48 336 180 200 369 1085
+* 3p + -47 326 217 232 350 1125
+* 3p + -46 363 197 220 375 1155
+* 3p + -45 395 191 216 398 1200
+* 3p + -44 418 190 277 359 1244
+* 3p + -43 398 224 265 396 1283
+* 3p + -42 434 233 265 389 1321
+* 3p + -41 412 196 298 465 1371
+* 3p + -40 460 227 274 467 1428
+* 3p + -39 536 251 241 468 1496
+* 3p + -38 519 208 297 545 1569
+* 3p + -37 547 238 314 547 1646
+* 3p + -36 616 238 316 555 1725
+* 3p + -35 636 228 302 648 1814
+* 3p + -34 703 224 318 678 1923
+* 3p + -33 685 263 319 739 2006
+* 3p + -32 790 252 298 776 2116
+* 3p + -31 871 260 323 796 2250
+* 3p + -30 937 292 326 817 2372
+* 3p + -29 1016 318 359 865 2558
+* 3p + -28 1077 342 384 951 2754
+* 3p + -27 1164 352 323 1125 2964
+* 3p + -26 1203 343 400 1291 3237
+* 3p + -25 1472 395 387 1379 3633
+* 3p + -24 1373 366 415 1479 3633
+* 3p + -23 1448 361 410 1414 3633
+* 3p + -22 1414 367 455 1397 3633
+* 3p + -21 1413 386 434 1400 3633
+* 3p + -20 1436 383 407 1407 3633
+* 3p + -19 1428 398 426 1381 3633
+* 3p + -18 1413 404 409 1407 3633
+* 3p + -17 1447 404 501 1281 3633
+* 3p + -16 1393 387 510 1343 3633
+* 3p + -15 1333 440 484 1376 3633
+* 3p + -14 1483 416 475 1259 3633
+* 3p + -13 1407 383 496 1347 3633
+* 3p + -12 1344 476 455 1358 3633
+* 3p + -11 1454 407 410 1362 3633
+* 3p + -10 1411 407 428 1386 3632
+* 3p + -9 1408 417 388 1420 3633
+* 3p + -8 1495 303 403 1432 3633
+* 3p + -7 1430 369 368 1466 3633
+* 3p + -6 1478 332 350 1473 3633
+* 3p + -5 1523 319 340 1451 3633
+* 3p + -4 1546 300 365 1422 3633
+* 3p + -3 1591 282 396 1364 3633
+* 3p + -2 1667 294 355 1317 3633
+* 3p + -1 1373 307 446 1507 3633
+* 3p + 1 537 650 239 2207 3633
+* 3p + 2 1188 433 502 1509 3632
+* 3p + 3 1442 412 465 1313 3632
+* 3p + 4 1503 346 442 1340 3631
+* 3p + 5 1465 356 444 1365 3630
+* 3p + 6 1504 359 451 1315 3629
+* 3p + 7 1476 368 425 1359 3628
+* 3p + 8 1495 334 443 1356 3628
+* 3p + 9 1396 326 440 1465 3627
+* 3p + 10 1433 360 412 1422 3627
+* 3p - -70 151 98 58 142 449
+* 3p - -69 148 92 81 148 469
+* 3p - -68 158 103 82 141 484
+* 3p - -67 137 132 80 153 502
+* 3p - -66 116 123 106 173 518
+* 3p - -65 176 126 74 154 530
+* 3p - -64 147 116 102 185 550
+* 3p - -63 155 133 92 194 574
+* 3p - -62 170 132 104 185 591
+* 3p - -61 189 154 110 158 611
+* 3p - -60 207 137 116 172 632
+* 3p - -59 211 138 102 205 656
+* 3p - -58 195 178 122 193 688
+* 3p - -57 176 179 116 236 707
+* 3p - -56 232 180 106 219 737
+* 3p - -55 207 182 142 229 760
+* 3p - -54 235 184 123 253 795
+* 3p - -53 242 194 110 285 831
+* 3p - -52 255 198 141 257 851
+* 3p - -51 278 206 140 259 883
+* 3p - -50 302 229 123 262 916
+* 3p - -49 271 220 151 310 952
+* 3p - -48 291 231 147 325 994
+* 3p - -47 361 227 132 307 1027
+* 3p - -46 354 230 161 320 1065
+* 3p - -45 349 252 168 337 1106
+* 3p - -44 372 209 190 367 1138
+* 3p - -43 418 222 181 367 1188
+* 3p - -42 380 267 197 396 1240
+* 3p - -41 445 229 189 425 1288
+* 3p - -40 440 250 186 474 1350
+* 3p - -39 475 251 208 476 1410
+* 3p - -38 515 236 217 515 1483
+* 3p - -37 514 304 216 501 1535
+* 3p - -36 563 277 227 537 1604
+* 3p - -35 665 264 209 549 1687
+* 3p - -34 614 267 276 616 1773
+* 3p - -33 681 292 262 656 1891
+* 3p - -32 748 311 228 716 2003
+* 3p - -31 719 340 285 775 2119
+* 3p - -30 799 326 304 807 2236
+* 3p - -29 911 298 280 908 2397
+* 3p - -28 976 313 325 975 2589
+* 3p - -27 1092 344 308 1060 2804
+* 3p - -26 1222 360 326 1169 3077
+* 3p - -25 1390 412 329 1386 3517
+* 3p - -24 1362 430 314 1411 3517
+* 3p - -23 1448 399 335 1336 3518
+* 3p - -22 1450 407 351 1310 3518
+* 3p - -21 1382 444 341 1351 3518
+* 3p - -20 1499 377 404 1238 3518
+* 3p - -19 1388 455 397 1278 3518
+* 3p - -18 1320 507 368 1323 3518
+* 3p - -17 1400 435 354 1329 3518
+* 3p - -16 1331 459 411 1317 3518
+* 3p - -15 1258 514 406 1340 3518
+* 3p - -14 1338 459 392 1329 3518
+* 3p - -13 1275 449 411 1383 3518
+* 3p - -12 1284 457 383 1393 3517
+* 3p - -11 1336 396 378 1408 3518
+* 3p - -10 1344 428 325 1420 3517
+* 3p - -9 1344 431 330 1413 3518
+* 3p - -8 1317 374 344 1483 3518
+* 3p - -7 1317 416 277 1508 3518
+* 3p - -6 1392 380 301 1445 3518
+* 3p - -5 1365 342 302 1509 3518
+* 3p - -4 1453 334 305 1426 3518
+* 3p - -3 1380 330 298 1510 3518
+* 3p - -2 1549 358 248 1363 3518
+* 3p - -1 1236 344 329 1609 3518
+* 3p - 1 517 654 166 2181 3518
+* 3p - 2 1143 519 369 1487 3518
+* 3p - 3 1316 422 410 1370 3518
+* 3p - 4 1248 472 448 1350 3518
+* 3p - 5 1422 387 359 1350 3518
+* 3p - 6 1377 394 374 1373 3518
+* 3p - 7 1349 420 351 1398 3518
+* 3p - 8 1308 406 371 1433 3518
+* 3p - 9 1361 408 426 1323 3518
+* 3p - 10 1343 396 344 1435 3518
+* 5p + -10 1464 345 439 1381 3629
+* 5p + -9 1532 367 406 1324 3629
+* 5p + -8 1533 353 435 1310 3631
+* 5p + -7 1526 352 434 1319 3631
+* 5p + -6 1545 335 391 1360 3631
+* 5p + -5 1501 303 416 1411 3631
+* 5p + -4 1561 345 442 1283 3631
+* 5p + -3 1631 335 374 1291 3631
+* 5p + -2 1589 368 438 1238 3633
+* 5p + -1 1758 243 529 1103 3633
+* 5p + 1 1517 376 402 1338 3633
+* 5p + 2 1367 335 360 1571 3633
+* 5p + 3 1483 354 338 1458 3633
+* 5p + 4 1532 338 333 1430 3633
+* 5p + 5 1508 340 335 1450 3633
+* 5p + 6 1531 339 348 1415 3633
+* 5p + 7 1553 328 364 1388 3633
+* 5p + 8 1498 360 377 1398 3633
+* 5p + 9 1546 359 382 1346 3633
+* 5p + 10 1482 338 467 1346 3633
+* 5p + 11 1380 415 446 1392 3633
+* 5p + 12 1442 430 448 1313 3633
+* 5p + 13 1388 462 468 1315 3633
+* 5p + 14 1397 456 430 1350 3633
+* 5p + 15 1342 447 466 1378 3633
+* 5p + 16 1449 445 440 1298 3632
+* 5p + 17 1402 439 437 1355 3633
+* 5p + 18 1445 396 444 1348 3633
+* 5p + 19 1403 406 415 1409 3633
+* 5p + 20 1420 347 406 1460 3633
+* 5p + 21 1453 356 370 1454 3633
+* 5p + 22 1428 374 432 1399 3633
+* 5p + 23 1421 376 425 1411 3633
+* 5p + 24 1496 355 386 1396 3633
+* 5p + 25 1430 364 438 1401 3633
+* 5p + 26 1303 344 387 1203 3237
+* 5p + 27 1213 326 386 1039 2964
+* 5p + 28 1053 322 371 1008 2754
+* 5p + 29 945 287 368 958 2558
+* 5p + 30 844 316 331 881 2372
+* 5p + 31 804 310 314 822 2250
+* 5p + 32 761 269 378 708 2116
+* 5p + 33 739 253 319 695 2006
+* 5p + 34 642 266 358 657 1923
+* 5p + 35 624 271 311 608 1814
+* 5p + 36 599 243 305 578 1725
+* 5p + 37 603 234 276 533 1646
+* 5p + 38 528 207 287 547 1569
+* 5p + 39 464 219 290 523 1496
+* 5p + 40 478 193 280 477 1428
+* 5p + 41 448 218 264 441 1371
+* 5p + 42 432 206 230 453 1321
+* 5p + 43 371 209 256 447 1283
+* 5p + 44 389 213 260 382 1244
+* 5p + 45 399 192 223 386 1200
+* 5p + 46 390 183 224 358 1155
+* 5p + 47 337 167 229 392 1125
+* 5p + 48 357 173 245 310 1085
+* 5p + 49 340 167 235 306 1048
+* 5p + 50 263 160 243 345 1011
+* 5p + 51 291 151 237 306 985
+* 5p + 52 294 166 205 288 953
+* 5p + 53 265 154 246 250 915
+* 5p + 54 283 134 218 245 880
+* 5p + 55 262 137 163 276 838
+* 5p + 56 259 128 180 241 808
+* 5p + 57 223 138 195 225 781
+* 5p + 58 203 151 152 251 757
+* 5p + 59 218 119 177 224 738
+* 5p + 60 221 134 155 197 707
+* 5p + 61 213 124 152 184 673
+* 5p + 62 189 137 148 180 654
+* 5p + 63 198 105 158 175 636
+* 5p + 64 170 118 162 166 616
+* 5p + 65 169 123 127 175 594
+* 5p + 66 187 92 101 186 566
+* 5p + 67 155 85 123 181 544
+* 5p + 68 167 88 97 169 521
+* 5p + 69 182 86 93 138 499
+* 5p + 70 140 74 110 149 473
+* 5p - -10 1352 425 298 1439 3514
+* 5p - -9 1385 446 267 1419 3517
+* 5p - -8 1386 402 319 1410 3517
+* 5p - -7 1398 393 329 1397 3517
+* 5p - -6 1357 400 351 1409 3517
+* 5p - -5 1376 376 332 1433 3517
+* 5p - -4 1372 373 355 1417 3517
+* 5p - -3 1342 387 348 1440 3517
+* 5p - -2 1368 422 372 1355 3517
+* 5p - -1 1562 263 438 1255 3518
+* 5p - 1 1589 387 251 1291 3518
+* 5p - 2 1342 343 280 1553 3518
+* 5p - 3 1475 331 254 1457 3517
+* 5p - 4 1452 333 249 1484 3518
+* 5p - 5 1522 313 261 1421 3517
+* 5p - 6 1489 336 272 1421 3518
+* 5p - 7 1476 394 320 1328 3518
+* 5p - 8 1429 428 328 1333 3518
+* 5p - 9 1462 404 365 1287 3518
+* 5p - 10 1400 462 405 1251 3518
+* 5p - 11 1309 477 363 1369 3518
+* 5p - 12 1318 464 361 1375 3518
+* 5p - 13 1316 524 355 1323 3518
+* 5p - 14 1362 498 388 1270 3518
+* 5p - 15 1299 454 405 1360 3518
+* 5p - 16 1263 458 383 1412 3516
+* 5p - 17 1332 458 393 1335 3518
+* 5p - 18 1369 380 408 1361 3518
+* 5p - 19 1311 434 370 1403 3518
+* 5p - 20 1286 425 367 1440 3518
+* 5p - 21 1325 427 309 1457 3518
+* 5p - 22 1350 440 346 1382 3518
+* 5p - 23 1349 387 334 1448 3518
+* 5p - 24 1450 369 333 1366 3518
+* 5p - 25 1386 412 332 1387 3517
+* 5p - 26 1140 395 308 1234 3077
+* 5p - 27 1082 353 280 1089 2804
+* 5p - 28 943 365 275 1006 2589
+* 5p - 29 858 327 290 922 2397
+* 5p - 30 795 297 287 857 2236
+* 5p - 31 735 331 270 783 2119
+* 5p - 32 745 317 210 731 2003
+* 5p - 33 634 302 227 728 1891
+* 5p - 34 625 327 200 621 1773
+* 5p - 35 587 294 186 620 1687
+* 5p - 36 567 287 195 555 1604
+* 5p - 37 524 248 224 539 1535
+* 5p - 38 518 261 189 515 1483
+* 5p - 39 467 226 198 519 1410
+* 5p - 40 424 249 220 457 1350
+* 5p - 41 408 267 214 399 1288
+* 5p - 42 389 220 182 449 1240
+* 5p - 43 366 220 206 396 1188
+* 5p - 44 358 223 181 376 1138
+* 5p - 45 370 195 186 355 1106
+* 5p - 46 314 212 214 325 1065
+* 5p - 47 363 211 152 301 1027
+* 5p - 48 324 218 133 319 994
+* 5p - 49 287 213 165 287 952
+* 5p - 50 272 189 163 292 916
+* 5p - 51 268 191 155 269 883
+* 5p - 52 225 205 153 268 851
+* 5p - 53 201 191 180 259 831
+* 5p - 54 249 144 148 254 795
+* 5p - 55 246 184 118 212 760
+* 5p - 56 201 167 161 208 737
+* 5p - 57 216 142 143 206 707
+* 5p - 58 212 138 136 202 688
+* 5p - 59 200 147 119 190 656
+* 5p - 60 166 154 112 200 632
+* 5p - 61 157 159 125 170 611
+* 5p - 62 193 158 96 144 591
+* 5p - 63 165 106 128 175 574
+* 5p - 64 166 127 117 140 550
+* 5p - 65 150 138 83 159 530
+* 5p - 66 168 136 76 138 518
+* 5p - 67 141 124 98 139 502
+* 5p - 68 134 123 74 154 485
+* 5p - 69 141 108 77 143 469
+* 5p - 70 141 92 82 134 449
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/dnacomp_genome.csv
new file mode 100644
index 0000000..94d15ba
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.388112441327,0.105690628131,0.117477981119,0.388718949422
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/lgdistribution.txt
new file mode 100644
index 0000000..aa8f733
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/lgdistribution.txt
@@ -0,0 +1,229 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 23 2
++ 24 49
++ 25 241
++ 26 234
++ 27 171
++ 28 179
++ 29 167
++ 30 115
++ 31 130
++ 32 110
++ 33 81
++ 34 102
++ 35 90
++ 36 78
++ 37 80
++ 38 63
++ 39 61
++ 40 63
++ 41 50
++ 42 35
++ 43 44
++ 44 39
++ 45 50
++ 46 29
++ 47 37
++ 48 36
++ 49 34
++ 50 28
++ 51 29
++ 52 35
++ 53 39
++ 54 41
++ 55 30
++ 56 26
++ 57 25
++ 58 18
++ 59 32
++ 60 33
++ 61 19
++ 62 17
++ 63 19
++ 64 22
++ 65 28
++ 66 22
++ 67 23
++ 68 22
++ 69 26
++ 70 51
++ 71 29
++ 72 38
++ 73 42
++ 74 10
++ 75 10
++ 76 12
++ 77 13
++ 78 14
++ 79 6
++ 80 12
++ 81 6
++ 82 15
++ 83 9
++ 84 12
++ 85 6
++ 86 7
++ 87 5
++ 88 7
++ 89 7
++ 90 4
++ 91 4
++ 92 7
++ 93 8
++ 94 20
++ 95 2
++ 96 4
++ 97 6
++ 98 7
++ 99 4
++ 100 3
++ 101 5
++ 102 3
++ 103 7
++ 104 3
++ 105 5
++ 106 2
++ 107 2
++ 108 2
++ 109 3
++ 110 2
++ 112 1
++ 117 1
++ 118 4
++ 119 2
++ 120 2
++ 122 2
++ 123 2
++ 124 1
++ 125 1
++ 126 1
++ 128 1
++ 129 1
++ 130 2
++ 133 2
++ 140 1
++ 145 1
++ 146 2
++ 149 1
++ 150 1
++ 161 1
++ 168 1
++ 181 1
++ 184 1
+- 23 6
+- 24 46
+- 25 288
+- 26 234
+- 27 175
+- 28 166
+- 29 152
+- 30 115
+- 31 103
+- 32 108
+- 33 116
+- 34 94
+- 35 73
+- 36 66
+- 37 55
+- 38 71
+- 39 61
+- 40 60
+- 41 51
+- 42 46
+- 43 51
+- 44 30
+- 45 43
+- 46 36
+- 47 33
+- 48 42
+- 49 34
+- 50 33
+- 51 31
+- 52 19
+- 53 33
+- 54 34
+- 55 27
+- 56 30
+- 57 20
+- 58 34
+- 59 22
+- 60 21
+- 61 20
+- 62 17
+- 63 24
+- 64 20
+- 65 12
+- 66 16
+- 67 16
+- 68 16
+- 69 19
+- 70 49
+- 71 16
+- 72 41
+- 73 43
+- 74 9
+- 75 16
+- 76 11
+- 77 8
+- 78 2
+- 79 11
+- 80 9
+- 81 10
+- 82 14
+- 83 6
+- 84 7
+- 85 10
+- 86 3
+- 87 8
+- 88 2
+- 89 5
+- 90 11
+- 91 7
+- 92 4
+- 93 8
+- 94 23
+- 95 5
+- 96 7
+- 97 4
+- 98 8
+- 99 2
+- 100 3
+- 101 3
+- 102 2
+- 103 2
+- 104 3
+- 105 1
+- 107 2
+- 109 3
+- 110 1
+- 111 1
+- 112 1
+- 113 1
+- 114 2
+- 116 2
+- 118 3
+- 119 3
+- 120 1
+- 121 1
+- 122 1
+- 123 2
+- 127 1
+- 129 2
+- 130 1
+- 131 1
+- 134 1
+- 136 1
+- 137 1
+- 138 2
+- 140 1
+- 145 1
+- 147 1
+- 148 1
+- 162 1
+- 164 1
+- 166 1
+- 174 2
+- 178 1
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/misincorporation.txt
new file mode 100644
index 0000000..ffeb662
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_mito.mapDamage/Pi1845A_id_CGCTAT/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 1355 306 445 1527 3633 19 5 13 12 5 15 7 9 9 23 5 4 0 0 0 0 0 0 0 0 0
+* 3p + 2 1655 279 356 1343 3633 8 4 7 18 4 14 4 5 5 24 6 3 0 0 0 0 0 0 0 0 0
+* 3p + 3 1574 268 408 1383 3633 12 1 5 12 1 13 1 2 2 22 5 3 0 0 0 0 0 0 0 0 0
+* 3p + 4 1537 291 356 1447 3631 8 3 6 18 1 14 3 4 9 16 0 1 0 0 0 0 2 0 0 0 0
+* 3p + 5 1507 325 337 1464 3633 8 0 2 1 0 12 3 4 7 18 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 6 1463 334 353 1478 3628 6 2 2 2 4 11 2 6 2 16 1 3 0 1 0 0 3 0 1 1 0
+* 3p + 7 1423 366 374 1466 3629 7 5 0 8 1 8 0 3 1 6 0 1 1 0 0 0 1 0 2 1 0
+* 3p + 8 1492 288 400 1445 3625 1 2 3 13 1 8 0 0 1 12 1 1 1 0 0 0 3 2 2 1 0
+* 3p + 9 1399 411 392 1421 3623 9 2 5 5 6 7 0 4 1 11 0 4 2 2 2 0 6 1 1 2 0
+* 3p + 10 1396 412 432 1388 3628 4 3 0 3 1 4 1 5 2 6 0 0 4 2 1 2 1 2 0 1 0
+* 3p + 11 1448 410 413 1357 3628 5 6 1 8 2 13 0 4 1 9 0 0 3 2 1 2 1 4 0 0 0
+* 3p + 12 1342 472 455 1354 3623 5 2 3 4 1 4 0 1 0 7 0 0 4 2 1 1 1 5 1 3 0
+* 3p + 13 1397 387 490 1342 3616 2 1 4 2 1 3 0 4 2 8 0 0 2 2 3 2 4 8 1 4 0
+* 3p + 14 1482 409 472 1262 3625 4 1 5 4 2 2 0 0 0 4 1 0 2 1 2 3 2 2 1 3 0
+* 3p + 15 1322 439 481 1377 3619 2 1 1 3 0 3 1 0 1 1 0 0 1 1 2 0 3 8 2 1 0
+* 3p + 16 1392 386 504 1338 3620 3 6 4 5 1 2 0 2 1 5 0 0 3 3 1 0 3 2 3 5 0
+* 3p + 17 1441 403 495 1282 3621 0 2 6 2 4 4 0 2 1 5 0 3 1 2 0 0 8 1 2 1 0
+* 3p + 18 1409 400 412 1409 3630 4 7 2 10 3 6 0 2 0 3 0 1 0 2 0 0 2 0 1 0 0
+* 3p + 19 1437 402 428 1366 3633 2 7 3 2 2 6 1 1 2 4 1 4 3 0 0 0 0 0 0 0 0
+* 3p + 20 1442 383 412 1388 3625 6 4 2 4 3 8 0 1 2 5 0 7 1 1 2 0 3 5 0 0 0
+* 3p + 21 1400 384 434 1414 3632 0 1 3 2 0 2 0 2 3 4 0 2 0 3 0 0 0 0 1 0 0
+* 3p + 22 1421 369 454 1386 3630 3 4 0 3 0 12 0 0 4 2 0 3 0 1 1 0 3 0 0 0 0
+* 3p + 23 1455 352 414 1411 3632 5 1 4 8 3 16 1 0 1 10 0 1 2 1 0 0 0 0 1 0 0
+* 3p + 24 1397 352 413 1467 3629 3 4 5 12 6 25 0 1 4 10 0 6 4 2 1 0 3 1 0 0 0
+* 3p + 25 1502 384 388 1358 3632 11 8 11 16 10 57 1 4 3 28 0 6 5 1 1 0 0 0 1 0 0
+* 3p + 26 1250 351 401 1269 3271 8 8 6 5 5 60 3 7 4 28 0 3 3 1 0 0 0 0 0 0 0
+* 3p + 27 1187 327 327 1136 2977 2 6 11 12 14 39 2 3 4 48 10 8 1 4 0 0 0 0 0 0 0
+* 3p + 28 1101 335 398 940 2774 10 11 8 6 11 41 0 3 4 27 4 12 0 1 0 0 0 0 0 0 0
+* 3p + 29 1055 318 361 826 2560 11 11 10 4 9 43 0 4 3 12 3 3 1 0 0 0 1 5 0 0 0
+* 3p + 30 931 290 333 828 2382 5 1 5 3 5 18 1 4 2 28 0 5 2 0 0 1 0 1 0 0 0
+* 3p + 31 870 260 325 797 2252 7 5 7 10 0 26 1 2 3 22 0 5 1 2 0 0 0 2 0 0 0
+* 3p + 32 816 247 290 763 2116 1 7 11 6 4 21 0 0 2 17 2 4 2 1 0 0 1 1 0 0 0
+* 3p + 33 698 252 324 725 1999 5 2 8 2 4 18 0 2 0 6 4 1 3 5 2 2 2 2 3 0 0
+* 3p + 34 703 217 317 683 1920 3 1 4 3 4 27 0 1 0 25 1 0 1 2 0 0 0 1 2 0 0
+* 3p + 35 647 236 294 644 1821 3 4 6 4 1 20 2 8 3 18 2 2 5 1 0 0 0 0 0 0 0
+* 3p + 36 596 241 321 570 1728 4 3 2 9 2 18 0 9 7 24 0 7 3 0 2 0 1 1 0 0 0
+* 3p + 37 560 239 313 536 1648 6 2 11 0 3 16 1 1 2 15 1 5 1 3 0 0 1 2 0 0 0
+* 3p + 38 513 213 301 545 1572 4 8 2 3 1 19 0 2 0 25 0 2 0 4 1 0 0 0 0 0 0
+* 3p + 39 532 245 243 484 1504 7 0 5 2 5 13 1 1 3 11 2 2 1 2 0 0 0 0 0 0 0
+* 3p + 40 469 227 278 465 1439 8 6 2 3 3 8 0 5 0 8 0 0 4 0 0 0 0 0 0 0 0
+* 3p + 41 421 198 294 461 1374 0 3 1 1 8 10 1 2 2 7 0 0 2 0 0 0 0 1 0 0 0
+* 3p + 42 440 235 264 384 1323 0 3 4 2 2 5 2 0 1 6 1 5 0 1 0 0 0 1 0 0 0
+* 3p + 43 414 222 267 383 1286 5 1 2 2 2 18 0 1 4 6 0 2 0 0 0 0 0 3 0 0 0
+* 3p + 44 423 186 283 351 1243 5 2 3 1 5 9 0 4 0 2 1 3 2 0 0 0 1 0 0 0 0
+* 3p + 45 398 192 221 394 1205 8 3 2 5 5 10 2 5 0 5 0 1 1 1 0 0 0 0 0 0 0
+* 3p + 46 366 209 224 357 1156 1 8 1 0 0 10 0 4 0 1 2 0 1 0 0 0 0 0 0 0 0
+* 3p + 47 311 216 237 359 1123 8 1 2 7 0 2 0 8 1 4 0 1 2 0 0 1 0 0 2 0 0
+* 3p + 48 330 178 200 377 1085 0 1 0 3 0 2 0 3 1 6 0 0 2 0 0 1 1 0 0 0 0
+* 3p + 49 313 187 234 313 1047 10 0 0 4 3 6 0 6 0 3 2 1 3 0 0 0 2 0 0 0 0
+* 3p + 50 302 171 222 318 1013 0 1 0 4 0 0 0 0 0 4 0 1 0 0 1 0 0 0 0 0 0
+* 3p + 51 318 209 164 296 987 0 1 1 4 0 6 0 10 2 6 2 0 0 0 0 0 0 0 0 0 0
+* 3p + 52 306 184 191 276 957 2 1 2 1 2 4 0 1 2 0 0 1 0 0 1 0 0 0 0 0 0
+* 3p + 53 274 157 227 262 920 4 0 0 6 3 1 0 2 2 1 0 3 0 1 0 0 0 0 0 0 0
+* 3p + 54 251 178 170 282 881 1 0 1 0 0 4 0 3 0 0 0 2 0 0 1 0 0 0 0 0 0
+* 3p + 55 256 158 188 237 839 0 5 1 0 1 0 0 3 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 56 245 132 198 234 809 0 0 1 0 0 0 0 0 1 2 0 0 1 0 0 0 0 0 0 0 0
+* 3p + 57 234 124 175 251 784 0 2 2 1 0 0 0 0 5 2 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 58 219 163 146 229 757 0 6 0 3 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 59 219 118 174 228 739 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 60 197 114 163 232 706 0 1 1 3 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+* 3p + 61 226 118 157 171 672 0 2 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
+* 3p + 62 207 128 139 179 653 0 4 1 0 4 3 0 0 0 2 0 0 0 0 0 0 0 1 0 0 0
+* 3p + 63 189 119 143 184 635 0 3 3 0 5 2 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0
+* 3p + 64 188 127 121 180 616 2 1 5 0 2 1 0 0 3 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 65 172 85 148 189 594 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 66 194 89 115 168 566 0 0 0 1 0 4 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
+* 3p + 67 159 98 118 169 544 1 1 5 3 1 1 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 68 147 97 131 146 521 0 1 2 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 69 131 109 112 147 499 0 2 0 0 1 0 0 0 0 2 0 0 1 0 0 0 0 0 1 0 0
+* 3p + 70 137 77 110 149 473 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 1 1219 337 328 1634 3518 22 4 9 10 11 19 6 4 11 30 0 3 0 0 0 0 0 0 0 0 0
+* 3p - 2 1502 356 250 1409 3517 12 4 3 15 6 17 3 13 8 47 1 3 0 0 0 0 1 0 0 0 0
+* 3p - 3 1361 334 299 1520 3514 7 4 8 8 3 11 2 9 6 24 0 10 0 0 0 0 1 3 0 0 0
+* 3p - 4 1433 336 302 1443 3514 5 3 6 4 4 11 1 6 2 27 0 2 0 0 0 0 3 0 0 1 0
+* 3p - 5 1346 337 312 1517 3512 11 4 1 5 8 8 0 4 2 16 0 2 0 0 0 0 5 1 0 0 0
+* 3p - 6 1389 380 294 1453 3516 4 5 9 4 5 9 0 5 5 17 2 4 0 1 1 1 0 0 0 2 0
+* 3p - 7 1317 414 274 1507 3512 8 4 7 5 5 9 1 4 4 4 1 0 1 2 1 0 3 1 1 1 0
+* 3p - 8 1312 379 347 1474 3512 6 5 5 1 2 6 0 1 2 10 0 4 2 1 2 0 3 2 0 1 0
+* 3p - 9 1336 433 343 1399 3511 13 3 5 0 2 6 0 1 2 3 1 4 0 2 2 2 3 4 0 0 0
+* 3p - 10 1334 428 337 1409 3508 4 4 0 1 2 6 0 4 0 5 1 6 0 2 2 1 1 3 5 0 0
+* 3p - 11 1331 407 376 1396 3510 5 12 2 4 1 8 0 0 3 7 0 2 3 1 3 0 5 2 0 1 0
+* 3p - 12 1284 455 379 1387 3505 0 4 4 1 1 7 1 1 0 8 2 1 1 3 0 0 2 7 2 2 0
+* 3p - 13 1276 447 406 1376 3505 2 9 5 3 2 8 2 2 3 8 1 4 0 3 2 0 3 3 5 1 0
+* 3p - 14 1342 460 380 1325 3507 0 2 4 5 2 3 1 0 6 3 0 0 3 0 1 0 0 6 3 2 0
+* 3p - 15 1250 512 408 1339 3509 3 3 5 4 1 1 0 1 1 2 0 6 0 0 0 1 4 3 0 2 0
+* 3p - 16 1349 455 404 1305 3513 2 3 10 1 1 10 0 0 6 1 1 6 2 1 0 0 2 1 2 0 0
+* 3p - 17 1404 446 351 1305 3506 2 7 5 1 0 3 1 1 1 2 0 9 2 1 0 0 0 5 0 7 0
+* 3p - 18 1314 505 365 1329 3513 4 3 7 3 1 2 0 0 1 7 0 3 1 1 0 0 1 1 0 3 0
+* 3p - 19 1386 450 394 1280 3510 2 8 4 4 3 6 0 0 2 7 0 5 1 0 0 0 0 1 2 5 0
+* 3p - 20 1501 379 402 1227 3509 0 4 1 3 5 4 1 2 2 5 0 7 1 3 1 0 2 2 1 4 0
+* 3p - 21 1381 447 341 1340 3509 4 7 3 3 0 3 1 1 0 3 0 0 2 3 0 1 2 5 2 0 0
+* 3p - 22 1446 409 352 1305 3512 0 7 5 5 0 2 0 1 2 8 0 3 6 5 0 4 4 1 1 0 0
+* 3p - 23 1448 396 332 1340 3516 2 4 6 4 3 5 1 1 1 6 0 3 1 4 0 0 1 1 0 0 0
+* 3p - 24 1359 447 306 1404 3516 3 7 8 4 3 14 3 11 2 19 0 3 2 0 0 0 1 0 0 0 0
+* 3p - 25 1391 399 315 1411 3516 3 6 4 12 9 35 2 2 12 36 3 0 3 1 0 0 0 2 0 0 0
+* 3p - 26 1236 358 318 1201 3113 11 5 12 5 1 32 0 5 20 34 3 3 1 1 0 3 2 0 4 0 0
+* 3p - 27 1059 351 320 1081 2811 13 4 2 4 3 20 3 7 1 45 2 5 0 1 1 0 2 0 1 2 0
+* 3p - 28 972 306 324 991 2593 7 2 4 6 2 25 1 0 5 26 0 1 1 2 0 0 2 3 0 0 0
+* 3p - 29 919 293 283 913 2408 7 5 3 8 3 31 1 6 5 32 1 5 3 0 0 0 0 0 2 0 0
+* 3p - 30 796 325 300 818 2239 3 6 5 7 3 27 3 2 1 20 0 1 1 7 0 0 2 0 2 0 0
+* 3p - 31 742 341 280 759 2122 4 3 7 4 4 36 1 2 2 20 0 0 1 9 0 1 0 0 2 0 0
+* 3p - 32 762 312 228 707 2009 5 0 6 2 8 25 0 11 1 18 0 3 5 0 0 2 0 0 0 1 0
+* 3p - 33 675 295 264 662 1896 5 1 2 3 0 18 3 0 1 15 0 0 0 1 2 1 3 0 0 0 0
+* 3p - 34 627 269 269 613 1778 3 2 4 2 0 15 2 1 3 10 0 0 3 6 0 0 0 4 0 0 0
+* 3p - 35 672 261 210 546 1689 3 2 7 3 3 17 3 1 1 15 3 3 1 1 0 2 0 1 0 0 0
+* 3p - 36 571 279 221 539 1610 2 6 5 6 2 14 0 5 2 15 2 0 2 2 0 0 2 0 0 0 0
+* 3p - 37 515 294 223 507 1539 5 1 4 9 2 19 0 2 4 15 0 7 2 1 0 0 0 1 1 0 0
+* 3p - 38 516 224 219 528 1487 3 2 2 12 4 14 1 0 0 19 0 3 1 0 0 0 0 0 0 0 0
+* 3p - 39 470 243 213 487 1413 9 0 4 6 5 21 0 3 5 22 0 5 0 0 0 0 0 0 0 0 0
+* 3p - 40 418 242 198 493 1351 15 0 1 8 1 13 0 3 6 15 2 1 0 0 0 0 1 0 0 0 0
+* 3p - 41 447 230 186 428 1291 4 2 9 3 1 10 0 4 2 14 0 4 0 1 0 0 1 1 0 0 0
+* 3p - 42 371 262 195 410 1238 5 4 7 8 4 6 0 2 0 19 0 3 1 0 0 0 0 0 0 2 0
+* 3p - 43 422 218 187 363 1190 2 1 1 5 1 13 0 0 0 10 0 5 0 0 1 0 0 0 0 0 0
+* 3p - 44 378 207 191 362 1138 6 1 1 3 0 11 0 0 5 3 0 2 1 0 0 0 0 1 0 0 0
+* 3p - 45 353 247 164 343 1107 1 0 7 5 2 5 0 3 0 5 0 0 0 2 0 0 0 0 0 0 0
+* 3p - 46 349 233 155 329 1066 0 5 3 3 1 7 0 2 2 14 0 0 0 2 0 0 0 0 0 0 0
+* 3p - 47 357 227 131 313 1028 2 0 8 1 1 6 0 0 0 16 0 4 1 0 1 0 0 0 0 0 0
+* 3p - 48 294 228 147 326 995 3 0 3 1 2 7 0 1 1 7 0 1 0 1 0 0 0 0 0 0 0
+* 3p - 49 273 227 149 304 953 2 6 3 2 0 6 0 2 1 2 0 1 0 0 1 0 0 0 0 0 0
+* 3p - 50 304 229 122 262 917 1 0 1 1 1 7 1 0 2 4 0 1 2 0 0 0 0 0 0 0 0
+* 3p - 51 284 204 136 259 883 2 0 9 0 2 4 0 2 0 6 0 3 2 0 0 0 0 0 1 0 0
+* 3p - 52 262 198 135 257 852 0 4 3 1 3 1 1 0 0 5 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 53 247 191 111 283 832 2 0 1 1 3 3 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 54 233 185 122 259 799 1 0 1 1 0 0 1 0 2 1 0 0 0 0 0 0 1 0 0 0 0
+* 3p - 55 212 183 138 233 766 1 2 5 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 56 234 180 101 224 739 0 1 5 2 1 0 0 0 0 4 0 0 0 0 1 0 0 0 0 0 0
+* 3p - 57 170 184 117 237 708 1 1 0 0 0 0 0 3 0 1 0 0 0 0 1 0 0 0 0 0 0
+* 3p - 58 198 177 123 191 689 1 0 1 0 0 3 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 59 209 138 104 205 656 0 0 0 1 0 0 0 1 0 2 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 60 206 137 115 174 632 2 0 3 0 0 1 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 61 189 154 110 158 611 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 62 172 130 102 187 591 0 0 2 2 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
+* 3p - 63 151 134 91 198 574 1 2 2 0 0 0 0 0 0 5 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 64 150 116 101 183 550 2 1 3 0 0 5 0 0 1 2 0 0 0 0 0 1 0 0 0 0 0
+* 3p - 65 174 127 72 157 530 0 1 2 1 1 0 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 66 119 124 105 170 518 1 3 2 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 67 140 133 80 149 502 0 0 1 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 68 153 104 82 145 484 3 1 2 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 69 150 96 80 143 469 0 3 1 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 70 149 101 59 141 450 1 2 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 1 1565 344 410 1314 3633 49 34 57 40 59 226 8 39 22 206 14 32 0 0 0 0 0 0 0 0 4
+* 5p + 2 1483 319 379 1449 3630 21 29 32 27 24 194 3 12 14 102 11 37 2 1 2 0 0 3 0 0 3
+* 5p + 3 1540 339 343 1403 3625 14 15 15 19 17 90 3 9 9 47 4 13 9 4 0 0 1 6 1 0 2
+* 5p + 4 1533 338 339 1410 3620 7 14 9 9 11 43 2 6 4 38 2 12 10 6 4 2 6 5 2 0 0
+* 5p + 5 1522 328 326 1448 3624 1 2 6 13 6 30 2 5 10 24 2 3 8 9 1 1 3 2 4 0 0
+* 5p + 6 1531 332 341 1415 3619 1 6 7 4 7 9 0 5 5 13 0 5 10 8 0 2 5 4 5 0 0
+* 5p + 7 1544 334 366 1385 3629 1 10 3 1 3 14 1 1 1 6 2 2 5 3 1 0 2 2 0 0 0
+* 5p + 8 1510 363 379 1378 3630 4 6 1 5 4 8 0 3 3 4 1 3 1 0 3 0 1 1 1 0 0
+* 5p + 9 1554 352 380 1343 3629 3 3 6 6 3 6 0 4 1 6 0 3 4 2 0 0 1 1 2 0 0
+* 5p + 10 1472 343 466 1339 3620 4 6 2 2 2 6 0 2 2 4 1 1 6 2 0 0 7 2 1 3 0
+* 5p + 11 1387 407 440 1381 3615 3 4 4 3 0 4 0 1 1 5 0 1 2 3 1 0 5 9 4 0 0
+* 5p + 12 1413 429 449 1332 3623 6 2 4 5 1 5 2 3 2 5 1 0 2 2 0 0 3 1 0 6 0
+* 5p + 13 1395 461 445 1314 3615 0 1 7 7 1 1 1 3 3 5 0 0 3 0 1 2 5 4 2 7 0
+* 5p + 14 1390 450 434 1348 3622 6 7 3 8 1 4 0 1 1 4 0 0 1 2 0 0 2 8 0 1 0
+* 5p + 15 1335 441 470 1378 3624 5 1 1 7 5 10 0 4 2 11 0 0 1 2 2 2 2 6 1 0 0
+* 5p + 16 1435 459 437 1296 3627 2 5 2 7 1 5 1 10 2 6 0 0 2 3 3 2 2 2 0 1 0
+* 5p + 17 1412 436 431 1347 3626 7 5 6 4 7 7 0 7 0 9 0 1 5 3 0 0 3 1 1 2 0
+* 5p + 18 1416 405 447 1356 3624 6 4 7 3 1 3 0 5 0 8 0 1 4 0 5 2 4 2 2 1 0
+* 5p + 19 1379 407 424 1419 3629 7 1 3 3 0 7 0 10 1 8 1 1 2 0 2 1 2 0 1 1 0
+* 5p + 20 1408 354 410 1460 3632 4 3 2 1 1 8 0 3 0 10 0 3 0 0 0 1 0 0 1 0 0
+* 5p + 21 1455 357 376 1443 3631 11 0 4 2 0 9 0 3 1 11 0 2 1 0 0 0 0 0 1 1 0
+* 5p + 22 1430 369 442 1392 3633 5 1 1 2 3 7 0 1 1 9 1 2 1 1 0 0 0 0 0 0 0
+* 5p + 23 1419 377 422 1415 3633 3 5 2 6 1 5 0 2 4 6 1 2 1 0 0 0 0 0 0 0 0
+* 5p + 24 1484 357 391 1401 3633 6 1 1 3 0 9 1 2 0 10 2 0 0 0 0 0 0 0 0 0 0
+* 5p + 25 1398 365 445 1425 3633 2 6 1 5 0 8 0 5 1 16 1 0 0 0 0 0 0 0 0 0 0
+* 5p + 26 1321 336 392 1222 3271 7 2 2 4 4 1 0 1 3 8 1 2 0 0 0 0 0 0 0 0 0
+* 5p + 27 1222 330 382 1043 2977 2 2 1 2 2 2 0 1 2 2 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 28 1064 323 376 1011 2774 2 1 0 3 0 1 0 0 0 3 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 29 942 288 362 974 2566 4 0 4 1 1 3 0 0 1 4 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 30 844 312 338 889 2383 4 0 0 4 2 1 1 0 0 3 0 0 0 0 1 0 0 0 0 0 0
+* 5p + 31 820 308 314 812 2254 0 4 0 2 3 2 0 0 1 0 2 0 0 0 0 0 0 0 0 0 0
+* 5p + 32 752 266 382 718 2118 5 3 0 2 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 33 743 250 319 694 2006 4 0 4 2 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 34 639 262 357 665 1923 0 0 2 5 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 35 636 270 312 602 1820 1 3 1 0 0 1 0 0 2 0 0 1 0 0 0 0 0 1 0 0 0
+* 5p + 36 600 247 304 578 1729 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
+* 5p + 37 596 239 278 538 1651 4 3 3 2 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 38 534 199 279 560 1572 0 0 5 6 2 2 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 39 476 216 287 525 1504 0 1 0 3 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 40 480 194 277 488 1439 1 2 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 41 453 220 267 435 1375 0 3 0 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 42 438 198 235 453 1324 2 0 0 5 0 2 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0
+* 5p + 43 370 209 259 449 1287 1 2 1 6 0 0 0 0 0 0 0 1 0 0 0 0 0 2 0 0 0
+* 5p + 44 390 203 260 391 1244 1 0 0 6 1 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 45 397 191 226 391 1205 2 0 0 4 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 46 391 179 228 358 1156 3 0 0 4 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 47 341 164 227 393 1125 2 4 3 4 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 48 358 170 243 315 1086 2 0 2 3 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 49 336 172 233 308 1049 0 3 0 1 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 50 267 159 246 341 1013 0 1 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 51 291 149 239 308 987 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 52 293 166 206 292 957 1 0 0 1 1 0 0 0 0 2 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 53 270 148 248 254 920 2 0 0 3 0 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 54 283 135 216 247 881 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 55 261 140 166 272 839 2 2 1 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 56 256 129 181 243 809 1 2 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 57 225 136 196 227 784 1 1 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 58 201 151 154 251 757 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 59 220 117 175 227 739 0 0 0 3 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 60 218 129 156 204 707 0 0 0 6 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 61 210 123 154 186 673 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 62 189 138 147 178 652 3 0 1 0 0 2 0 0 2 0 0 0 0 0 0 0 1 0 0 1 0
+* 5p + 63 202 102 153 178 635 0 0 3 3 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 64 167 115 164 170 616 2 1 0 3 1 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0 0
+* 5p + 65 167 124 128 175 594 0 2 0 1 0 0 0 2 1 2 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 66 177 94 96 199 566 0 0 0 2 0 0 1 2 4 6 1 0 0 3 0 0 0 0 0 0 0
+* 5p + 67 145 86 115 196 542 1 0 0 1 0 1 1 2 5 7 0 0 0 0 0 0 2 0 0 0 0
+* 5p + 68 159 89 95 177 520 3 0 0 3 0 4 3 0 4 8 0 1 0 0 0 0 1 0 0 0 0
+* 5p + 69 175 84 95 145 499 1 0 1 1 1 2 1 2 3 5 2 2 0 0 0 0 1 0 0 0 0
+* 5p + 70 140 72 105 155 472 1 0 2 5 0 2 2 2 2 4 0 1 0 1 0 0 1 0 0 0 0
+* 5p - 1 1492 367 261 1398 3518 48 17 25 44 29 204 9 36 35 271 9 22 0 0 0 0 0 0 0 0 9
+* 5p - 2 1351 350 269 1546 3516 25 28 31 29 21 137 8 21 25 132 3 24 1 1 3 1 0 2 0 0 3
+* 5p - 3 1473 323 249 1468 3513 8 22 13 34 11 66 1 15 11 64 2 7 1 3 1 3 2 2 0 0 0
+* 5p - 4 1479 329 237 1465 3510 5 9 21 9 5 41 1 6 7 25 2 10 6 5 0 0 5 1 2 0 0
+* 5p - 5 1518 312 252 1413 3495 4 7 11 6 0 18 3 1 2 19 0 1 8 9 0 6 5 11 4 2 0
+* 5p - 6 1488 330 264 1420 3502 1 2 8 5 3 11 2 3 3 11 0 6 13 9 1 1 5 1 7 3 0
+* 5p - 7 1483 385 311 1329 3508 0 2 8 7 0 8 1 0 2 8 1 1 9 8 3 3 3 2 3 2 0
+* 5p - 8 1423 423 317 1345 3508 1 2 7 3 2 8 0 0 1 9 0 0 2 7 0 0 0 2 3 5 0
+* 5p - 9 1465 404 354 1281 3504 3 5 7 3 3 1 0 0 0 5 0 0 3 3 0 1 4 4 0 6 0
+* 5p - 10 1406 463 395 1246 3510 1 2 4 1 0 8 0 0 1 5 0 1 2 5 1 0 1 2 1 4 0
+* 5p - 11 1329 468 357 1353 3507 2 1 7 4 1 6 0 1 3 0 0 4 2 2 0 0 5 2 1 3 0
+* 5p - 12 1311 456 366 1374 3507 5 0 0 6 1 4 1 2 1 2 1 4 0 1 0 1 1 6 1 3 0
+* 5p - 13 1315 526 348 1320 3509 6 3 4 3 3 1 3 0 2 3 0 1 3 1 1 0 2 4 2 1 0
+* 5p - 14 1358 499 381 1266 3504 1 4 6 0 1 10 0 1 2 12 1 0 0 1 0 0 2 4 7 1 0
+* 5p - 15 1307 451 398 1353 3509 1 4 4 4 0 4 0 3 4 5 0 1 0 0 0 0 4 3 0 2 0
+* 5p - 16 1270 455 388 1390 3503 4 3 2 4 1 7 0 2 1 4 1 2 1 1 3 0 2 7 4 0 0
+* 5p - 17 1300 457 400 1352 3509 7 2 1 3 2 6 0 1 0 7 0 1 1 3 3 0 4 5 0 0 0
+* 5p - 18 1363 384 405 1361 3513 4 7 5 2 1 8 0 1 3 13 0 1 0 3 1 1 2 2 0 1 0
+* 5p - 19 1317 437 369 1390 3513 4 2 2 0 4 7 1 0 1 4 1 2 2 2 2 1 2 1 2 0 0
+* 5p - 20 1285 427 377 1428 3517 4 5 3 1 1 8 0 1 2 4 1 2 2 0 0 1 1 0 0 0 0
+* 5p - 21 1308 429 308 1472 3517 5 2 1 3 3 3 0 2 5 16 0 3 1 3 1 1 0 0 0 1 0
+* 5p - 22 1346 435 355 1382 3518 7 4 3 1 3 8 0 1 1 6 0 1 0 3 0 0 0 0 0 0 0
+* 5p - 23 1342 394 331 1449 3516 4 3 5 4 2 6 1 1 0 10 0 3 0 0 2 0 0 2 0 0 0
+* 5p - 24 1451 381 328 1358 3518 6 8 11 0 1 12 1 0 1 8 0 1 1 1 0 0 0 0 0 0 0
+* 5p - 25 1405 413 322 1377 3517 2 7 13 0 2 5 1 3 0 5 0 2 0 0 0 0 0 1 0 0 0
+* 5p - 26 1147 396 322 1254 3119 9 1 4 2 1 3 0 0 0 11 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 27 1082 348 285 1101 2816 6 0 0 5 3 6 0 1 0 2 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 28 959 368 270 1000 2597 3 2 4 3 0 1 0 2 1 2 0 1 0 0 0 0 1 0 0 0 0
+* 5p - 29 849 329 288 943 2409 0 1 2 0 3 0 0 1 2 5 0 1 0 0 0 0 1 0 0 0 0
+* 5p - 30 802 295 285 861 2243 3 0 3 0 0 1 0 0 0 6 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 31 729 333 274 788 2124 4 0 7 0 1 3 0 0 1 4 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 32 759 317 213 721 2010 1 1 3 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 33 634 301 227 737 1899 3 2 2 2 0 3 0 0 0 4 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 34 616 329 200 637 1782 2 2 1 0 0 0 0 0 1 5 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 35 601 297 178 612 1688 0 2 6 0 0 1 0 0 0 3 0 2 0 0 0 0 0 0 0 2 0
+* 5p - 36 570 290 195 557 1612 3 3 4 0 1 3 0 2 0 1 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 37 523 248 222 547 1540 1 3 2 2 0 1 0 0 3 3 0 0 0 0 0 0 0 1 0 0 0
+* 5p - 38 519 260 193 515 1487 2 2 1 6 2 0 0 2 0 2 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 39 474 221 200 518 1413 3 0 2 3 0 2 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 40 421 250 225 455 1351 3 5 1 2 2 1 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
+* 5p - 41 402 268 215 408 1293 2 0 1 2 0 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0
+* 5p - 42 391 223 188 438 1240 4 3 1 1 0 0 0 1 1 0 0 5 0 1 0 0 0 0 0 0 0
+* 5p - 43 366 218 211 395 1190 3 4 2 1 1 1 0 0 0 2 2 0 0 0 0 0 0 0 0 0 0
+* 5p - 44 355 231 181 372 1139 4 3 2 0 0 2 3 1 1 1 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 45 368 196 189 352 1105 2 3 1 1 3 1 0 2 2 4 0 4 0 0 0 0 2 0 0 0 0
+* 5p - 46 307 213 220 326 1066 3 3 1 0 1 2 0 0 0 4 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 47 363 218 154 293 1028 1 6 0 2 0 0 0 1 1 1 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 48 321 221 131 322 995 0 3 2 0 0 0 0 0 1 2 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 49 286 219 168 280 953 1 6 0 2 0 1 0 2 2 1 0 4 0 0 0 0 0 0 0 0 0
+* 5p - 50 271 188 163 294 916 2 0 0 0 0 0 0 0 2 1 0 0 0 0 0 0 1 0 0 0 0
+* 5p - 51 264 195 158 267 884 2 4 1 1 1 2 0 1 2 5 0 5 0 0 0 0 0 0 0 0 0
+* 5p - 52 222 205 154 271 852 2 0 0 0 2 0 0 3 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 53 204 192 180 256 832 1 2 3 0 1 1 1 0 0 0 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 54 246 148 148 258 800 0 0 1 0 0 1 0 2 0 4 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 55 248 181 122 214 765 5 0 3 2 1 0 0 0 0 1 0 2 0 0 0 0 1 0 0 0 0
+* 5p - 56 202 163 166 208 739 2 0 0 2 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0 0
+* 5p - 57 215 140 144 209 708 1 2 0 2 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 58 210 138 139 202 689 3 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 59 199 145 119 193 656 0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 60 166 154 112 199 631 2 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0
+* 5p - 61 158 157 125 171 611 1 0 0 2 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 62 192 152 98 149 591 2 0 0 3 3 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 63 165 104 126 179 574 1 0 2 1 1 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 64 163 125 116 146 550 1 0 0 3 0 0 0 2 2 1 1 0 0 0 0 0 0 0 0 0 0
+* 5p - 65 149 138 81 162 530 0 0 0 1 1 1 1 1 2 2 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 66 159 136 78 144 517 2 0 1 3 0 0 0 2 0 3 0 1 0 0 1 0 1 0 0 0 0
+* 5p - 67 135 125 96 145 501 2 0 1 1 0 0 0 2 2 2 0 0 0 0 0 0 1 0 0 0 0
+* 5p - 68 125 122 75 163 485 3 0 0 1 1 0 0 1 3 7 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 69 134 106 75 153 468 0 1 2 1 3 0 0 1 1 10 0 1 0 0 0 0 1 0 0 0 0
+* 5p - 70 138 93 77 140 448 1 0 2 1 2 3 2 3 2 5 0 1 0 0 0 0 1 0 0 1 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.coverage b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.coverage
new file mode 100644
index 0000000..5d429da
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.coverage
@@ -0,0 +1,31 @@
+# Timestamp: 2013-10-24T16:43:17.626555
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# Hits: Sum of SE, PE_1, and PE_2 hits
+# SE, PE_1, PE_2: Number of Single Ended, and Pair Ended (mate 1 and 2) hits overlapping
+# the current contig or intervals. Note that a hit may be counted multiple
+# times if it overlaps multiple intervals
+# Collapsed: Number of hits for PE pair collapsed into a single read
+# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases relative to references
+# Coverage: Average number of bases covering each position in the contig(s)/intervals(s).
+Name Sample Library Contig Size Hits SE PE_1 PE_2 Collapsed M I D Coverage
+Pi1845A * * * 228543505 5459297 4986626 14113 48583 409975 349517526 72729 76816 1.52932600732
+Pi1845A * * <Genome> 228543505 5459297 4986626 14113 48583 409975 349517526 72729 76816 1.52932600732
+#
+#
+Pi1845A Pi1845A * * 228543505 5459297 4986626 14113 48583 409975 349517526 72729 76816 1.52932600732
+Pi1845A Pi1845A * <Genome> 228543505 5459297 4986626 14113 48583 409975 349517526 72729 76816 1.52932600732
+#
+Pi1845A Pi1845A Pi1845A_id_CATAGA * 228543505 4688387 4688387 0 0 0 304259452 47708 55909 1.33129774132
+Pi1845A Pi1845A Pi1845A_id_CATAGA <Genome> 228543505 4688387 4688387 0 0 0 304259452 47708 55909 1.33129774132
+#
+Pi1845A Pi1845A Pi1845A_id_CGCTAT * 228543505 770910 298239 14113 48583 409975 45258074 25021 20907 0.198028265997
+Pi1845A Pi1845A Pi1845A_id_CGCTAT <Genome> 228543505 770910 298239 14113 48583 409975 45258074 25021 20907 0.198028265997
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.depths b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.depths
new file mode 100644
index 0000000..5ceb8f2
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.depths
@@ -0,0 +1,29 @@
+# Timestamp: 2013-10-24T16:48:35.941875
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#
+Name Sample Library Contig Size MaxDepth MD_001 MD_002 MD_003 MD_004 MD_005 MD_006 MD_007 MD_008 MD_009 MD_010 MD_011 MD_012 MD_013 MD_014 MD_015 MD_016 MD_017 MD_018 MD_019 MD_020 MD_021 MD_022 MD_023 MD_024 MD_025 MD_026 MD_027 MD_028 MD_029 MD_030 MD_031 MD_032 MD_033 MD_034 MD_035 MD_036 MD_037 MD_038 MD_039 MD_040 MD_041 MD_042 [...]
+Pi1845A * * * 0 17 0.3404 0.2671 0.2156 0.1731 0.1370 0.1063 0.0808 0.0601 0.0437 0.0312 0.0219 0.0151 0.0103 0.0070 0.0047 0.0032 0.0022 0.0015 0.0011 0.0008 0.0006 0.0005 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+Pi1845A * * <Genome> 0 17 0.3404 0.2671 0.2156 0.1731 0.1370 0.1063 0.0808 0.0601 0.0437 0.0312 0.0219 0.0151 0.0103 0.0070 0.0047 0.0032 0.0022 0.0015 0.0011 0.0008 0.0006 0.0005 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+#
+#
+Pi1845A Pi1845A * * 0 17 0.3404 0.2671 0.2156 0.1731 0.1370 0.1063 0.0808 0.0601 0.0437 0.0312 0.0219 0.0151 0.0103 0.0070 0.0047 0.0032 0.0022 0.0015 0.0011 0.0008 0.0006 0.0005 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+Pi1845A Pi1845A * <Genome> 0 17 0.3404 0.2671 0.2156 0.1731 0.1370 0.1063 0.0808 0.0601 0.0437 0.0312 0.0219 0.0151 0.0103 0.0070 0.0047 0.0032 0.0022 0.0015 0.0011 0.0008 0.0006 0.0005 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+#
+Pi1845A Pi1845A Pi1845A_id_CATAGA * 0 15 0.3306 0.2556 0.2010 0.1556 0.1175 0.0861 0.0612 0.0421 0.0282 0.0184 0.0117 0.0073 0.0046 0.0029 0.0018 0.0012 0.0008 0.0006 0.0005 0.0004 0.0003 0.0003 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0000 0.0000 0.0000 0.0000 [...]
+Pi1845A Pi1845A Pi1845A_id_CATAGA <Genome> 0 15 0.3306 0.2556 0.2010 0.1556 0.1175 0.0861 0.0612 0.0421 0.0282 0.0184 0.0117 0.0073 0.0046 0.0029 0.0018 0.0012 0.0008 0.0006 0.0005 0.0004 0.0003 0.0003 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0000 0.0000 0.0000 0.0000 [...]
+#
+Pi1845A Pi1845A Pi1845A_id_CGCTAT * 0 8 0.0900 0.0619 0.0248 0.0112 0.0052 0.0023 0.0010 0.0005 0.0002 0.0001 0.0001 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 [...]
+Pi1845A Pi1845A Pi1845A_id_CGCTAT <Genome> 0 8 0.0900 0.0619 0.0248 0.0112 0.0052 0.0023 0.0010 0.0005 0.0002 0.0001 0.0001 0.0001 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 0.0000 [...]
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/3pGtoA_freq.txt
new file mode 100644
index 0000000..e37ab03
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0319684159038502
+2 0.030530606083478
+3 0.0194565217391304
+4 0.0149370232393117
+5 0.013462164802953
+6 0.0113050522904699
+7 0.00944181167022393
+8 0.00878678805843743
+9 0.00825143024790964
+10 0.0076158466819222
+11 0.00690114201639459
+12 0.00773372845217014
+13 0.00799492832740464
+14 0.00664603699275617
+15 0.00609581181870339
+16 0.00644306833231915
+17 0.00702014322503263
+18 0.00668618309299216
+19 0.00646008589784546
+20 0.00567021201662323
+21 0.00598759712025091
+22 0.00616023655308364
+23 0.00554191511750263
+24 0.00518337061099869
+25 0.00524432160447893
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/5pCtoT_freq.txt
new file mode 100644
index 0000000..c4494c2
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.0319610149554697
+2 0.0288547569340889
+3 0.017501640778823
+4 0.0166719094054734
+5 0.0137360687158373
+6 0.011966895317626
+7 0.00985519621446615
+8 0.00861267957972983
+9 0.00907401333770635
+10 0.00761510387422796
+11 0.00777457257440372
+12 0.00847826086956522
+13 0.00695957820738137
+14 0.0069108987694369
+15 0.00763908907466129
+16 0.00649878147847279
+17 0.00660344221988058
+18 0.00639344851118853
+19 0.00654408365046058
+20 0.00582612336031259
+21 0.0073097319764942
+22 0.00668627039137436
+23 0.00568063009688437
+24 0.00634852699907269
+25 0.00630803033524701
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..6bad5cd
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Length_plot.pdf
new file mode 100644
index 0000000..77e4c2e
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Runtime_log.txt
new file mode 100644
index 0000000..9a7dd98
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-24 15:39:23,333 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1845A_id_CATAGA' -i - -d /home/mischu/scratch/bam_pipeline/f608c1e6-17da-4932-b988-df347ec518b0 -r 000_prefixes/Pi_nucl.fasta --downsample 100000
+2013-10-24 15:40:08,973 DEBUG main: BAM read in 48.459430 seconds
+2013-10-24 15:40:09,853 INFO main: Successful run
+2013-10-24 15:40:09,854 DEBUG main: Run completed in 49.340715 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..a3b749a
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.858363468043394,0.536805828211302
+"2",2,0.812081835165555,0.476392271735615
+"3",3,0.755519496480026,0.426415916810969
+"4",4,0.692842453587123,0.373372583016875
+"5",5,0.626098855939121,0.327367843920009
+"6",6,0.55582434878456,0.298058832553788
+"7",7,0.486280933839937,0.279642475490063
+"8",8,0.428131088424831,0.257096459510594
+"9",9,0.385104022944255,0.230419964421403
+"10",10,0.352364239101824,0.207956595828764
+"11",11,0.328538672872695,0.189250663453081
+"12",12,0.313750686142648,0.171046895962565
+"13",-12,0.175976130402216,0.310138918218144
+"14",-11,0.192999591007824,0.325749212511926
+"15",-10,0.209562787350563,0.351079130034619
+"16",-9,0.223024947956386,0.389507467919079
+"17",-8,0.238928441403877,0.43818051414926
+"18",-7,0.259118332096079,0.495939324694388
+"19",-6,0.28265377974805,0.561471622073828
+"20",-5,0.314539233558655,0.629643500952421
+"21",-4,0.351601117786816,0.697529089214748
+"22",-3,0.383174596791868,0.762357114736984
+"23",-2,0.412545087867264,0.818920628146721
+"24",-1,0.449523709572697,0.864739540116389
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..26d5272
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..fa50375
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0176069843775655,0.00235139735300914,0.0878896967853244,0.296676783863304,0.474867717423335,-5093.9170804141
+"Std.",0.000227067079965104,0.000339544541505685,0.00275178147070993,0.0117130516060914,0.00766398224871813,1.56492340204832
+"Acceptance ratio",0.26114,0.19374,0.1986,0.15906,0.18356,0.66956
+"0%",0.0167345313107573,0.00102420556627437,0.0774144881337502,0.251702910789381,0.449110152725067,-5106.55219967874
+"2.5%",0.0171619604347919,0.00164969043119911,0.0825683166577122,0.273880960542525,0.460061090341875,-5097.79546463438
+"5%",0.0172328223823462,0.00177281387120728,0.0834517153589275,0.277511783962876,0.462508707297191,-5096.89507427876
+"7.5%",0.0172842087256627,0.00185350324849776,0.0839927997792689,0.279588290901854,0.464044957059467,-5096.37706965011
+"10%",0.0173158030479297,0.00190743850942571,0.0843601266113926,0.281522824402029,0.465180864538734,-5096.02677550001
+"12.5%",0.0173458553038652,0.00195334464955561,0.0847422673869833,0.282920439584945,0.466169438871407,-5095.71880603032
+"15%",0.0173709667625123,0.00199892371521307,0.0850421572329836,0.284380003355435,0.46697509355143,-5095.45884771958
+"17.5%",0.0173935654026147,0.00203600217141516,0.0852934323762461,0.285501577917916,0.467694211254173,-5095.24001787641
+"20%",0.0174149163812505,0.00207276511012261,0.0855577195486027,0.286761928273496,0.468433145296931,-5095.03144478598
+"22.5%",0.0174344650736615,0.00210638849970845,0.0857820911717155,0.287784393458378,0.468955855258768,-5094.85738323919
+"25%",0.0174529458945399,0.00213352194735953,0.0860194765488325,0.288848931608037,0.469582200167971,-5094.70812677603
+"27.5%",0.0174691512248804,0.00216428128572716,0.086242464772388,0.289801392550922,0.470131416534215,-5094.56744580941
+"30%",0.0174845698894279,0.0021893673715819,0.0864086658993715,0.290614428065072,0.470771463229017,-5094.43083786042
+"32.5%",0.0175015521404108,0.00221209322535398,0.0866255858018227,0.291457620893595,0.471386076740115,-5094.31045549635
+"35%",0.0175169472942416,0.00223491728621757,0.0867927601988026,0.292243875706537,0.471889768435041,-5094.19069842525
+"37.5%",0.0175326321892892,0.00225753420156304,0.0869454602750707,0.292902466210741,0.472350523409417,-5094.07627537489
+"40%",0.0175486240660267,0.00227664365176326,0.0871085888357103,0.29379306938455,0.472851293728451,-5093.97324310556
+"42.5%",0.017563289712117,0.00229888372869506,0.0872866615571048,0.294480473434693,0.473301620828878,-5093.87548033895
+"45%",0.017577869750938,0.00231853403957182,0.0874761503329861,0.295140105754977,0.473775114968223,-5093.77990128724
+"47.5%",0.0175921025161064,0.00233711232267054,0.0876389058189423,0.295950017110231,0.474309635627782,-5093.68429638464
+"50%",0.0176077204550503,0.00235657667046353,0.0878351578775717,0.2967030451107,0.474781190194523,-5093.6001679804
+"52.5%",0.0176225288821868,0.0023785648943985,0.0880097793805924,0.297577868285476,0.475227693205912,-5093.51227469824
+"55%",0.0176363834695044,0.00240021319200171,0.0881840652575716,0.298276655997764,0.475707171570351,-5093.42398254033
+"57.5%",0.0176516066873216,0.00241778491589933,0.0883831635844075,0.298990142711596,0.476115007494677,-5093.33642092691
+"60%",0.0176659056228047,0.00243749665607039,0.0885708556527009,0.29957661115566,0.476597438763165,-5093.25661082432
+"62.5%",0.0176821433618888,0.00246166826154538,0.0887362376275482,0.30026400425628,0.477114204301859,-5093.17549026845
+"65%",0.0176995406913664,0.00248159008317669,0.0889258871935799,0.301150993212104,0.477634727756382,-5093.09384279743
+"67.5%",0.0177146324605405,0.00250747216232732,0.0891324333098413,0.301983301054059,0.478193209149032,-5093.01606297621
+"70%",0.0177280522387879,0.00253024108566189,0.0893516688286586,0.302640832648195,0.478723299274607,-5092.9364402797
+"72.5%",0.0177466522010596,0.00255203580687264,0.0895563992965914,0.303690644935786,0.47932329856428,-5092.8549218301
+"75%",0.0177630149595609,0.00257891524188314,0.0897776364626582,0.304521253715916,0.479965928152191,-5092.76875986779
+"77.5%",0.017781482671515,0.00260644649104885,0.0899742365239343,0.305584762246086,0.480647104184841,-5092.68824097671
+"80%",0.0177989598192868,0.00263881363971679,0.0902466858457065,0.306801349520659,0.481316329060981,-5092.60658045786
+"82.5%",0.0178192443000559,0.00267286929444296,0.0904918237844304,0.307757691121337,0.482017517466379,-5092.52549280321
+"85%",0.0178429472859676,0.00270661895898183,0.0907639027599105,0.308726025724075,0.482885656298225,-5092.42914804343
+"87.5%",0.0178652938756097,0.00274616613420397,0.091091707565661,0.310302675528649,0.483808436953886,-5092.33846992759
+"90%",0.0178990901876382,0.00278139119450408,0.0914614783059502,0.311628587551786,0.484812245286338,-5092.24760184729
+"92.5%",0.0179367083155157,0.00283097916172655,0.0919290651474338,0.313638361395002,0.486029721699754,-5092.13556161618
+"95%",0.0179786391276352,0.00289231010450458,0.0925332231672703,0.315758312610741,0.487619250882804,-5092.00618346186
+"97.5%",0.0180439450407181,0.00299983244399644,0.0933409650959394,0.319435155541676,0.490203954444395,-5091.85516929001
+"100%",0.0184511883838814,0.00352122736948409,0.0998974265079864,0.337694979297343,0.505547033954819,-5091.50108256668
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..ce2419a
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..6b394d2
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/dnacomp.txt
new file mode 100644
index 0000000..0967175
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 4408 5090 4860 4803 19161
+* 3p + -69 4524 5428 5178 4959 20089
+* 3p + -68 4778 5616 5420 5132 20946
+* 3p + -67 5084 5741 5642 5458 21925
+* 3p + -66 5192 6229 5831 5606 22858
+* 3p + -65 5380 6411 6100 5872 23763
+* 3p + -64 5597 6603 6469 6192 24861
+* 3p + -63 5990 7008 6640 6263 25901
+* 3p + -62 6135 7203 7095 6475 26908
+* 3p + -61 6481 7592 7036 6864 27973
+* 3p + -60 6562 7785 7580 7099 29026
+* 3p + -59 6852 8148 7873 7232 30105
+* 3p + -58 7026 8314 8296 7548 31184
+* 3p + -57 7399 8647 8428 7866 32340
+* 3p + -56 7657 8969 8823 8003 33452
+* 3p + -55 7865 9481 9059 8160 34565
+* 3p + -54 8144 9656 9336 8525 35661
+* 3p + -53 8418 9885 9802 8613 36718
+* 3p + -52 8568 10187 10124 8912 37791
+* 3p + -51 8624 10753 10284 9165 38826
+* 3p + -50 8929 11057 10632 9178 39796
+* 3p + -49 9352 11119 10680 9549 40700
+* 3p + -48 9466 11499 10852 9753 41570
+* 3p + -47 9308 11784 11548 9829 42469
+* 3p + -46 9895 11903 11581 9986 43365
+* 3p + -45 9861 12310 11539 10418 44128
+* 3p + -44 9794 12467 12287 10293 44841
+* 3p + -43 10316 12455 12347 10386 45504
+* 3p + -42 10220 12840 12321 10725 46106
+* 3p + -41 10231 13032 13040 10338 46641
+* 3p + -40 10692 12953 12928 10575 47148
+* 3p + -39 10493 13190 13059 10826 47568
+* 3p + -38 10669 13285 13379 10626 47959
+* 3p + -37 10857 13296 13342 10819 48314
+* 3p + -36 10735 13542 13339 10996 48612
+* 3p + -35 10668 13679 13780 10739 48866
+* 3p + -34 10882 13635 13500 11076 49093
+* 3p + -33 10670 13791 13699 11109 49269
+* 3p + -32 10872 13715 14066 10800 49453
+* 3p + -31 11052 13844 13805 10890 49591
+* 3p + -30 10954 14200 13704 10844 49702
+* 3p + -29 10899 13883 14318 10710 49810
+* 3p + -28 11151 13969 13994 10795 49909
+* 3p + -27 10878 14193 13622 11311 50004
+* 3p + -26 11082 14118 14181 10716 50097
+* 3p + -25 11176 13816 14121 11143 50256
+* 3p + -24 10982 14047 14141 11085 50255
+* 3p + -23 11097 13886 14397 10877 50257
+* 3p + -22 11244 13834 14120 11060 50258
+* 3p + -21 10960 14460 13893 10944 50257
+* 3p + -20 11090 14069 14181 10918 50258
+* 3p + -19 11411 13679 14147 11017 50254
+* 3p + -18 11219 14060 13742 11237 50258
+* 3p + -17 11379 13914 14161 10803 50257
+* 3p + -16 11385 13943 13836 11092 50256
+* 3p + -15 11208 14013 14015 11021 50257
+* 3p + -14 11451 13806 14191 10809 50257
+* 3p + -13 11638 13488 14164 10966 50256
+* 3p + -12 11339 13883 13758 11276 50256
+* 3p + -11 11315 13710 14286 10946 50257
+* 3p + -10 11636 13614 14076 10932 50258
+* 3p + -9 11567 13978 13505 11207 50257
+* 3p + -8 11555 13480 14210 11013 50258
+* 3p + -7 11858 13288 13851 11261 50258
+* 3p + -6 11827 13540 13609 11282 50258
+* 3p + -5 11878 13574 13817 10989 50258
+* 3p + -4 12166 12756 14112 11223 50257
+* 3p + -3 12132 13344 13760 11022 50258
+* 3p + -2 13010 13902 12907 10439 50258
+* 3p + -1 13722 11918 14090 10528 50258
+* 3p + 1 4321 20107 5619 20204 50251
+* 3p + 2 10634 12952 14943 11722 50251
+* 3p + 3 11910 12894 14018 11427 50249
+* 3p + 4 11926 13400 13146 11776 50248
+* 3p + 5 11777 13459 13530 11482 50248
+* 3p + 6 12046 13112 13270 11820 50248
+* 3p + 7 11799 13353 13264 11831 50247
+* 3p + 8 12302 13095 13424 11423 50244
+* 3p + 9 12341 12933 13139 11829 50242
+* 3p + 10 11818 13298 13389 11735 50240
+* 3p - -70 4444 5099 4855 4678 19076
+* 3p - -69 4576 5359 5091 4950 19976
+* 3p - -68 4890 5588 5324 5074 20876
+* 3p - -67 5153 5837 5548 5277 21815
+* 3p - -66 5287 6117 5801 5558 22763
+* 3p - -65 5509 6318 6061 5827 23715
+* 3p - -64 5754 6559 6376 6047 24736
+* 3p - -63 5842 6873 6608 6409 25732
+* 3p - -62 6103 7158 6991 6459 26711
+* 3p - -61 6408 7560 7087 6693 27748
+* 3p - -60 6566 7718 7371 7189 28844
+* 3p - -59 6685 8073 7906 7225 29889
+* 3p - -58 7071 8363 8025 7528 30987
+* 3p - -57 7294 8687 8287 7775 32043
+* 3p - -56 7519 8951 8773 7849 33092
+* 3p - -55 7698 9148 9021 8261 34128
+* 3p - -54 7968 9494 9151 8534 35147
+* 3p - -53 8164 9845 9656 8564 36229
+* 3p - -52 8458 10093 9943 8751 37245
+* 3p - -51 8528 10559 10082 9044 38213
+* 3p - -50 8865 10799 10496 9016 39176
+* 3p - -49 9129 10836 10697 9477 40139
+* 3p - -48 8970 11274 11041 9776 41061
+* 3p - -47 9321 11492 11526 9559 41898
+* 3p - -46 9788 11441 11579 9880 42688
+* 3p - -45 9699 12053 11665 10097 43514
+* 3p - -44 9861 12102 12113 10106 44182
+* 3p - -43 10144 12194 12398 10129 44865
+* 3p - -42 10204 12610 12253 10394 45461
+* 3p - -41 10124 12704 12736 10435 45999
+* 3p - -40 10391 12889 12786 10469 46535
+* 3p - -39 10436 13120 12779 10643 46978
+* 3p - -38 10593 13106 13082 10575 47356
+* 3p - -37 10698 13167 13252 10586 47703
+* 3p - -36 10579 13636 13006 10830 48051
+* 3p - -35 10505 13577 13565 10676 48323
+* 3p - -34 10774 13361 13647 10744 48526
+* 3p - -33 10679 13867 13311 10879 48736
+* 3p - -32 10657 13805 13775 10668 48905
+* 3p - -31 10893 13663 13645 10848 49049
+* 3p - -30 10656 13921 13789 10792 49158
+* 3p - -29 10754 13879 13936 10711 49280
+* 3p - -28 10824 13671 13844 11060 49399
+* 3p - -27 10773 13972 13869 10875 49489
+* 3p - -26 10932 13975 14077 10634 49618
+* 3p - -25 11075 13726 14035 10906 49742
+* 3p - -24 10928 13958 13966 10890 49742
+* 3p - -23 11011 13911 14029 10788 49739
+* 3p - -22 10961 13645 14251 10884 49741
+* 3p - -21 10863 14031 14076 10772 49742
+* 3p - -20 11123 13682 14137 10799 49741
+* 3p - -19 11059 13732 13930 11021 49742
+* 3p - -18 11085 14096 13815 10746 49742
+* 3p - -17 11139 13681 14080 10842 49742
+* 3p - -16 11434 13537 14018 10753 49742
+* 3p - -15 11337 13675 13816 10914 49742
+* 3p - -14 11360 13419 14137 10826 49742
+* 3p - -13 11246 13389 14121 10986 49742
+* 3p - -12 11180 13748 13796 11018 49742
+* 3p - -11 11323 13617 14142 10660 49742
+* 3p - -10 11524 13460 13781 10974 49739
+* 3p - -9 11377 13566 13633 11166 49742
+* 3p - -8 11243 13467 13994 11036 49740
+* 3p - -7 11666 13333 13631 11111 49741
+* 3p - -6 11665 13336 13372 11367 49740
+* 3p - -5 11924 13559 13586 10672 49741
+* 3p - -4 12193 12641 13945 10963 49742
+* 3p - -3 11960 13151 13640 10991 49742
+* 3p - -2 12713 13666 13237 10126 49742
+* 3p - -1 13516 11884 14088 10254 49742
+* 3p - 1 4284 19571 5604 20277 49736
+* 3p - 2 10528 12996 14672 11538 49734
+* 3p - 3 11682 12953 14021 11076 49732
+* 3p - 4 11749 13459 12868 11655 49731
+* 3p - 5 11694 13328 13248 11459 49729
+* 3p - 6 11809 12891 13513 11515 49728
+* 3p - 7 11889 13183 13136 11520 49728
+* 3p - 8 11936 12939 13385 11468 49728
+* 3p - 9 12311 12861 12946 11610 49728
+* 3p - 10 11793 13163 13139 11630 49725
+* 5p + -10 11882 13027 13516 11827 50252
+* 5p + -9 11795 13351 12901 12205 50252
+* 5p + -8 11544 13676 13126 11906 50252
+* 5p + -7 11668 13118 13574 11892 50252
+* 5p + -6 11600 13652 12878 12122 50252
+* 5p + -5 11693 13514 13315 11730 50252
+* 5p + -4 11888 12872 13633 11861 50254
+* 5p + -3 11195 14173 13014 11873 50255
+* 5p + -2 11754 14891 13206 10404 50255
+* 5p + -1 21391 4661 21200 3003 50255
+* 5p + 1 10332 14610 11332 13984 50258
+* 5p + 2 10380 13197 13927 12746 50250
+* 5p + 3 10830 13598 13294 12108 49830
+* 5p + 4 11085 14245 12810 12117 50257
+* 5p + 5 11029 14017 13232 11980 50258
+* 5p + 6 11399 13485 13451 11922 50257
+* 5p + 7 11260 13968 13414 11616 50258
+* 5p + 8 11177 14064 13513 11504 50258
+* 5p + 9 11202 13660 13717 11679 50258
+* 5p + 10 11106 14250 13271 11630 50257
+* 5p + 11 10949 14313 13588 11408 50258
+* 5p + 12 11166 13693 13997 11402 50258
+* 5p + 13 11007 14329 13487 11435 50258
+* 5p + 14 10849 14114 13772 11523 50258
+* 5p + 15 10987 13964 13829 11478 50258
+* 5p + 16 11043 14229 13678 11308 50258
+* 5p + 17 10832 14214 13905 11307 50258
+* 5p + 18 11136 13986 13967 11169 50258
+* 5p + 19 11021 13944 14023 11270 50258
+* 5p + 20 11050 14376 13746 11086 50258
+* 5p + 21 11077 13949 14019 11213 50258
+* 5p + 22 10767 14291 13789 11411 50258
+* 5p + 23 10779 14394 14003 11082 50258
+* 5p + 24 11099 13925 14102 11132 50258
+* 5p + 25 11031 14131 13777 11319 50258
+* 5p + 26 10737 14195 14037 11131 50100
+* 5p + 27 10864 14074 14126 10944 50008
+* 5p + 28 11002 14026 13753 11131 49912
+* 5p + 29 10703 14179 14061 10870 49813
+* 5p + 30 11149 13867 13838 10852 49706
+* 5p + 31 10835 14004 13798 10960 49597
+* 5p + 32 10716 14147 13887 10706 49456
+* 5p + 33 10951 13713 13868 10741 49273
+* 5p + 34 10811 13831 13490 10963 49095
+* 5p + 35 10724 13800 13700 10652 48876
+* 5p + 36 10893 13545 13450 10721 48609
+* 5p + 37 10728 13565 13339 10679 48311
+* 5p + 38 10799 13417 13163 10564 47943
+* 5p + 39 10681 13019 13399 10460 47559
+* 5p + 40 10728 13053 12923 10448 47152
+* 5p + 41 10563 12772 12982 10328 46645
+* 5p + 42 10420 12550 12849 10297 46116
+* 5p + 43 10385 12496 12243 10391 45515
+* 5p + 44 10025 12437 12343 10043 44848
+* 5p + 45 10412 11783 12150 9790 44135
+* 5p + 46 10070 11577 11883 9844 43374
+* 5p + 47 9857 11639 11454 9527 42477
+* 5p + 48 9671 11130 11508 9269 41578
+* 5p + 49 9556 10792 11045 9314 40707
+* 5p + 50 9262 10698 10823 9019 39802
+* 5p + 51 9229 10208 10719 8675 38831
+* 5p + 52 8867 10121 10160 8650 37798
+* 5p + 53 8648 9769 10020 8290 36727
+* 5p + 54 8557 9328 9718 8063 35666
+* 5p + 55 8339 9056 9221 7956 34572
+* 5p + 56 7952 8759 9153 7599 33463
+* 5p + 57 7877 8400 8622 7451 32350
+* 5p + 58 7513 8146 8316 7216 31191
+* 5p + 59 7290 7950 8115 6762 30117
+* 5p + 60 7071 7398 7917 6657 29043
+* 5p + 61 6777 7417 7345 6439 27978
+* 5p + 62 6521 7111 7054 6230 26916
+* 5p + 63 6378 6647 7018 5863 25906
+* 5p + 64 6192 6408 6536 5734 24870
+* 5p + 65 5802 6280 6279 5409 23770
+* 5p + 66 5575 5860 6121 5308 22864
+* 5p + 67 5441 5564 5792 5136 21933
+* 5p + 68 5067 5583 5471 4833 20954
+* 5p + 69 4968 5089 5431 4605 20093
+* 5p + 70 4717 4913 5058 4480 19168
+* 5p - -10 11694 13028 13272 11732 49726
+* 5p - -9 11657 13123 12822 12126 49728
+* 5p - -8 11555 13465 12920 11788 49728
+* 5p - -7 11652 12843 13392 11844 49731
+* 5p - -6 11687 13144 12751 12150 49732
+* 5p - -5 11539 13508 13120 11565 49732
+* 5p - -4 11709 12908 13474 11641 49732
+* 5p - -3 11172 14142 12721 11698 49733
+* 5p - -2 11801 14774 12937 10222 49734
+* 5p - -1 21392 4591 20918 2833 49734
+* 5p - 1 10205 14488 11131 13918 49742
+* 5p - 2 10308 13015 13717 12700 49740
+* 5p - 3 10601 13405 13194 12076 49276
+* 5p - 4 11057 13994 12582 12107 49740
+* 5p - 5 10948 13894 13182 11718 49742
+* 5p - 6 11581 13110 13360 11691 49742
+* 5p - 7 11097 13963 13155 11527 49742
+* 5p - 8 11150 13764 13469 11359 49742
+* 5p - 9 11174 13629 13616 11323 49742
+* 5p - 10 10863 14126 13231 11522 49742
+* 5p - 11 10809 13973 13785 11175 49742
+* 5p - 12 11163 13761 13593 11224 49741
+* 5p - 13 10916 13990 13223 11613 49742
+* 5p - 14 10704 14128 13737 11173 49742
+* 5p - 15 10991 13660 13800 11291 49742
+* 5p - 16 10816 13987 13379 11560 49742
+* 5p - 17 10669 14152 13747 11174 49742
+* 5p - 18 11051 13788 13790 11113 49742
+* 5p - 19 10778 14083 13642 11238 49741
+* 5p - 20 10774 14205 13885 10878 49742
+* 5p - 21 10957 13845 14152 10788 49742
+* 5p - 22 10725 14143 13685 11189 49742
+* 5p - 23 10555 14224 13856 11107 49742
+* 5p - 24 10818 13995 13875 11054 49742
+* 5p - 25 10855 14012 13653 11222 49742
+* 5p - 26 10621 14188 13920 10893 49622
+* 5p - 27 11190 13566 13906 10829 49491
+* 5p - 28 10788 14075 13626 10910 49399
+* 5p - 29 10794 13899 13827 10762 49282
+* 5p - 30 10787 13708 14066 10598 49159
+* 5p - 31 10770 13844 13651 10789 49054
+* 5p - 32 10651 13795 13778 10684 48908
+* 5p - 33 11018 13387 13646 10689 48740
+* 5p - 34 10744 13560 13300 10927 48531
+* 5p - 35 10366 13745 13458 10760 48329
+* 5p - 36 10838 13420 13451 10344 48053
+* 5p - 37 10766 13381 12931 10624 47702
+* 5p - 38 10364 13227 13139 10621 47351
+* 5p - 39 10719 12710 13137 10412 46978
+* 5p - 40 10636 12774 12671 10464 46545
+* 5p - 41 10332 12615 12916 10143 46006
+* 5p - 42 10564 12300 12699 9905 45468
+* 5p - 43 10412 12236 12105 10118 44871
+* 5p - 44 10063 12132 12233 9758 44186
+* 5p - 45 10107 11543 11999 9875 43524
+* 5p - 46 9755 11668 11612 9662 42697
+* 5p - 47 9606 11455 11562 9283 41906
+* 5p - 48 9517 10903 11322 9326 41068
+* 5p - 49 9494 10624 10841 9187 40146
+* 5p - 50 9084 10614 10681 8803 39182
+* 5p - 51 9095 9928 10637 8560 38220
+* 5p - 52 8953 9924 9955 8425 37257
+* 5p - 53 8590 9614 9770 8261 36235
+* 5p - 54 8281 9200 9636 8040 35157
+* 5p - 55 8070 9139 9100 7831 34140
+* 5p - 56 7803 8865 8941 7490 33099
+* 5p - 57 7760 8278 8712 7299 32049
+* 5p - 58 7411 8083 8227 7277 30998
+* 5p - 59 7162 7866 8106 6760 29894
+* 5p - 60 7015 7404 7811 6623 28853
+* 5p - 61 6678 7328 7392 6359 27757
+* 5p - 62 6314 7053 7240 6107 26714
+* 5p - 63 6300 6640 6999 5796 25735
+* 5p - 64 6110 6398 6438 5799 24745
+* 5p - 65 5656 6165 6377 5524 23722
+* 5p - 66 5523 5926 6038 5283 22770
+* 5p - 67 5308 5661 5760 5094 21823
+* 5p - 68 5096 5338 5610 4837 20881
+* 5p - 69 4903 5169 5333 4576 19981
+* 5p - 70 4718 4787 5148 4427 19080
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/dnacomp_genome.csv
new file mode 100644
index 0000000..2fc7659
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.245290724081,0.254598401178,0.255055853499,0.245055021242
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/lgdistribution.txt
new file mode 100644
index 0000000..f92b033
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/lgdistribution.txt
@@ -0,0 +1,155 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 24 27
++ 25 133
++ 26 94
++ 27 90
++ 28 99
++ 29 110
++ 30 107
++ 31 145
++ 32 179
++ 33 178
++ 34 217
++ 35 266
++ 36 299
++ 37 354
++ 38 388
++ 39 414
++ 40 516
++ 41 526
++ 42 601
++ 43 668
++ 44 714
++ 45 760
++ 46 891
++ 47 897
++ 48 872
++ 49 910
++ 50 973
++ 51 1034
++ 52 1075
++ 53 1056
++ 54 1089
++ 55 1115
++ 56 1109
++ 57 1161
++ 58 1070
++ 59 1078
++ 60 1062
++ 61 1062
++ 62 1017
++ 63 1042
++ 64 1085
++ 65 903
++ 66 939
++ 67 969
++ 68 861
++ 69 933
++ 70 901
++ 71 841
++ 72 813
++ 73 877
++ 74 775
++ 75 754
++ 76 697
++ 77 680
++ 78 638
++ 79 602
++ 80 617
++ 81 548
++ 82 568
++ 83 540
++ 84 529
++ 85 456
++ 86 395
++ 87 408
++ 88 378
++ 89 234
++ 90 263
++ 91 372
++ 92 452
++ 93 642
++ 94 5115
++ 95 56
++ 96 8
++ 97 5
++ 98 4
++ 99 2
+- 23 1
+- 24 12
+- 25 108
+- 26 130
+- 27 95
+- 28 111
+- 29 126
+- 30 104
+- 31 148
+- 32 169
+- 33 207
+- 34 204
+- 35 271
+- 36 349
+- 37 346
+- 38 380
+- 39 437
+- 40 538
+- 41 540
+- 42 594
+- 43 684
+- 44 664
+- 45 822
+- 46 799
+- 47 839
+- 48 912
+- 49 967
+- 50 961
+- 51 970
+- 52 1019
+- 53 1083
+- 54 1019
+- 55 1035
+- 56 1047
+- 57 1055
+- 58 1101
+- 59 1037
+- 60 1101
+- 61 1044
+- 62 984
+- 63 981
+- 64 1026
+- 65 950
+- 66 947
+- 67 939
+- 68 909
+- 69 895
+- 70 838
+- 71 795
+- 72 788
+- 73 781
+- 74 722
+- 75 722
+- 76 696
+- 77 682
+- 78 660
+- 79 577
+- 80 587
+- 81 628
+- 82 548
+- 83 562
+- 84 549
+- 85 453
+- 86 469
+- 87 411
+- 88 420
+- 89 265
+- 90 259
+- 91 367
+- 92 473
+- 93 594
+- 94 5158
+- 95 65
+- 96 12
+- 98 1
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/misincorporation.txt
new file mode 100644
index 0000000..0fcf18a
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CATAGA/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 13148 12028 14335 10740 50251 478 57 118 86 59 29 114 143 95 155 58 38 0 0 0 0 0 0 0 0 0
+* 3p + 2 12538 14049 13101 10565 50253 405 59 108 56 24 22 77 120 79 100 26 28 0 0 0 1 0 0 0 0 0
+* 3p + 3 11813 13422 13863 11154 50252 288 44 86 56 28 24 64 69 72 98 16 25 0 0 1 0 1 1 0 0 0
+* 3p + 4 11889 12856 14182 11322 50249 210 54 73 65 23 19 52 95 51 80 14 23 0 0 0 0 2 1 0 1 0
+* 3p + 5 11610 13635 13927 11077 50249 188 46 57 62 10 21 13 85 29 78 10 14 0 0 0 0 5 0 1 1 0
+* 3p + 6 11696 13545 13700 11296 50237 163 42 63 47 18 13 12 24 12 27 5 16 1 1 0 0 11 1 3 5 0
+* 3p + 7 11775 13304 13915 11246 50240 119 54 48 51 8 18 9 24 17 8 8 18 2 2 4 2 5 2 5 5 0
+* 3p + 8 11469 13476 14269 11035 50249 115 50 40 55 15 8 11 22 28 12 14 11 4 4 2 2 3 3 0 2 0
+* 3p + 9 11489 13978 13584 11202 50253 119 51 47 44 10 12 5 12 16 11 12 13 1 1 1 4 1 1 1 1 0
+* 3p + 10 11565 13625 14135 10924 50249 111 61 45 51 21 12 13 20 19 14 15 17 3 1 4 1 4 2 2 1 0
+* 3p + 11 11243 13705 14346 10956 50250 104 44 42 55 9 12 14 17 11 17 9 15 0 1 2 0 0 2 1 4 0
+* 3p + 12 11260 13873 13829 11282 50244 113 45 44 58 13 17 10 21 9 19 16 17 1 1 1 2 1 3 4 4 0
+* 3p + 13 11572 13499 14217 10958 50246 114 48 52 46 8 17 15 16 16 10 9 12 1 1 1 3 3 3 3 1 0
+* 3p + 14 11381 13820 14247 10797 50245 94 44 45 39 13 11 14 20 15 16 7 20 2 4 2 1 3 4 3 2 0
+* 3p + 15 11183 14011 14031 11020 50245 84 52 50 53 24 14 15 13 15 8 6 10 2 3 1 2 4 3 3 2 0
+* 3p + 16 11337 13956 13872 11083 50248 84 55 47 37 10 6 9 18 14 10 16 9 1 1 1 5 1 4 1 2 0
+* 3p + 17 11327 13895 14220 10798 50240 107 45 47 57 11 16 4 7 16 13 6 12 2 0 1 5 3 6 7 1 0
+* 3p + 18 11163 14026 13802 11257 50248 93 54 35 62 18 8 5 10 8 14 9 11 2 1 1 4 5 0 3 2 0
+* 3p + 19 11355 13690 14202 10993 50240 94 54 45 44 16 15 5 18 13 14 9 16 2 0 3 5 8 2 3 1 0
+* 3p + 20 11058 14074 14226 10896 50254 81 50 47 40 15 12 5 14 12 6 7 17 1 2 3 0 1 2 1 0 0
+* 3p + 21 10906 14482 13962 10899 50249 92 75 38 44 14 14 11 12 6 11 9 15 3 3 1 5 2 2 1 3 0
+* 3p + 22 11209 13863 14145 11032 50249 96 69 56 41 10 9 10 12 13 5 11 16 3 3 3 3 1 2 2 4 0
+* 3p + 23 11064 13901 14439 10845 50249 75 58 51 49 9 15 7 20 5 9 15 11 1 1 6 3 2 3 1 2 0
+* 3p + 24 10926 14070 14178 11074 50248 78 56 53 34 20 6 8 14 9 15 12 16 2 4 8 2 2 0 0 5 0
+* 3p + 25 11131 13838 14178 11101 50248 94 66 42 43 11 10 11 21 8 16 15 19 3 2 3 5 2 3 2 1 0
+* 3p + 26 11062 14126 14213 10698 50099 66 57 43 45 13 13 11 10 16 14 14 18 1 1 2 1 0 2 0 2 0
+* 3p + 27 10850 14253 13636 11261 50000 69 88 44 38 10 14 13 15 18 12 11 18 0 4 2 1 1 1 2 1 0
+* 3p + 28 11112 13985 14033 10777 49907 73 68 41 50 16 9 5 19 15 10 10 16 1 0 0 4 1 0 3 1 0
+* 3p + 29 10862 13899 14371 10676 49808 73 66 39 35 17 15 7 16 10 12 14 16 2 1 3 1 2 0 0 3 0
+* 3p + 30 10941 14239 13710 10802 49692 60 69 45 44 15 14 14 16 17 12 12 20 3 1 5 2 2 6 1 1 0
+* 3p + 31 11005 13870 13855 10853 49583 68 68 29 37 11 12 9 12 12 3 11 13 1 1 1 5 4 3 1 1 0
+* 3p + 32 10871 13747 14086 10742 49446 74 87 54 43 17 12 6 17 16 13 14 13 3 3 4 4 0 6 0 1 0
+* 3p + 33 10634 13812 13741 11074 49261 61 78 35 48 11 11 10 10 16 13 19 14 2 1 3 6 3 4 1 2 0
+* 3p + 34 10833 13670 13534 11046 49083 55 88 40 47 7 10 10 14 7 13 14 16 2 3 4 6 2 3 2 5 0
+* 3p + 35 10675 13704 13792 10695 48866 56 69 42 44 10 12 10 10 10 10 14 16 5 2 3 3 3 0 1 0 0
+* 3p + 36 10692 13598 13394 10920 48604 74 91 44 33 13 14 8 12 9 17 11 23 0 3 4 1 0 3 5 2 0
+* 3p + 37 10845 13353 13352 10756 48306 65 91 45 34 11 14 9 12 7 10 10 12 4 3 6 2 1 2 2 2 0
+* 3p + 38 10640 13307 13407 10597 47951 59 82 35 54 27 10 9 21 17 14 9 20 1 4 3 4 4 2 3 3 0
+* 3p + 39 10510 13210 13079 10766 47565 69 92 53 40 24 12 6 13 15 7 16 12 3 2 3 2 1 0 3 2 0
+* 3p + 40 10670 12993 12952 10531 47146 60 89 46 42 5 10 6 13 17 10 18 21 2 1 4 1 2 0 2 3 0
+* 3p + 41 10199 13097 13065 10279 46640 57 99 38 45 14 9 13 16 17 8 12 16 3 1 4 5 2 2 0 0 0
+* 3p + 42 10200 12901 12343 10660 46104 58 93 41 33 11 14 10 16 13 10 7 16 1 4 1 4 3 0 1 1 0
+* 3p + 43 10280 12503 12367 10351 45501 52 84 35 47 9 13 6 21 9 15 7 16 2 3 3 3 3 0 3 0 0
+* 3p + 44 9775 12551 12306 10207 44839 46 114 38 29 8 12 10 11 8 15 11 16 1 1 4 5 4 2 2 1 0
+* 3p + 45 9858 12390 11560 10318 44126 52 118 40 39 16 10 11 11 13 11 9 15 3 3 1 3 0 3 1 2 0
+* 3p + 46 9843 11970 11609 9942 43364 62 97 27 47 8 11 6 14 5 6 5 14 1 1 2 2 4 0 4 1 0
+* 3p + 47 9314 11853 11565 9737 42469 50 114 36 40 7 11 8 13 11 10 12 12 1 2 2 1 1 7 2 2 0
+* 3p + 48 9441 11575 10888 9665 41569 56 120 35 35 13 13 2 15 11 6 6 16 1 2 1 2 3 2 3 3 0
+* 3p + 49 9334 11177 10700 9492 40703 48 95 36 44 18 10 11 13 8 9 9 13 2 3 1 1 0 1 3 2 0
+* 3p + 50 8912 11155 10634 9096 39797 45 112 34 27 10 8 18 14 9 15 6 20 1 2 1 3 1 3 2 2 0
+* 3p + 51 8606 10783 10315 9118 38822 56 75 36 36 11 9 7 13 10 10 9 16 0 3 3 1 4 3 1 2 0
+* 3p + 52 8555 10284 10140 8812 37791 35 105 27 35 12 12 9 20 11 5 9 14 0 2 3 2 2 4 2 0 0
+* 3p + 53 8423 9955 9815 8526 36719 46 107 41 28 11 12 7 7 11 6 8 12 2 0 4 3 1 2 0 3 0
+* 3p + 54 8150 9717 9338 8460 35665 41 98 38 25 14 13 2 9 9 10 7 14 2 2 2 5 2 0 1 1 0
+* 3p + 55 7852 9539 9070 8108 34569 44 84 30 32 8 16 8 14 7 6 8 12 1 1 2 2 2 1 0 3 0
+* 3p + 56 7639 9032 8852 7935 33458 37 89 19 25 10 10 9 8 13 16 7 5 1 0 1 0 1 1 0 0 0
+* 3p + 57 7391 8725 8434 7793 32343 41 100 34 32 6 12 7 13 16 5 8 11 1 1 1 0 1 2 0 1 0
+* 3p + 58 7026 8399 8297 7464 31186 49 108 37 29 9 11 5 9 8 5 8 5 0 1 1 0 1 0 1 1 0
+* 3p + 59 6823 8217 7908 7163 30111 39 91 27 23 9 5 7 12 8 8 6 12 0 0 0 1 0 0 1 1 0
+* 3p + 60 6558 7866 7588 7019 29031 39 89 26 13 7 13 8 11 7 9 5 9 2 1 1 2 2 2 0 0 0
+* 3p + 61 6479 7636 7036 6825 27976 26 88 21 34 7 9 5 7 11 7 7 10 0 0 1 1 1 0 1 1 0
+* 3p + 62 6143 7291 7104 6373 26911 33 90 31 24 9 12 7 8 10 3 2 10 2 2 2 2 0 2 2 0 0
+* 3p + 63 5987 7067 6629 6220 25903 25 79 25 28 4 6 5 6 5 6 7 12 0 0 4 0 0 0 1 2 0
+* 3p + 64 5596 6640 6478 6145 24859 28 73 19 24 12 10 3 2 8 7 3 8 1 3 3 0 3 0 1 1 0
+* 3p + 65 5371 6482 6122 5791 23766 22 84 21 15 6 8 2 10 4 6 0 14 2 0 3 2 1 0 2 1 0
+* 3p + 66 5183 6277 5840 5566 22866 24 61 15 20 3 3 2 7 5 7 6 9 1 2 1 2 0 1 0 2 0
+* 3p + 67 5070 5765 5672 5420 21927 27 56 16 18 2 9 5 8 3 4 5 9 2 2 0 3 1 0 3 1 0
+* 3p + 68 4777 5669 5435 5074 20955 28 62 13 16 9 6 3 9 5 4 2 7 1 0 0 1 2 1 0 2 0
+* 3p + 69 4526 5477 5199 4897 20099 23 66 16 19 3 9 4 12 4 3 6 7 0 0 2 0 1 0 0 1 0
+* 3p + 70 4400 5141 4860 4767 19168 25 59 13 17 3 10 2 3 3 3 5 5 0 0 3 3 1 0 0 2 0
+* 3p - 1 13000 11961 14287 10490 49738 437 52 109 111 65 37 121 141 99 148 60 31 0 0 0 0 0 0 0 0 1
+* 3p - 2 12251 13820 13397 10267 49735 404 48 100 67 26 29 105 112 80 100 17 28 0 0 0 0 1 1 0 1 1
+* 3p - 3 11653 13270 13737 11070 49730 249 56 87 53 20 17 59 94 46 82 18 25 0 0 1 0 6 2 0 2 1
+* 3p - 4 11938 12730 14003 11059 49730 211 50 82 45 17 20 57 73 56 87 28 21 0 2 0 2 1 1 0 8 0
+* 3p - 5 11642 13627 13706 10752 49727 184 49 59 54 13 15 14 85 22 91 16 20 2 1 5 1 6 3 1 2 0
+* 3p - 6 11550 13345 13456 11371 49722 144 41 48 54 18 20 17 33 14 20 10 20 0 6 5 1 4 5 1 7 0
+* 3p - 7 11524 13347 13728 11123 49722 142 44 35 48 7 16 13 34 18 17 16 10 1 3 1 1 7 1 5 6 0
+* 3p - 8 11146 13453 14069 11059 49727 134 42 53 57 16 14 10 23 26 22 11 16 2 3 2 4 4 4 2 3 0
+* 3p - 9 11297 13568 13684 11183 49732 106 41 53 44 12 9 12 26 14 15 13 14 2 1 0 1 3 2 4 1 0
+* 3p - 10 11448 13473 13833 10977 49731 102 56 45 46 8 9 5 19 21 17 12 13 3 3 3 4 1 2 3 2 0
+* 3p - 11 11267 13605 14200 10661 49733 93 50 43 57 10 16 12 10 15 14 12 16 2 1 0 7 3 2 4 0 0
+* 3p - 12 11111 13763 13842 11017 49733 101 50 47 37 9 9 6 17 23 17 10 11 3 5 3 0 0 6 1 2 0
+* 3p - 13 11182 13388 14176 10986 49732 113 56 50 52 13 10 8 16 7 16 8 12 3 3 4 2 3 1 4 2 0
+* 3p - 14 11300 13430 14191 10818 49739 95 56 49 50 13 11 6 18 11 8 10 10 2 0 6 9 2 0 1 0 0
+* 3p - 15 11296 13699 13857 10883 49735 86 60 51 44 12 12 7 19 10 12 13 14 2 3 5 3 2 2 1 2 0
+* 3p - 16 11372 13545 14065 10753 49735 96 48 45 44 6 10 10 17 17 12 10 15 2 2 3 3 2 3 1 1 0
+* 3p - 17 11093 13690 14127 10825 49735 92 50 54 48 8 12 6 11 12 14 10 16 1 0 1 1 1 2 2 2 0
+* 3p - 18 11025 14095 13867 10746 49733 92 44 43 36 10 13 9 17 12 14 12 10 2 2 2 1 3 3 1 2 0
+* 3p - 19 11002 13730 13971 11025 49728 88 56 42 62 15 8 5 19 15 7 16 12 1 1 3 3 3 1 2 8 0
+* 3p - 20 11078 13691 14168 10791 49728 80 47 43 39 7 8 8 12 9 11 6 11 2 0 1 1 4 2 1 5 0
+* 3p - 21 10825 14027 14096 10780 49728 76 61 46 48 17 8 8 15 17 19 5 9 1 1 1 1 2 3 5 3 0
+* 3p - 22 10923 13661 14263 10882 49729 79 53 53 51 13 9 12 11 11 10 2 15 2 1 3 2 5 1 2 3 0
+* 3p - 23 10971 13912 14071 10776 49730 83 43 42 43 10 16 13 13 13 9 13 6 1 2 3 1 4 1 1 2 0
+* 3p - 24 10895 13971 13989 10880 49735 68 55 49 42 7 8 10 15 15 11 11 16 2 3 4 2 2 0 4 0 0
+* 3p - 25 11046 13755 14043 10892 49736 54 68 35 46 20 9 7 25 19 16 6 16 4 2 5 6 2 1 1 1 0
+* 3p - 26 10917 13991 14101 10614 49623 67 50 52 41 15 14 10 18 11 10 11 15 3 1 4 0 1 2 1 3 0
+* 3p - 27 10775 13954 13865 10892 49486 53 45 52 51 17 13 6 17 12 18 9 14 4 2 1 2 1 1 1 3 0
+* 3p - 28 10794 13704 13874 11022 49394 73 71 41 44 13 13 12 11 16 5 10 13 4 1 1 5 0 1 2 3 0
+* 3p - 29 10718 13889 13958 10711 49276 74 59 48 53 13 10 12 22 16 12 9 17 1 4 2 1 2 2 3 2 0
+* 3p - 30 10616 13949 13810 10781 49156 78 60 42 43 12 11 17 16 18 14 16 12 1 2 1 5 0 0 1 1 0
+* 3p - 31 10873 13681 13699 10792 49045 66 72 35 37 20 12 11 7 5 12 17 17 0 1 1 3 1 3 0 0 0
+* 3p - 32 10673 13811 13778 10639 48901 60 74 47 49 14 21 9 7 15 9 8 15 1 1 5 4 0 1 0 3 0
+* 3p - 33 10656 13928 13334 10813 48731 64 80 50 35 17 9 9 20 8 7 11 16 3 5 4 5 0 1 1 4 0
+* 3p - 34 10746 13393 13673 10712 48524 68 69 46 31 15 15 18 10 11 16 6 16 1 3 1 3 0 2 0 1 0
+* 3p - 35 10477 13613 13588 10640 48318 60 90 41 64 12 9 10 16 13 8 11 12 1 0 1 3 3 1 3 0 0
+* 3p - 36 10577 13668 13018 10781 48044 43 93 35 41 13 11 10 18 12 10 14 14 2 2 0 4 2 2 2 3 0
+* 3p - 37 10668 13211 13276 10545 47700 66 95 40 48 9 9 9 12 14 10 16 18 3 2 5 6 2 0 0 3 0
+* 3p - 38 10580 13132 13108 10533 47353 55 80 36 56 17 13 9 11 7 21 9 13 1 2 4 5 0 1 2 2 0
+* 3p - 39 10395 13186 12807 10582 46970 59 94 41 34 7 10 9 13 10 15 11 16 0 2 2 2 0 4 2 4 0
+* 3p - 40 10351 12924 12833 10422 46530 76 86 30 41 18 8 9 16 9 10 12 10 3 1 2 1 2 0 0 4 0
+* 3p - 41 10081 12771 12774 10368 45994 62 87 40 28 14 5 13 12 10 13 8 24 1 2 4 2 2 2 1 0 0
+* 3p - 42 10180 12661 12279 10341 45461 57 90 30 35 14 12 6 15 7 8 8 14 1 1 0 7 0 0 0 1 0
+* 3p - 43 10121 12251 12422 10070 44864 68 72 48 34 6 11 9 10 12 10 7 15 1 1 2 1 2 3 0 1 0
+* 3p - 44 9835 12150 12144 10055 44184 48 89 29 38 12 17 12 19 6 14 5 18 0 0 1 2 0 1 1 0 0
+* 3p - 45 9699 12106 11669 10040 43514 51 96 44 53 9 8 11 13 10 15 9 13 1 0 2 2 1 2 2 1 0
+* 3p - 46 9739 11503 11611 9839 42692 57 83 31 36 9 13 7 13 10 14 10 11 1 2 6 0 2 2 1 1 0
+* 3p - 47 9311 11590 11514 9480 41895 40 107 39 30 11 7 9 16 8 12 1 9 0 3 0 2 1 1 0 4 0
+* 3p - 48 8961 11311 11056 9731 41059 48 94 33 42 12 9 7 12 11 9 4 17 0 0 3 2 2 1 1 2 0
+* 3p - 49 9099 10918 10732 9391 40140 44 108 32 35 8 11 8 10 7 14 11 13 4 3 0 3 0 2 3 3 0
+* 3p - 50 8860 10851 10509 8958 39178 40 88 35 28 7 4 8 14 15 9 9 13 3 0 2 2 2 0 3 0 0
+* 3p - 51 8520 10636 10081 8978 38215 44 102 38 34 9 10 11 14 14 14 6 11 0 4 3 0 1 3 1 0 0
+* 3p - 52 8411 10172 9975 8687 37245 60 101 26 27 10 12 9 6 9 12 11 13 1 0 2 2 1 1 1 2 0
+* 3p - 53 8167 9903 9649 8512 36231 41 92 32 30 12 11 6 14 16 13 7 12 0 3 4 0 1 3 0 2 0
+* 3p - 54 7949 9582 9181 8435 35147 51 104 36 25 10 10 5 14 10 7 11 11 1 3 3 0 0 0 2 2 0
+* 3p - 55 7697 9218 9033 8176 34124 38 98 32 26 16 7 7 12 10 6 2 16 2 1 4 2 0 4 1 1 0
+* 3p - 56 7504 9058 8776 7758 33096 44 111 30 22 14 15 8 12 9 11 9 10 4 0 3 3 1 0 0 1 0
+* 3p - 57 7258 8749 8319 7726 32052 43 91 19 27 10 4 5 10 8 5 8 6 0 2 0 2 1 0 1 1 0
+* 3p - 58 7072 8417 8058 7440 30987 40 80 21 17 8 12 11 16 6 5 8 11 1 2 1 3 0 0 1 2 0
+* 3p - 59 6676 8153 7907 7155 29891 31 88 27 19 9 10 6 11 8 5 2 8 2 0 0 2 1 3 1 1 0
+* 3p - 60 6582 7778 7381 7114 28855 35 83 34 23 14 6 0 5 7 8 3 13 1 0 2 1 0 1 0 0 0
+* 3p - 61 6390 7619 7103 6643 27755 29 78 23 14 11 4 9 10 7 6 11 9 2 0 0 1 2 0 0 0 0
+* 3p - 62 6109 7185 6992 6427 26713 24 61 30 22 10 7 9 4 9 10 11 7 1 0 3 2 1 0 0 0 0
+* 3p - 63 5842 6968 6589 6335 25734 21 89 24 20 8 10 7 14 6 5 5 17 0 1 1 0 0 0 0 1 0
+* 3p - 64 5751 6589 6403 6000 24743 29 63 18 20 9 5 2 6 8 7 6 16 0 0 2 1 3 0 1 0 0
+* 3p - 65 5507 6388 6058 5766 23719 21 81 23 18 6 7 6 7 10 4 4 8 1 0 0 0 0 0 1 1 0
+* 3p - 66 5309 6150 5806 5505 22770 16 66 28 27 2 12 6 6 5 2 8 9 0 1 1 1 0 0 1 1 0
+* 3p - 67 5137 5915 5547 5224 21823 27 72 19 21 4 3 4 9 8 7 3 7 1 0 0 0 0 0 1 1 0
+* 3p - 68 4887 5626 5347 5025 20885 25 69 16 14 6 7 3 5 3 4 6 5 0 0 1 0 0 0 0 0 0
+* 3p - 69 4579 5385 5101 4916 19981 20 48 17 16 7 8 4 2 8 6 6 7 1 1 1 1 0 0 0 0 0
+* 3p - 70 4444 5165 4851 4624 19084 28 63 17 11 3 4 9 6 9 1 2 6 0 0 0 0 1 0 1 0 0
+* 5p + 1 10368 14922 11453 13513 50256 80 506 71 88 69 50 26 33 29 41 94 73 0 0 0 0 0 0 0 0 8
+* 5p + 2 10411 13523 13958 12355 50247 65 400 66 56 33 50 28 25 35 28 38 58 1 1 0 0 0 1 0 0 3
+* 5p + 3 10789 13843 13260 11928 49820 68 259 70 59 27 34 54 62 55 38 43 37 1 3 2 1 2 2 2 3 1
+* 5p + 4 11076 14426 12822 11918 50242 49 234 45 58 17 20 17 23 13 20 13 27 4 0 6 5 3 5 4 2 0
+* 5p + 5 11020 14139 13254 11831 50244 49 189 48 57 16 13 10 17 9 13 16 25 3 6 5 7 4 2 3 4 0
+* 5p + 6 11421 13610 13445 11768 50244 34 170 43 42 16 16 16 10 9 9 16 19 4 3 5 6 4 5 2 1 0
+* 5p + 7 11222 14055 13452 11519 50248 62 137 41 46 7 10 5 14 18 11 8 19 5 2 1 4 4 4 1 0 0
+* 5p + 8 11176 14119 13512 11439 50246 60 108 60 48 14 18 6 14 14 15 6 19 0 2 4 2 4 3 3 1 0
+* 5p + 9 11195 13744 13738 11570 50247 54 138 43 47 11 20 6 12 10 10 7 16 1 2 5 3 2 3 4 1 0
+* 5p + 10 11084 14327 13265 11574 50250 46 108 47 41 15 5 5 22 12 14 9 13 1 3 2 1 3 0 2 2 0
+* 5p + 11 10931 14392 13604 11323 50250 60 124 43 47 15 13 15 16 15 10 8 20 1 8 3 0 3 2 3 0 0
+* 5p + 12 11163 13770 14021 11296 50250 50 124 48 42 9 14 7 9 3 8 13 22 1 1 4 3 1 3 1 3 0
+* 5p + 13 10989 14391 13507 11363 50250 53 99 24 38 13 16 12 14 17 8 7 14 2 4 2 4 2 1 3 2 0
+* 5p + 14 10843 14174 13768 11462 50247 46 88 43 35 11 17 12 17 7 16 7 16 1 4 3 7 4 1 4 2 0
+* 5p + 15 10951 14038 13849 11409 50247 65 109 45 35 10 6 7 14 14 12 7 10 3 3 4 3 2 4 2 3 0
+* 5p + 16 11027 14272 13684 11260 50243 56 90 41 38 18 11 10 12 12 17 5 14 0 1 1 3 5 3 2 5 0
+* 5p + 17 10825 14267 13922 11238 50252 53 98 41 52 15 15 10 13 7 9 11 12 5 5 4 0 1 2 3 0 0
+* 5p + 18 11135 13982 14000 11134 50251 56 83 40 58 19 8 6 9 13 3 15 19 1 0 2 2 1 3 2 1 0
+* 5p + 19 11014 13978 14024 11237 50253 57 87 52 59 8 14 7 21 12 8 12 11 3 5 3 4 1 1 3 0 0
+* 5p + 20 11034 14433 13743 11041 50251 54 77 43 39 11 14 12 18 18 12 7 19 1 1 4 1 1 1 1 4 0
+* 5p + 21 11070 14011 14015 11153 50249 54 117 50 43 17 10 14 15 17 13 10 13 3 0 5 2 1 3 0 5 0
+* 5p + 22 10745 14380 13809 11317 50251 61 106 38 34 7 10 10 10 13 3 6 17 4 1 2 2 3 0 1 3 0
+* 5p + 23 10747 14432 14024 11039 50242 58 88 33 47 13 14 9 12 12 9 6 12 1 2 1 3 3 3 3 7 0
+* 5p + 24 11052 13996 14130 11069 50247 68 95 32 37 8 11 10 15 15 8 10 17 0 4 2 2 2 4 0 5 0
+* 5p + 25 11018 14180 13804 11246 50248 54 92 42 36 13 12 9 8 10 11 9 18 0 2 5 2 1 0 6 3 0
+* 5p + 26 10712 14227 14052 11108 50099 69 74 44 44 9 10 6 12 15 7 8 12 1 1 5 3 1 0 1 3 0
+* 5p + 27 10860 14099 14136 10909 50004 51 77 43 45 21 5 6 11 10 17 7 14 2 2 1 1 3 1 1 2 0
+* 5p + 28 10998 14052 13762 11100 49912 61 83 43 46 15 12 15 10 13 13 11 17 1 1 0 3 1 0 2 0 0
+* 5p + 29 10680 14191 14086 10852 49809 51 65 35 52 8 15 5 12 15 8 8 9 0 5 2 3 3 1 1 2 0
+* 5p + 30 11110 13898 13862 10830 49700 70 71 46 46 6 12 4 16 15 12 7 11 0 2 5 2 2 2 2 0 0
+* 5p + 31 10828 14020 13793 10945 49586 62 70 61 52 9 17 10 10 15 18 7 17 1 2 0 2 4 2 1 5 0
+* 5p + 32 10669 14202 13913 10664 49448 83 77 50 48 11 17 13 14 7 17 11 9 1 0 1 2 1 3 2 2 0
+* 5p + 33 10925 13737 13897 10706 49265 74 71 51 40 9 10 10 14 10 10 8 17 0 1 1 2 2 2 3 2 0
+* 5p + 34 10784 13849 13513 10948 49094 84 60 46 41 17 14 11 17 9 8 4 8 3 3 2 2 0 1 1 1 0
+* 5p + 35 10694 13840 13725 10616 48875 70 64 55 35 9 8 7 20 12 9 15 13 2 4 1 2 0 0 4 1 0
+* 5p + 36 10838 13550 13510 10707 48605 91 53 44 47 11 9 14 15 15 16 12 17 2 1 3 7 2 0 2 2 0
+* 5p + 37 10679 13585 13361 10677 48302 93 63 51 39 16 10 13 19 12 11 10 8 0 3 4 2 5 0 1 3 0
+* 5p + 38 10728 13455 13220 10536 47939 89 61 39 29 15 9 9 22 22 10 7 16 1 2 1 2 1 2 0 6 0
+* 5p + 39 10656 13065 13411 10425 47557 69 54 42 39 16 14 15 20 12 8 9 16 3 1 3 1 2 1 1 1 0
+* 5p + 40 10691 13057 12982 10421 47151 82 52 45 37 8 10 7 10 9 10 9 21 7 0 3 1 2 2 0 2 0
+* 5p + 41 10498 12794 13031 10316 46639 94 54 41 37 4 10 7 10 13 19 12 15 3 1 4 6 3 0 2 4 0
+* 5p + 42 10361 12575 12917 10260 46113 97 69 43 37 21 8 10 13 7 20 10 15 2 2 3 4 2 3 0 1 0
+* 5p + 43 10316 12520 12286 10391 45513 98 56 40 44 7 16 9 17 18 12 11 8 0 1 1 5 0 4 1 0 0
+* 5p + 44 9965 12448 12421 10011 44845 102 59 41 29 21 10 6 14 15 10 10 16 0 1 3 3 2 2 4 2 0
+* 5p + 45 10366 11810 12178 9778 44132 89 61 51 49 6 9 10 17 16 16 15 21 1 0 2 2 2 3 0 1 0
+* 5p + 46 9999 11592 11958 9826 43375 96 56 32 40 18 8 10 14 8 8 5 13 1 1 2 1 1 2 3 2 0
+* 5p + 47 9788 11660 11506 9525 42479 102 54 45 34 17 10 6 19 5 17 6 13 2 3 3 2 2 3 1 3 0
+* 5p + 48 9602 11161 11584 9236 41583 102 52 45 44 12 6 10 22 10 14 2 18 2 2 1 2 1 3 2 0 0
+* 5p + 49 9484 10833 11090 9304 40711 98 49 42 33 5 7 12 18 9 6 6 8 0 0 0 3 3 1 1 0 0
+* 5p + 50 9199 10706 10896 9003 39804 113 49 37 38 10 17 13 21 12 8 7 9 1 1 0 2 1 4 2 0 0
+* 5p + 51 9159 10233 10782 8658 38832 100 50 29 33 11 14 9 15 11 10 7 10 2 2 0 3 2 1 1 1 0
+* 5p + 52 8792 10143 10231 8636 37802 100 37 34 41 11 14 9 19 8 10 7 9 0 0 1 1 0 1 3 0 0
+* 5p + 53 8590 9778 10066 8293 36727 77 46 30 33 11 12 6 12 11 17 3 7 0 0 2 1 2 2 2 1 0
+* 5p + 54 8479 9357 9797 8034 35667 114 43 36 28 9 10 7 17 7 5 4 15 1 0 2 5 2 2 1 2 0
+* 5p + 55 8260 9086 9307 7924 34577 90 55 28 30 5 8 5 11 9 8 7 17 1 3 1 0 2 2 0 1 0
+* 5p + 56 7885 8772 9208 7598 33463 82 42 27 38 11 3 5 17 5 7 3 13 2 1 2 0 5 1 0 1 0
+* 5p + 57 7794 8416 8691 7450 32351 95 39 16 34 7 11 7 17 16 5 5 5 1 1 1 4 2 4 0 1 0
+* 5p + 58 7466 8146 8373 7204 31189 83 37 27 36 6 7 6 10 11 10 8 12 2 0 2 0 2 3 1 1 0
+* 5p + 59 7215 7954 8202 6752 30123 95 28 21 28 6 9 3 11 9 8 4 8 2 0 4 0 1 1 0 0 0
+* 5p + 60 6976 7437 7976 6659 29048 94 28 23 15 7 9 5 12 9 14 5 5 2 0 1 0 0 2 1 1 0
+* 5p + 61 6731 7418 7385 6447 27981 70 25 29 29 5 6 2 13 10 9 7 15 0 2 1 1 1 2 1 0 0
+* 5p + 62 6452 7106 7129 6236 26923 89 31 26 34 5 5 8 17 11 4 8 8 1 0 1 0 0 0 1 0 0
+* 5p + 63 6314 6667 7066 5865 25912 81 30 28 20 6 5 5 8 4 11 7 5 0 2 0 0 0 0 0 0 0
+* 5p + 64 6149 6396 6581 5744 24870 62 20 21 32 9 9 8 8 2 7 6 8 0 2 0 1 2 0 1 1 0
+* 5p + 65 5770 6277 6314 5416 23777 56 29 14 27 9 6 8 5 13 7 5 3 1 1 2 1 1 0 0 0 0
+* 5p + 66 5504 5858 6187 5324 22873 74 20 23 33 7 6 4 11 4 14 5 9 1 1 1 1 1 1 0 1 0
+* 5p + 67 5396 5572 5847 5126 21941 69 24 16 10 9 5 3 9 9 7 2 10 2 0 1 1 0 0 0 0 0
+* 5p + 68 5037 5591 5510 4830 20968 54 17 25 18 7 4 7 13 9 5 7 12 1 0 1 1 0 0 0 0 0
+* 5p + 69 4917 5089 5476 4620 20102 73 15 25 19 7 6 5 4 6 7 3 1 1 1 0 2 0 1 0 1 0
+* 5p + 70 4677 4926 5096 4479 19178 51 22 13 13 2 6 2 8 3 7 3 1 0 1 0 2 0 0 0 0 0
+* 5p - 1 10212 14833 11217 13473 49735 74 445 64 61 44 46 38 43 29 30 73 70 0 0 0 0 0 0 0 0 2
+* 5p - 2 10334 13301 13752 12347 49734 57 374 59 65 30 44 22 27 33 21 42 51 1 2 0 2 0 0 0 0 0
+* 5p - 3 10578 13583 13170 11938 49269 74 221 83 57 40 28 54 54 46 44 49 35 3 2 0 3 0 1 0 0 0
+* 5p - 4 11034 14185 12617 11899 49735 64 243 42 47 24 17 20 18 14 22 19 27 2 1 3 2 0 3 0 0 0
+* 5p - 5 10937 14035 13187 11564 49723 48 198 44 45 20 9 11 14 15 21 12 20 2 5 1 3 4 5 3 5 0
+* 5p - 6 11562 13214 13394 11559 49729 55 151 37 39 12 20 7 16 13 14 14 17 2 0 3 5 4 4 3 1 0
+* 5p - 7 11098 14052 13157 11423 49730 61 140 53 43 17 17 8 14 12 14 7 14 3 5 1 1 2 2 5 3 0
+* 5p - 8 11110 13863 13485 11271 49729 64 133 40 40 12 9 10 16 13 9 6 14 2 3 3 1 4 4 2 3 0
+* 5p - 9 11170 13697 13626 11240 49733 51 111 47 45 16 16 11 15 8 10 9 17 2 2 4 2 1 2 2 4 0
+* 5p - 10 10855 14169 13247 11463 49734 47 109 37 60 13 8 6 14 11 4 8 17 1 1 2 5 1 3 2 2 0
+* 5p - 11 10794 14034 13793 11113 49734 55 97 47 37 12 7 6 10 13 16 5 14 2 1 2 3 3 0 3 2 0
+* 5p - 12 11172 13830 13567 11159 49728 48 110 51 42 12 17 14 9 10 11 6 9 0 3 2 0 4 4 1 4 0
+* 5p - 13 10896 14059 13251 11532 49738 58 99 40 42 6 14 11 10 12 8 3 22 0 0 1 2 0 1 2 1 0
+* 5p - 14 10697 14187 13733 11120 49737 51 108 46 51 13 9 9 16 17 15 7 15 1 2 2 0 0 0 4 1 0
+* 5p - 15 10984 13714 13803 11236 49737 54 103 45 47 9 13 6 15 12 10 10 7 2 2 3 3 0 3 1 1 0
+* 5p - 16 10787 14041 13406 11500 49734 51 94 42 52 12 11 7 23 9 14 10 14 0 0 2 8 2 2 2 2 0
+* 5p - 17 10648 14203 13775 11111 49737 58 90 36 33 11 14 10 9 13 13 10 17 1 1 3 2 2 0 2 1 0
+* 5p - 18 11030 13859 13768 11075 49732 41 95 38 35 11 12 13 21 22 10 7 9 2 3 2 2 5 2 3 0 0
+* 5p - 19 10758 14139 13663 11172 49732 57 97 38 41 19 7 10 12 12 6 6 13 2 5 3 2 1 4 1 3 0
+* 5p - 20 10764 14231 13913 10824 49732 68 90 44 40 17 9 5 11 11 6 7 15 1 0 2 4 2 2 2 4 0
+* 5p - 21 10953 13897 14159 10727 49736 51 87 44 45 8 8 8 10 12 4 9 14 3 2 4 3 0 3 1 2 0
+* 5p - 22 10689 14186 13706 11150 49731 44 85 30 44 14 7 6 14 7 10 8 11 0 1 6 2 4 2 1 4 0
+* 5p - 23 10556 14262 13855 11065 49738 57 75 44 41 9 11 8 10 13 7 4 8 2 1 2 2 2 0 1 1 0
+* 5p - 24 10797 14042 13892 11004 49735 56 83 39 30 21 9 11 9 10 15 7 11 4 2 3 3 0 0 0 7 0
+* 5p - 25 10844 14038 13658 11197 49737 56 86 38 47 13 15 13 8 7 12 12 8 1 2 1 2 1 0 3 1 0
+* 5p - 26 10617 14241 13927 10845 49630 53 75 43 38 13 12 9 14 16 9 8 13 1 0 6 0 0 1 1 1 0
+* 5p - 27 11180 13611 13906 10792 49489 65 79 48 39 13 13 11 21 18 12 13 10 0 2 1 3 1 1 0 4 0
+* 5p - 28 10765 14074 13653 10907 49399 75 63 52 42 14 11 7 13 13 11 10 8 2 2 3 2 0 0 1 0 0
+* 5p - 29 10771 13933 13813 10765 49282 57 54 49 42 10 4 15 16 15 8 3 11 1 1 0 3 1 1 2 1 0
+* 5p - 30 10772 13736 14075 10572 49155 51 63 44 48 9 8 8 16 11 11 6 17 3 1 5 4 0 0 1 3 0
+* 5p - 31 10753 13874 13672 10753 49052 68 67 56 44 16 6 5 16 11 8 8 13 1 3 3 4 1 1 1 0 0
+* 5p - 32 10622 13821 13796 10665 48904 59 64 39 37 9 7 15 17 13 8 6 8 0 0 3 2 0 0 2 3 0
+* 5p - 33 10954 13427 13688 10664 48733 79 50 38 33 8 13 10 9 7 11 14 13 3 2 4 2 3 3 2 1 0
+* 5p - 34 10734 13601 13320 10868 48523 80 92 52 44 10 11 14 15 10 13 8 14 3 0 4 3 1 0 5 3 0
+* 5p - 35 10333 13781 13463 10745 48322 67 79 43 42 12 13 9 16 14 15 13 13 1 2 3 0 0 2 2 5 0
+* 5p - 36 10815 13446 13483 10305 48049 72 48 49 27 20 12 12 12 12 7 7 19 2 0 2 2 2 2 1 1 0
+* 5p - 37 10718 13387 12981 10614 47700 80 58 34 46 11 10 9 13 14 8 6 11 0 2 1 5 3 0 0 1 0
+* 5p - 38 10371 13239 13154 10579 47343 82 56 62 37 15 11 7 18 8 8 7 12 1 2 3 3 1 3 3 3 0
+* 5p - 39 10670 12733 13180 10390 46973 79 58 39 34 16 12 4 9 10 13 10 15 1 2 3 3 1 1 2 2 0
+* 5p - 40 10556 12789 12737 10457 46539 86 52 35 39 13 12 4 27 11 13 9 10 1 1 4 1 2 0 3 2 0
+* 5p - 41 10276 12643 12955 10131 46005 90 57 43 47 15 14 8 19 15 9 7 14 2 1 1 3 2 0 0 1 0
+* 5p - 42 10486 12304 12769 9905 45464 106 37 33 37 12 8 6 21 17 8 9 12 1 2 4 0 3 0 1 2 0
+* 5p - 43 10352 12257 12141 10119 44869 97 53 43 39 8 19 15 17 21 11 12 10 2 1 4 1 1 0 2 4 0
+* 5p - 44 9986 12142 12315 9736 44179 109 61 41 48 12 15 7 16 24 11 17 20 1 2 1 3 3 7 0 1 0
+* 5p - 45 10053 11583 12031 9853 43520 83 57 43 33 9 14 9 13 24 18 7 15 3 2 1 3 3 0 2 3 0
+* 5p - 46 9703 11679 11659 9662 42703 94 54 32 38 8 10 12 21 10 13 5 6 1 3 2 1 0 3 0 1 0
+* 5p - 47 9542 11481 11632 9250 41905 94 52 40 26 6 13 11 17 10 15 11 18 2 0 0 3 1 0 1 1 0
+* 5p - 48 9462 10920 11353 9331 41066 85 54 38 53 10 11 6 20 12 12 8 8 0 1 2 2 0 3 1 2 0
+* 5p - 49 9428 10666 10898 9155 40147 92 47 30 37 18 16 6 19 11 8 4 13 1 0 2 0 1 2 2 3 0
+* 5p - 50 9007 10629 10735 8814 39185 96 41 34 35 8 9 15 17 13 13 9 6 1 1 2 1 1 0 2 1 0
+* 5p - 51 9046 9955 10685 8540 38226 82 36 34 30 10 5 11 11 7 11 9 15 0 0 0 2 0 1 0 0 0
+* 5p - 52 8882 9922 10026 8428 37258 88 45 29 32 5 8 5 15 9 16 8 14 0 2 1 1 2 1 0 2 0
+* 5p - 53 8520 9601 9839 8277 36237 84 32 26 39 12 7 10 12 12 9 9 14 0 1 2 0 1 0 2 2 0
+* 5p - 54 8202 9237 9704 8013 35156 105 35 31 27 14 12 11 17 2 11 6 13 2 1 1 0 2 0 1 1 0
+* 5p - 55 7998 9126 9183 7831 34138 98 31 32 31 7 11 9 14 6 18 11 15 3 0 0 3 2 2 0 0 0
+* 5p - 56 7731 8869 9015 7485 33100 79 43 26 39 8 7 3 13 9 13 12 12 1 4 1 1 2 1 1 1 0
+* 5p - 57 7711 8289 8773 7284 32057 83 36 30 34 10 10 6 10 7 6 7 14 2 2 1 3 0 0 1 0 0
+* 5p - 58 7337 8087 8290 7287 31001 90 33 25 27 18 9 8 15 8 9 5 9 1 0 2 2 1 1 0 0 0
+* 5p - 59 7119 7878 8154 6751 29902 81 32 35 24 11 7 12 13 7 14 7 6 0 2 3 1 0 1 0 0 0
+* 5p - 60 6988 7434 7820 6618 28860 54 31 17 36 7 7 9 11 14 5 4 6 3 2 0 3 1 0 2 2 0
+* 5p - 61 6608 7337 7442 6372 27759 76 25 24 20 4 4 8 12 4 12 4 5 1 2 0 2 3 1 1 1 0
+* 5p - 62 6285 7030 7274 6123 26712 61 19 26 32 7 8 7 8 10 8 2 6 1 0 2 1 1 3 1 2 0
+* 5p - 63 6245 6637 7032 5822 25736 66 25 32 33 8 4 5 13 9 13 3 7 0 1 3 4 0 1 0 2 0
+* 5p - 64 6054 6435 6480 5787 24756 56 29 17 16 7 6 3 14 10 11 7 10 2 1 2 1 0 1 0 0 0
+* 5p - 65 5600 6165 6437 5524 23726 79 32 17 27 7 4 6 12 7 10 7 6 1 0 0 1 0 0 0 1 0
+* 5p - 66 5467 5956 6078 5277 22778 64 30 23 18 5 3 6 9 5 5 3 3 0 1 2 0 0 0 1 1 0
+* 5p - 67 5277 5656 5801 5096 21830 58 28 25 27 4 5 4 8 6 6 7 9 0 0 0 0 1 0 0 1 0
+* 5p - 68 5043 5347 5658 4841 20889 63 18 18 22 3 9 3 6 2 12 6 9 0 0 0 0 1 0 0 1 0
+* 5p - 69 4863 5168 5378 4574 19983 61 19 18 16 5 6 6 12 10 5 4 8 0 1 1 1 0 1 0 1 0
+* 5p - 70 4660 4807 5189 4433 19089 57 25 14 15 8 2 6 7 9 5 3 6 2 0 1 1 0 0 0 0 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/3pGtoA_freq.txt
new file mode 100644
index 0000000..9cd7160
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0385501920828462
+2 0.0431893687707641
+3 0.0262298650413583
+4 0.0269436372116385
+5 0.018594446842181
+6 0.0191471167788229
+7 0.0118727761237383
+8 0.0114543068903153
+9 0.00953385975765074
+10 0.00815169236221868
+11 0.00812837955695099
+12 0.00732984293193717
+13 0.00650964196969167
+14 0.00671258259623116
+15 0.00615683732987686
+16 0.00728597449908925
+17 0.00609734730350072
+18 0.00632843119628924
+19 0.00726774805140088
+20 0.00664242763249895
+21 0.00519628430081503
+22 0.0057699750202301
+23 0.00486456218940295
+24 0.00503535461752732
+25 0.0051537293939073
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/5pCtoT_freq.txt
new file mode 100644
index 0000000..58c20e5
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.0327175147074001
+2 0.0300228353236252
+3 0.0173239034147735
+4 0.014040014040014
+5 0.0120392073300661
+6 0.0103100402793688
+7 0.00900836998155767
+8 0.00799518873598189
+9 0.00835087969736983
+10 0.00676199197013454
+11 0.00769419948705337
+12 0.00662131847456401
+13 0.00651499843221963
+14 0.00707298191287212
+15 0.00675289409747034
+16 0.00643120350413683
+17 0.00646969277608635
+18 0.00609993970989822
+19 0.00582165516279718
+20 0.00570046640179651
+21 0.00612164999822045
+22 0.00595093092048025
+23 0.00615320796460177
+24 0.00482326432896775
+25 0.00567983831625897
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..f2f22c9
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Length_plot.pdf
new file mode 100644
index 0000000..c39a862
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Runtime_log.txt
new file mode 100644
index 0000000..7dfebb3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-24 15:07:05,083 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1845A_id_CGCTAT' -i - -d /home/mischu/scratch/bam_pipeline/be658e3a-4421-4a70-a9ab-33d4a0d10ba2 -r 000_prefixes/Pi_nucl.fasta --downsample 100000
+2013-10-24 15:07:36,958 DEBUG main: BAM read in 34.531730 seconds
+2013-10-24 15:07:37,718 INFO main: Successful run
+2013-10-24 15:07:37,718 DEBUG main: Run completed in 35.291744 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..9371883
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.822989479184056,0.569371179976006
+"2",2,0.763705526744711,0.44656269966241
+"3",3,0.686996085963154,0.349536613851233
+"4",4,0.593525118391169,0.279899879160483
+"5",5,0.487661365300798,0.230722245398689
+"6",6,0.38093454024941,0.188818820236893
+"7",7,0.286191607600513,0.148410351366908
+"8",8,0.210782824616397,0.110001018232146
+"9",9,0.153052321156299,0.0784722183771067
+"10",10,0.109149749110748,0.0556339160083942
+"11",11,0.076506702607571,0.0398160469775537
+"12",12,0.0522491239903261,0.0296261779466191
+"13",-12,0.0312179034543074,0.0506898928830193
+"14",-11,0.0431480560563886,0.0733477662386683
+"15",-10,0.0586191383634996,0.106393372509562
+"16",-9,0.0780507960522221,0.153284975322634
+"17",-8,0.10242267285694,0.216474409282334
+"18",-7,0.134402511649219,0.295517190410198
+"19",-6,0.177322559874261,0.387192945287547
+"20",-5,0.238537849013771,0.483835161887875
+"21",-4,0.318119348841762,0.579941181858254
+"22",-3,0.415743801993105,0.668661440252283
+"23",-2,0.5175445876565,0.747626080575631
+"24",-1,0.631102714750573,0.809726262367146
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..b2e2a5f
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..8a2d79f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0275833354710102,3.34561287463039e-05,0.124779030702315,0.323789622995648,0.918775976332997,-18766.5962380243
+"Std.",0.000263693666102707,3.32594726717004e-05,0.00312904908695234,0.00607387243054788,0.0102642584015137,1.70275888218207
+"Acceptance ratio",0.26642,0.2018,0.25018,0.25904,0.20476,0.74038
+"0%",0.0266540101212686,5.08430752837777e-09,0.113148362459768,0.300751172879184,0.880276424431998,-18778.1808967307
+"2.5%",0.0270706844974985,8.63303432874918e-07,0.11902105336135,0.31220961777386,0.898854660658202,-18770.7575401854
+"5%",0.0271523257788836,1.69458760859594e-06,0.11987644974734,0.314131062540229,0.901832929619413,-18769.8467841703
+"7.5%",0.0272071467487264,2.55030305922469e-06,0.120403527629705,0.315294647223063,0.903990079991048,-18769.2973888737
+"10%",0.0272478308614661,3.23961380608697e-06,0.120823921719195,0.316083292144517,0.905664537052658,-18768.8868235797
+"12.5%",0.0272784807617019,4.02085226169441e-06,0.121203959811656,0.316845114179385,0.906877779598108,-18768.5679297344
+"15%",0.0273090664063941,5.11555994888759e-06,0.121555578131757,0.317498069930494,0.908010183938086,-18768.2890616313
+"17.5%",0.0273368358152623,6.12684104766328e-06,0.121840040010875,0.318128911220384,0.909056579792864,-18768.0356131696
+"20%",0.0273600911777602,7.22232308291811e-06,0.122105822315837,0.31867450536136,0.910131001803781,-18767.8331770706
+"22.5%",0.0273804826194021,8.05937557625017e-06,0.122378939904702,0.319203886285329,0.911201890821345,-18767.6514001938
+"25%",0.027401962631993,9.14898778900812e-06,0.122613914047479,0.319706029691453,0.912051807437616,-18767.4881823124
+"27.5%",0.0274238747591415,1.04631294987505e-05,0.122859215540079,0.320150562987714,0.912759074333299,-18767.3309480195
+"30%",0.0274429877111701,1.15216357521017e-05,0.12309873904279,0.320569957666869,0.913420748014924,-18767.1889837104
+"32.5%",0.0274596545076922,1.27029744395689e-05,0.123312788912644,0.320931494985985,0.914097996513825,-18767.0624309433
+"35%",0.0274797323251407,1.42903835601093e-05,0.123525326260038,0.321327518952669,0.91477365987461,-18766.9384714442
+"37.5%",0.0274976347784389,1.55494246088445e-05,0.123765424859494,0.321710556461365,0.915403188131131,-18766.8163417853
+"40%",0.0275151835405313,1.68973862468801e-05,0.123937337499734,0.322077827329619,0.916247818717267,-18766.7015869266
+"42.5%",0.0275338076690217,1.83151907227893e-05,0.124116853320328,0.322446295327986,0.916914359123012,-18766.5952971988
+"45%",0.0275497627585016,2.0140547808191e-05,0.124308834096465,0.322868913468303,0.917408532353834,-18766.4893689937
+"47.5%",0.0275668868891626,2.16824117255395e-05,0.124493868799483,0.323211087631214,0.918041502056398,-18766.3850842816
+"50%",0.0275823260800664,2.31942077676962e-05,0.124665796287497,0.323601843852261,0.918714769999579,-18766.2900351317
+"52.5%",0.0275975574643281,2.47180375254283e-05,0.124854296027888,0.323973513978742,0.919424565214727,-18766.1932800784
+"55%",0.0276146002699486,2.62992670146587e-05,0.125080961277785,0.324376383409278,0.920042710592026,-18766.0923320415
+"57.5%",0.0276326462355948,2.83613765545899e-05,0.125266926810126,0.32477059582537,0.920684156108689,-18765.9951988153
+"60%",0.0276510643710888,3.07915884037689e-05,0.125474955735363,0.32520804899463,0.921405678368773,-18765.8965696173
+"62.5%",0.0276652818130436,3.33275342350489e-05,0.125675622665307,0.32563338219201,0.922055769101683,-18765.8075925719
+"65%",0.0276832797703657,3.57124516428861e-05,0.125872699453715,0.325979205632005,0.922737913811702,-18765.717135957
+"67.5%",0.0277023258990307,3.81884628033436e-05,0.126114467564607,0.326419503247716,0.923445616031807,-18765.6238971442
+"70%",0.0277213849126008,4.1020860248523e-05,0.126321817408288,0.326816640100801,0.924154626372352,-18765.5365848399
+"72.5%",0.0277405401121762,4.39807628634639e-05,0.126540643697511,0.327259015178842,0.924885746049791,-18765.4395350521
+"75%",0.027760389698616,4.74669275807704e-05,0.126795023546854,0.327727138355936,0.925767613776735,-18765.3442972281
+"77.5%",0.0277806586591256,5.07773916305104e-05,0.127045210620775,0.328253870700646,0.926627579671115,-18765.2542616972
+"80%",0.0278033744610072,5.47057850872189e-05,0.127321479478956,0.328852634531782,0.927457506051715,-18765.1578028854
+"82.5%",0.0278297879925748,5.9216972270442e-05,0.127664487657286,0.32946477267488,0.928446557293927,-18765.0565798667
+"85%",0.0278568673369877,6.40826720460735e-05,0.128008982241373,0.330215577329557,0.929396339584046,-18764.9484888549
+"87.5%",0.0278912864346315,6.98508457430145e-05,0.128376866091354,0.330889026268217,0.930636703616476,-18764.8351703966
+"90%",0.0279267157759398,7.63753925994585e-05,0.12883562400457,0.331747497697286,0.931806441697034,-18764.7213721314
+"92.5%",0.0279688845305674,8.72586634181523e-05,0.129378189697081,0.332715115258501,0.933526562875365,-18764.5899468433
+"95%",0.0280192401685,9.95522705210282e-05,0.130082233371827,0.334006230390133,0.935763494288059,-18764.4440368819
+"97.5%",0.0281063155076181,0.000123077929149151,0.131198890843294,0.335888988606095,0.938932332341738,-18764.2486367545
+"100%",0.0286830582486809,0.000320290328646787,0.139148600176267,0.349270680547334,0.95495164639458,-18763.6834924867
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..1556656
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..3ce0ce4
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/dnacomp.txt
new file mode 100644
index 0000000..d249fa1
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 3328 3776 3623 3674 14401
+* 3p + -69 3459 4081 3918 3951 15409
+* 3p + -68 3774 4389 4155 4042 16360
+* 3p + -67 3954 4586 4374 4151 17065
+* 3p + -66 4079 4827 4481 4411 17798
+* 3p + -65 4294 5090 4741 4512 18637
+* 3p + -64 4413 5169 5087 4782 19451
+* 3p + -63 4606 5562 5260 4894 20322
+* 3p + -62 4771 5692 5478 5258 21199
+* 3p + -61 4979 6062 5755 5329 22125
+* 3p + -60 5312 6095 6035 5680 23122
+* 3p + -59 5470 6508 6435 5726 24139
+* 3p + -58 5606 6731 6738 6101 25176
+* 3p + -57 5978 7127 6718 6440 26263
+* 3p + -56 6141 7495 7312 6436 27384
+* 3p + -55 6497 7607 7590 6885 28579
+* 3p + -54 6603 8117 7950 7023 29693
+* 3p + -53 6935 8355 8255 7297 30842
+* 3p + -52 7221 8771 8421 7532 31945
+* 3p + -51 7393 9100 8791 7773 33057
+* 3p + -50 7562 9422 9336 7871 34191
+* 3p + -49 7985 9646 9466 8174 35271
+* 3p + -48 8063 10093 9626 8553 36335
+* 3p + -47 8252 10434 10114 8575 37375
+* 3p + -46 8429 10769 10433 8708 38339
+* 3p + -45 8730 10900 10478 9164 39272
+* 3p + -44 8783 11258 11003 9140 40184
+* 3p + -43 9169 11355 11135 9360 41019
+* 3p + -42 9090 11834 11494 9423 41841
+* 3p + -41 9348 11894 11766 9567 42575
+* 3p + -40 9491 12048 11993 9761 43293
+* 3p + -39 9515 12516 12164 9756 43951
+* 3p + -38 9702 12398 12392 10069 44561
+* 3p + -37 9923 12609 12682 9847 45061
+* 3p + -36 9846 12963 12746 9973 45528
+* 3p + -35 9964 13124 13008 9892 45988
+* 3p + -34 10102 12965 13124 10189 46380
+* 3p + -33 10034 13355 12943 10387 46719
+* 3p + -32 10330 13330 13368 10019 47047
+* 3p + -31 10345 13294 13370 10329 47338
+* 3p + -30 10176 13615 13324 10512 47627
+* 3p + -29 10349 13772 13640 10164 47925
+* 3p + -28 10519 13502 13853 10433 48307
+* 3p + -27 10631 13975 13509 10564 48679
+* 3p + -26 10835 13935 14100 10296 49166
+* 3p + -25 11010 13948 14127 10705 49790
+* 3p + -24 10836 14074 14035 10857 49802
+* 3p + -23 10855 14216 14343 10402 49816
+* 3p + -22 10877 14002 14143 10803 49825
+* 3p + -21 10768 14308 13965 10801 49842
+* 3p + -20 10915 13994 14176 10772 49857
+* 3p + -19 11038 13909 14226 10684 49857
+* 3p + -18 10903 14190 13841 10923 49857
+* 3p + -17 10915 13888 14237 10817 49857
+* 3p + -16 11187 14016 14174 10480 49857
+* 3p + -15 11006 14029 13974 10848 49857
+* 3p + -14 11164 14009 14199 10485 49857
+* 3p + -13 11424 13433 14321 10677 49855
+* 3p + -12 11236 13907 13775 10936 49854
+* 3p + -11 11211 13696 14288 10654 49849
+* 3p + -10 11454 13613 13976 10812 49855
+* 3p + -9 11607 13845 13706 10699 49857
+* 3p + -8 11734 13441 14097 10585 49857
+* 3p + -7 11908 13355 13838 10755 49856
+* 3p + -6 12455 13161 13463 10778 49857
+* 3p + -5 11977 13239 14127 10514 49857
+* 3p + -4 12783 12910 13823 10341 49857
+* 3p + -3 11904 13020 14046 10887 49857
+* 3p + -2 13409 13751 12925 9772 49857
+* 3p + -1 13327 11268 14983 10279 49857
+* 3p + 1 4561 19401 6208 19628 49798
+* 3p + 2 10420 13159 14439 11778 49796
+* 3p + 3 11483 12973 14021 11313 49790
+* 3p + 4 11825 13595 12964 11403 49787
+* 3p + 5 11650 13314 13578 11242 49784
+* 3p + 6 11973 12933 13236 11641 49783
+* 3p + 7 11699 13337 13181 11565 49782
+* 3p + 8 11979 13058 13561 11179 49777
+* 3p + 9 11909 12999 13264 11604 49776
+* 3p + 10 11852 13383 12862 11677 49774
+* 3p - -70 3409 3830 3690 3647 14576
+* 3p - -69 3474 4169 3967 3934 15544
+* 3p - -68 3786 4426 4195 4070 16477
+* 3p - -67 3896 4546 4535 4305 17282
+* 3p - -66 4066 4975 4556 4405 18002
+* 3p - -65 4390 4992 4899 4552 18833
+* 3p - -64 4586 5178 5168 4717 19649
+* 3p - -63 4555 5508 5389 5068 20520
+* 3p - -62 4884 5931 5554 5086 21455
+* 3p - -61 5096 5957 5978 5410 22441
+* 3p - -60 5247 6406 6072 5675 23400
+* 3p - -59 5678 6537 6446 5776 24437
+* 3p - -58 5973 6861 6676 6004 25514
+* 3p - -57 6053 7184 6977 6379 26593
+* 3p - -56 6101 7572 7359 6629 27661
+* 3p - -55 6649 7744 7465 6882 28740
+* 3p - -54 6708 8084 8021 7073 29886
+* 3p - -53 6871 8577 8362 7231 31041
+* 3p - -52 7291 8747 8440 7664 32142
+* 3p - -51 7413 9261 8808 7782 33264
+* 3p - -50 7604 9409 9253 8024 34290
+* 3p - -49 7892 9593 9573 8291 35349
+* 3p - -48 7948 10167 9822 8491 36428
+* 3p - -47 8288 10517 10137 8481 37423
+* 3p - -46 8635 10589 10285 8871 38380
+* 3p - -45 8488 10898 10616 9386 39388
+* 3p - -44 8812 11212 11245 8991 40260
+* 3p - -43 9174 11196 11400 9336 41106
+* 3p - -42 9074 11793 11415 9671 41953
+* 3p - -41 9330 11953 11880 9554 42717
+* 3p - -40 9475 12119 12037 9815 43446
+* 3p - -39 9583 12463 12037 10006 44089
+* 3p - -38 9852 12458 12491 9885 44686
+* 3p - -37 10022 12735 12661 9781 45199
+* 3p - -36 9730 13188 12553 10177 45648
+* 3p - -35 9945 12969 13195 10002 46111
+* 3p - -34 10222 12950 13262 10069 46503
+* 3p - -33 10021 13404 13061 10358 46844
+* 3p - -32 10130 13497 13418 10138 47183
+* 3p - -31 10342 13400 13377 10380 47499
+* 3p - -30 10239 13660 13317 10602 47818
+* 3p - -29 10492 13393 13766 10480 48131
+* 3p - -28 10612 13804 13717 10390 48523
+* 3p - -27 10619 13998 13659 10651 48927
+* 3p - -26 10520 14150 14058 10704 49432
+* 3p - -25 11010 14155 14145 10757 50067
+* 3p - -24 10783 14380 13984 10930 50077
+* 3p - -23 11106 14175 14213 10596 50090
+* 3p - -22 11117 13866 14313 10814 50110
+* 3p - -21 10770 14265 14108 10983 50126
+* 3p - -20 10853 14282 14382 10626 50143
+* 3p - -19 11317 13950 14179 10697 50143
+* 3p - -18 11001 14320 13946 10876 50143
+* 3p - -17 11096 14099 14306 10642 50143
+* 3p - -16 11290 13782 14300 10771 50143
+* 3p - -15 11230 14185 13825 10902 50142
+* 3p - -14 11143 14075 14408 10516 50142
+* 3p - -13 11244 13723 14260 10915 50142
+* 3p - -12 11379 13978 13822 10962 50141
+* 3p - -11 11062 14042 14358 10677 50139
+* 3p - -10 11492 13844 14198 10604 50138
+* 3p - -9 11523 13991 13880 10749 50143
+* 3p - -8 11754 13563 14113 10713 50143
+* 3p - -7 11906 13462 13950 10825 50143
+* 3p - -6 12701 13158 13359 10925 50143
+* 3p - -5 11839 13266 14219 10819 50143
+* 3p - -4 12835 12671 14148 10489 50143
+* 3p - -3 12045 13150 13977 10971 50143
+* 3p - -2 13338 14116 12919 9770 50143
+* 3p - -1 13551 11271 15292 10029 50143
+* 3p - 1 4606 19573 6364 19542 50085
+* 3p - 2 10722 13441 14510 11407 50080
+* 3p - 3 11649 12995 14181 11252 50077
+* 3p - 4 11756 13617 13106 11596 50075
+* 3p - 5 11597 13337 13554 11583 50071
+* 3p - 6 12025 13031 13508 11505 50069
+* 3p - 7 11626 13767 13105 11569 50067
+* 3p - 8 11904 13202 13441 11519 50066
+* 3p - 9 12019 12927 13489 11630 50065
+* 3p - 10 11824 13388 13228 11625 50065
+* 5p + -10 11838 13041 13245 11715 49839
+* 5p + -9 11735 13280 12917 11907 49839
+* 5p + -8 11595 13339 13214 11693 49841
+* 5p + -7 11671 12906 13345 11921 49843
+* 5p + -6 11417 13491 12853 12084 49845
+* 5p + -5 11592 13416 13100 11737 49845
+* 5p + -4 11683 13009 13424 11731 49847
+* 5p + -3 11211 14147 12917 11573 49848
+* 5p + -2 12507 14025 13737 9579 49848
+* 5p + -1 20468 5493 20020 3869 49850
+* 5p + 1 10503 14138 11530 13686 49857
+* 5p + 2 10351 12936 13700 12870 49857
+* 5p + 3 10884 13817 13249 11898 49848
+* 5p + 4 10903 14205 12830 11917 49855
+* 5p + 5 10750 13930 13550 11627 49857
+* 5p + 6 11237 13485 13445 11690 49857
+* 5p + 7 10944 14059 13414 11440 49857
+* 5p + 8 10800 14053 13840 11164 49857
+* 5p + 9 11147 13859 13571 11280 49857
+* 5p + 10 10817 14192 13696 11152 49857
+* 5p + 11 10538 14137 13956 11226 49857
+* 5p + 12 10924 13768 13947 11218 49857
+* 5p + 13 10831 14166 13491 11369 49857
+* 5p + 14 10734 14140 13799 11184 49857
+* 5p + 15 10699 13983 13923 11251 49856
+* 5p + 16 10588 14375 13779 11101 49843
+* 5p + 17 10532 14419 14107 10799 49857
+* 5p + 18 10882 13913 14042 11020 49857
+* 5p + 19 10597 14326 13851 11083 49857
+* 5p + 20 10622 14503 13809 10923 49857
+* 5p + 21 10977 13936 14013 10915 49841
+* 5p + 22 10742 14398 13892 10794 49826
+* 5p + 23 10658 14418 13940 10800 49816
+* 5p + 24 10810 14151 14079 10762 49802
+* 5p + 25 10524 14384 13890 10992 49790
+* 5p + 26 10439 14189 13982 10556 49166
+* 5p + 27 10567 13779 13976 10357 48679
+* 5p + 28 10485 13799 13494 10529 48307
+* 5p + 29 10273 13712 13822 10118 47925
+* 5p + 30 10518 13379 13694 10037 47628
+* 5p + 31 10189 13542 13351 10256 47338
+* 5p + 32 10143 13430 13468 10006 47047
+* 5p + 33 10063 13034 13404 10218 46719
+* 5p + 34 9990 13198 12939 10252 46379
+* 5p + 35 10047 13040 13055 9846 45988
+* 5p + 36 10177 12733 12806 9812 45528
+* 5p + 37 9976 12732 12612 9741 45061
+* 5p + 38 9854 12559 12552 9596 44561
+* 5p + 39 9951 12057 12443 9500 43951
+* 5p + 40 9663 12029 11953 9649 43294
+* 5p + 41 9630 11800 12021 9125 42576
+* 5p + 42 9576 11446 11823 8996 41841
+* 5p + 43 9404 11467 11266 8882 41019
+* 5p + 44 9260 11007 11102 8815 40184
+* 5p + 45 9130 10526 10904 8712 39272
+* 5p + 46 8823 10480 10527 8509 38339
+* 5p + 47 8653 10066 10422 8234 37375
+* 5p + 48 8683 9736 10080 7836 36335
+* 5p + 49 8238 9511 9635 7888 35272
+* 5p + 50 7912 9245 9440 7594 34191
+* 5p + 51 7830 8870 9155 7202 33057
+* 5p + 52 7617 8582 8622 7124 31945
+* 5p + 53 7374 8308 8309 6852 30843
+* 5p + 54 7175 7868 7916 6734 29693
+* 5p + 55 6912 7533 7664 6470 28579
+* 5p + 56 6549 7299 7387 6150 27385
+* 5p + 57 6444 6779 7275 5765 26263
+* 5p + 58 6048 6667 6848 5613 25176
+* 5p + 59 5986 6342 6433 5378 24139
+* 5p + 60 5837 5896 6246 5143 23122
+* 5p + 61 5565 5735 5823 5002 22125
+* 5p + 62 5279 5427 5783 4710 21199
+* 5p + 63 5312 5075 5469 4465 20321
+* 5p + 64 4992 4794 5326 4339 19451
+* 5p + 65 4893 4538 5106 4100 18637
+* 5p + 66 4742 4287 4864 3906 17799
+* 5p + 67 4652 4029 4619 3765 17065
+* 5p + 68 4417 4011 4382 3552 16362
+* 5p + 69 4149 3574 4340 3347 15410
+* 5p + 70 3832 3396 3981 3193 14402
+* 5p - -10 11863 12934 13466 11858 50121
+* 5p - -9 11640 13498 12931 12052 50121
+* 5p - -8 11600 13485 12889 12148 50122
+* 5p - -7 11519 13215 13414 11974 50122
+* 5p - -6 11492 13468 12935 12228 50123
+* 5p - -5 11494 13724 13300 11608 50126
+* 5p - -4 11654 13114 13656 11704 50128
+* 5p - -3 11203 14372 12913 11640 50128
+* 5p - -2 12539 14035 13964 9593 50131
+* 5p - -1 20222 5600 20432 3878 50132
+* 5p - 1 10624 14216 11651 13652 50143
+* 5p - 2 10523 13190 13584 12846 50143
+* 5p - 3 10821 13912 13189 12212 50134
+* 5p - 4 10984 14044 13173 11940 50141
+* 5p - 5 10951 14032 13431 11727 50141
+* 5p - 6 11348 13453 13689 11653 50143
+* 5p - 7 10997 14048 13563 11535 50143
+* 5p - 8 10922 14156 13764 11301 50143
+* 5p - 9 11149 14066 13862 11066 50143
+* 5p - 10 10910 14183 13707 11343 50143
+* 5p - 11 10879 14274 13895 11095 50143
+* 5p - 12 11019 13852 14097 11175 50143
+* 5p - 13 10696 14493 13572 11382 50143
+* 5p - 14 10597 14255 13868 11423 50143
+* 5p - 15 10832 13978 14146 11187 50143
+* 5p - 16 10824 14445 13629 11232 50130
+* 5p - 17 10694 14515 13871 11063 50143
+* 5p - 18 10799 14219 14078 11047 50143
+* 5p - 19 10995 14392 13904 10852 50143
+* 5p - 20 10700 14431 14024 10988 50143
+* 5p - 21 10677 14185 14262 11001 50125
+* 5p - 22 10703 14356 14047 11004 50110
+* 5p - 23 10600 14476 14103 10911 50090
+* 5p - 24 10599 14290 14233 10956 50078
+* 5p - 25 10701 14317 14000 11049 50067
+* 5p - 26 10463 14344 14025 10600 49432
+* 5p - 27 10749 13722 13920 10536 48927
+* 5p - 28 10459 13824 13591 10649 48523
+* 5p - 29 10158 13907 13800 10266 48131
+* 5p - 30 10583 13640 13446 10149 47818
+* 5p - 31 10199 13393 13470 10437 47499
+* 5p - 32 10320 13435 13444 9985 47184
+* 5p - 33 10302 13124 13351 10067 46844
+* 5p - 34 10223 13169 12853 10258 46503
+* 5p - 35 9939 13213 13011 9948 46111
+* 5p - 36 10123 12891 13076 9558 45648
+* 5p - 37 10117 12725 12574 9783 45199
+* 5p - 38 9817 12661 12641 9567 44686
+* 5p - 39 9873 12027 12595 9594 44089
+* 5p - 40 9732 12145 12114 9455 43446
+* 5p - 41 9661 11895 12005 9157 42718
+* 5p - 42 9514 11446 11869 9124 41953
+* 5p - 43 9439 11227 11281 9159 41106
+* 5p - 44 9136 11207 11102 8815 40260
+* 5p - 45 9106 10707 10841 8734 39388
+* 5p - 46 8815 10488 10388 8690 38381
+* 5p - 47 8687 10080 10374 8282 37423
+* 5p - 48 8532 9637 10254 8005 36428
+* 5p - 49 8287 9665 9552 7845 35349
+* 5p - 50 8142 9242 9365 7541 34290
+* 5p - 51 7856 8849 9258 7301 33264
+* 5p - 52 7470 8583 8840 7249 32142
+* 5p - 53 7383 8278 8370 7011 31042
+* 5p - 54 7112 7956 8189 6629 29886
+* 5p - 55 6759 7691 7819 6471 28740
+* 5p - 56 6722 7186 7514 6239 27661
+* 5p - 57 6423 6900 7380 5891 26594
+* 5p - 58 6220 6639 6837 5818 25514
+* 5p - 59 6030 6338 6585 5484 24437
+* 5p - 60 5875 6064 6219 5242 23400
+* 5p - 61 5596 5856 5941 5049 22442
+* 5p - 62 5384 5531 5779 4762 21456
+* 5p - 63 5288 5155 5516 4560 20519
+* 5p - 64 5225 4891 5123 4411 19650
+* 5p - 65 4966 4706 5062 4099 18833
+* 5p - 66 4872 4330 4940 3861 18003
+* 5p - 67 4769 4222 4556 3735 17282
+* 5p - 68 4306 3934 4539 3699 16478
+* 5p - 69 4147 3646 4500 3251 15544
+* 5p - 70 3856 3411 4097 3213 14577
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/dnacomp_genome.csv
new file mode 100644
index 0000000..2fc7659
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.245290724081,0.254598401178,0.255055853499,0.245055021242
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/lgdistribution.txt
new file mode 100644
index 0000000..a74a58d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/lgdistribution.txt
@@ -0,0 +1,300 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 23 4
++ 24 39
++ 25 171
++ 26 198
++ 27 131
++ 28 150
++ 29 159
++ 30 166
++ 31 177
++ 32 197
++ 33 232
++ 34 291
++ 35 363
++ 36 384
++ 37 422
++ 38 500
++ 39 584
++ 40 659
++ 41 674
++ 42 763
++ 43 770
++ 44 857
++ 45 893
++ 46 926
++ 47 1000
++ 48 1020
++ 49 1036
++ 50 1104
++ 51 1089
++ 52 1084
++ 53 1125
++ 54 1098
++ 55 1158
++ 56 1100
++ 57 1065
++ 58 1023
++ 59 987
++ 60 974
++ 61 917
++ 62 859
++ 63 872
++ 64 798
++ 65 803
++ 66 717
++ 67 723
++ 68 887
++ 69 1073
++ 70 2151
++ 71 933
++ 72 1471
++ 73 1506
++ 74 590
++ 75 450
++ 76 420
++ 77 444
++ 78 381
++ 79 360
++ 80 352
++ 81 291
++ 82 266
++ 83 311
++ 84 249
++ 85 251
++ 86 210
++ 87 240
++ 88 190
++ 89 171
++ 90 175
++ 91 145
++ 92 149
++ 93 136
++ 94 370
++ 95 86
++ 96 81
++ 97 74
++ 98 107
++ 99 81
++ 100 60
++ 101 68
++ 102 62
++ 103 53
++ 104 46
++ 105 36
++ 106 40
++ 107 41
++ 108 48
++ 109 31
++ 110 31
++ 111 26
++ 112 24
++ 113 26
++ 114 17
++ 115 16
++ 116 24
++ 117 15
++ 118 32
++ 119 24
++ 120 17
++ 121 12
++ 122 7
++ 123 13
++ 124 14
++ 125 18
++ 126 7
++ 127 12
++ 128 9
++ 129 15
++ 130 12
++ 131 14
++ 132 9
++ 133 10
++ 134 5
++ 135 3
++ 136 3
++ 137 4
++ 138 4
++ 139 8
++ 140 3
++ 141 2
++ 142 4
++ 143 4
++ 144 2
++ 145 4
++ 146 2
++ 147 2
++ 148 3
++ 149 1
++ 150 1
++ 152 1
++ 153 7
++ 154 3
++ 155 1
++ 156 3
++ 157 3
++ 158 2
++ 161 2
++ 163 3
++ 164 1
++ 167 2
++ 168 2
++ 169 1
++ 170 1
++ 172 1
++ 173 2
++ 174 2
++ 178 1
++ 179 1
++ 183 2
+- 23 4
+- 24 40
+- 25 193
+- 26 190
+- 27 162
+- 28 165
+- 29 150
+- 30 175
+- 31 175
+- 32 230
+- 33 243
+- 34 302
+- 35 379
+- 36 360
+- 37 435
+- 38 499
+- 39 564
+- 40 657
+- 41 690
+- 42 785
+- 43 789
+- 44 820
+- 45 953
+- 46 902
+- 47 960
+- 48 1034
+- 49 1024
+- 50 998
+- 51 1105
+- 52 1062
+- 53 1138
+- 54 1121
+- 55 1048
+- 56 1050
+- 57 1060
+- 58 1044
+- 59 1026
+- 60 941
+- 61 953
+- 62 933
+- 63 857
+- 64 797
+- 65 815
+- 66 720
+- 67 772
+- 68 898
+- 69 1031
+- 70 2198
+- 71 1007
+- 72 1474
+- 73 1506
+- 74 631
+- 75 477
+- 76 407
+- 77 398
+- 78 378
+- 79 373
+- 80 371
+- 81 332
+- 82 302
+- 83 288
+- 84 261
+- 85 222
+- 86 219
+- 87 217
+- 88 208
+- 89 197
+- 90 151
+- 91 156
+- 92 156
+- 93 126
+- 94 370
+- 95 87
+- 96 95
+- 97 83
+- 98 115
+- 99 80
+- 100 70
+- 101 53
+- 102 51
+- 103 48
+- 104 41
+- 105 47
+- 106 37
+- 107 36
+- 108 43
+- 109 32
+- 110 35
+- 111 26
+- 112 23
+- 113 24
+- 114 28
+- 115 22
+- 116 24
+- 117 13
+- 118 18
+- 119 15
+- 120 16
+- 121 11
+- 122 15
+- 123 12
+- 124 13
+- 125 13
+- 126 20
+- 127 10
+- 128 14
+- 129 10
+- 130 7
+- 131 12
+- 132 12
+- 133 12
+- 134 7
+- 135 6
+- 136 4
+- 137 6
+- 138 4
+- 139 4
+- 140 3
+- 141 2
+- 142 8
+- 144 3
+- 145 3
+- 146 1
+- 147 2
+- 148 1
+- 150 3
+- 151 6
+- 152 3
+- 153 2
+- 155 1
+- 157 3
+- 158 2
+- 159 2
+- 160 2
+- 161 2
+- 164 1
+- 166 1
+- 167 1
+- 169 1
+- 170 3
+- 171 2
+- 172 1
+- 173 1
+- 174 1
+- 176 1
+- 177 1
+- 179 1
+- 180 1
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/misincorporation.txt
new file mode 100644
index 0000000..3165243
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.Pi_nucl.mapDamage/Pi1845A_id_CGCTAT/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 12707 11749 14870 10477 49803 597 221 369 145 133 136 349 338 323 296 143 203 0 0 0 0 0 0 0 0 215
+* 3p + 2 12488 14052 13117 10132 49789 570 151 234 167 165 117 250 453 196 405 186 131 5 3 0 5 1 4 14 1 208
+* 3p + 3 11456 13480 13775 11054 49765 355 176 256 124 107 147 308 303 278 279 98 141 1 15 8 4 8 38 1 3 201
+* 3p + 4 11787 13316 13765 10892 49760 374 91 170 129 103 59 219 435 193 439 98 62 1 7 5 3 43 4 9 5 200
+* 3p + 5 11469 13649 13857 10778 49753 264 100 163 72 80 56 261 263 178 244 64 57 6 8 12 3 37 24 4 11 194
+* 3p + 6 11549 13472 13498 11062 49581 254 72 97 71 73 37 114 310 87 315 35 56 2 10 14 4 192 32 13 17 191
+* 3p + 7 11500 13471 13705 10926 49602 153 72 81 77 66 22 92 139 84 127 47 45 19 68 73 24 130 21 17 70 188
+* 3p + 8 11272 13509 14107 10720 49608 165 53 58 82 59 22 49 126 52 116 27 27 12 52 57 14 170 10 11 45 187
+* 3p + 9 11387 13886 13643 10805 49721 123 68 70 64 60 21 34 96 65 74 25 23 24 43 48 18 81 12 6 26 186
+* 3p + 10 11308 13638 13969 10852 49767 98 51 56 59 54 18 39 68 41 54 38 22 19 21 28 16 50 5 2 24 185
+* 3p + 11 11111 13668 14340 10687 49806 133 59 53 63 59 18 35 42 44 29 37 25 10 32 30 13 26 7 0 11 184
+* 3p + 12 11142 13873 13836 10967 49818 106 63 49 78 52 21 36 42 29 30 29 21 12 10 15 14 18 3 5 5 182
+* 3p + 13 11419 13419 14307 10685 49830 94 62 63 65 50 12 27 34 45 17 22 28 7 6 11 10 6 5 7 7 180
+* 3p + 14 11173 13945 14192 10533 49843 93 53 59 63 51 28 28 30 51 13 27 23 4 3 8 7 4 1 5 4 179
+* 3p + 15 10992 14055 13938 10850 49835 82 71 50 57 47 13 24 23 47 15 23 12 4 6 8 4 6 6 3 7 178
+* 3p + 16 11184 13957 14199 10494 49834 95 52 54 58 51 22 25 22 35 16 19 27 2 5 1 7 6 6 4 7 177
+* 3p + 17 10885 13892 14255 10801 49833 83 56 66 43 57 14 24 26 30 19 14 18 2 3 2 5 7 7 6 4 176
+* 3p + 18 10885 14182 13846 10926 49839 94 45 54 49 46 23 27 25 41 15 23 18 0 2 3 5 6 4 4 4 175
+* 3p + 19 11004 13876 14273 10683 49836 109 58 60 64 46 17 24 26 30 17 19 24 3 4 4 7 3 9 4 5 175
+* 3p + 20 10887 13931 14225 10792 49835 96 50 48 71 52 15 26 31 26 16 27 25 2 5 5 4 6 6 8 2 175
+* 3p + 21 10767 14281 13963 10811 49822 68 64 50 62 45 12 24 21 36 15 19 21 4 7 2 2 2 2 10 6 174
+* 3p + 22 10880 14006 14104 10829 49819 80 57 49 55 39 15 29 18 41 15 15 16 9 3 1 2 1 1 2 2 173
+* 3p + 23 10853 14188 14333 10432 49806 60 55 59 48 52 14 37 24 30 19 20 21 2 4 1 5 3 1 2 5 171
+* 3p + 24 10835 14066 14032 10863 49796 72 45 62 49 32 11 19 22 39 15 20 25 6 2 4 7 4 3 1 0 171
+* 3p + 25 10998 13916 14160 10708 49782 76 58 43 54 55 16 15 33 30 18 32 28 7 2 5 8 4 3 2 0 169
+* 3p + 26 10860 13975 14126 10304 49265 81 79 60 52 50 18 32 31 28 17 25 17 8 5 5 5 0 3 3 1 168
+* 3p + 27 10662 13958 13549 10562 48731 76 64 55 54 50 17 24 24 35 22 28 28 3 3 4 2 2 3 3 2 168
+* 3p + 28 10531 13490 13898 10416 48335 76 77 45 50 41 15 21 19 38 15 19 26 2 4 2 2 5 3 4 4 168
+* 3p + 29 10337 13799 13663 10153 47952 82 90 55 60 48 9 30 22 26 12 23 23 2 3 2 1 2 4 2 3 168
+* 3p + 30 10174 13658 13295 10516 47643 72 81 55 57 42 10 26 23 32 16 23 22 1 2 2 1 2 1 1 1 167
+* 3p + 31 10346 13284 13389 10330 47349 69 71 46 51 40 16 24 14 28 26 16 22 3 3 6 0 0 0 3 1 166
+* 3p + 32 10326 13314 13408 10007 47055 66 71 51 53 46 17 11 30 27 14 15 24 3 1 0 5 2 1 1 0 164
+* 3p + 33 10039 13368 12965 10354 46726 61 71 39 54 45 17 15 32 26 14 26 28 0 2 2 3 0 3 1 1 158
+* 3p + 34 10106 12954 13116 10213 46389 71 62 49 65 30 12 24 19 30 17 14 22 6 3 4 2 1 1 1 0 157
+* 3p + 35 9982 13134 13010 9866 45992 51 76 54 46 45 18 21 19 24 12 12 26 3 2 2 0 2 1 2 0 151
+* 3p + 36 9838 13025 12733 9930 45526 55 70 55 35 25 9 20 19 20 17 25 12 0 2 7 0 0 2 1 1 144
+* 3p + 37 9922 12627 12699 9815 45063 58 87 53 41 27 19 13 15 17 21 14 21 2 2 4 0 0 2 3 2 132
+* 3p + 38 9733 12432 12407 9992 44564 52 89 41 39 30 20 21 17 19 4 15 29 0 1 2 2 1 0 0 2 117
+* 3p + 39 9505 12552 12185 9720 43962 65 84 54 51 20 10 13 21 13 7 24 20 3 2 1 2 0 2 2 0 109
+* 3p + 40 9488 12094 12024 9688 43294 52 89 38 29 25 16 21 20 15 9 18 16 1 2 2 3 1 2 2 0 105
+* 3p + 41 9324 11928 11789 9533 42574 74 71 46 34 21 11 18 18 15 15 12 18 1 1 0 3 4 2 2 1 97
+* 3p + 42 9098 11846 11512 9390 41846 54 89 41 53 32 17 6 17 18 17 14 18 2 4 2 2 2 0 2 3 83
+* 3p + 43 9168 11387 11149 9315 41019 36 88 31 37 23 11 8 5 24 13 10 20 3 1 3 3 0 1 3 2 69
+* 3p + 44 8799 11302 11030 9057 40188 54 102 47 43 29 8 15 21 18 12 16 19 2 3 2 3 0 0 4 1 65
+* 3p + 45 8741 10943 10487 9111 39282 50 92 36 40 21 11 8 17 14 11 19 20 0 1 1 2 3 5 0 2 58
+* 3p + 46 8434 10811 10417 8682 38344 38 104 51 39 24 9 14 14 18 8 4 18 3 1 2 2 2 4 0 1 52
+* 3p + 47 8265 10492 10113 8512 37382 43 81 43 36 24 15 7 14 24 13 11 14 3 0 0 1 2 1 0 2 44
+* 3p + 48 8031 10138 9645 8527 36341 52 81 26 31 16 10 12 11 15 19 10 12 0 2 5 1 0 1 0 1 34
+* 3p + 49 8003 9719 9449 8108 35279 30 93 40 42 20 14 11 14 11 9 8 10 1 4 4 3 1 0 1 0 26
+* 3p + 50 7532 9474 9368 7827 34201 38 89 24 27 16 5 10 18 11 10 10 16 3 2 4 5 0 1 0 2 16
+* 3p + 51 7387 9164 8821 7691 33063 42 91 20 27 20 11 11 8 17 8 6 16 4 1 2 1 0 1 0 1 11
+* 3p + 52 7204 8832 8441 7478 31955 37 78 25 27 20 2 8 15 10 9 8 18 3 2 1 3 0 0 0 0 10
+* 3p + 53 6929 8405 8237 7278 30849 39 80 29 39 15 6 8 10 11 12 4 10 1 2 2 0 4 0 0 1 9
+* 3p + 54 6603 8171 7976 6947 29697 27 99 24 23 17 5 6 6 13 8 5 15 0 1 1 3 6 0 0 1 8
+* 3p + 55 6525 7639 7584 6833 28581 20 76 26 25 9 10 5 9 12 9 7 16 1 1 0 1 0 3 1 1 7
+* 3p + 56 6106 7559 7330 6401 27396 32 73 18 29 6 2 6 7 6 9 5 9 2 0 2 1 0 1 0 1 6
+* 3p + 57 5971 7186 6760 6354 26271 25 67 19 29 15 6 8 9 9 8 6 15 1 1 0 0 0 2 0 2 6
+* 3p + 58 5630 6777 6699 6079 25185 27 69 29 22 8 9 8 8 7 7 7 18 0 0 2 2 0 0 0 1 6
+* 3p + 59 5456 6540 6481 5670 24147 20 71 21 25 8 5 6 6 7 8 5 12 1 1 1 1 0 0 1 0 3
+* 3p + 60 5322 6160 6013 5634 23129 19 73 22 12 17 3 7 14 7 5 7 12 0 0 2 1 2 1 1 0 3
+* 3p + 61 4980 6092 5792 5267 22131 27 69 20 19 7 7 4 6 8 8 4 15 2 1 0 0 2 2 0 1 2
+* 3p + 62 4776 5745 5492 5191 21204 21 47 16 17 8 7 7 3 8 2 3 16 1 0 2 0 0 1 0 2 2
+* 3p + 63 4580 5612 5247 4887 20326 15 51 16 22 6 4 9 11 4 10 3 8 1 2 0 1 1 1 2 2 2
+* 3p + 64 4422 5202 5082 4747 19453 14 49 8 11 9 8 3 6 2 4 3 7 0 1 4 0 0 3 0 0 1
+* 3p + 65 4326 5121 4738 4457 18642 15 62 20 18 6 2 6 3 3 3 7 10 2 0 1 1 0 2 0 1 1
+* 3p + 66 4068 4879 4517 4359 17823 20 79 17 18 7 3 2 7 2 10 1 2 0 1 1 1 0 0 1 1 1
+* 3p + 67 3973 4659 4351 4105 17088 13 55 15 14 9 8 5 5 3 6 4 7 0 0 0 1 3 0 0 0 1
+* 3p + 68 3764 4434 4202 3982 16382 17 72 17 13 9 3 4 6 7 1 7 9 1 1 1 1 4 1 0 0 1
+* 3p + 69 3469 4158 3937 3887 15451 20 76 13 11 8 5 3 3 5 2 5 10 1 0 0 0 2 0 1 1 1
+* 3p + 70 3343 3833 3623 3649 14448 13 52 10 11 7 2 2 6 7 4 2 3 1 0 0 1 3 0 2 1 1
+* 3p - 1 12995 11761 15065 10268 50089 557 201 378 119 136 158 384 324 347 321 159 186 0 0 0 0 0 0 0 0 219
+* 3p - 2 12447 14363 13070 10166 50046 561 130 235 177 156 122 252 435 208 391 198 117 1 1 3 4 5 6 24 9 217
+* 3p - 3 11576 13560 13789 11057 49982 368 187 236 133 102 140 280 289 247 252 97 157 1 9 5 2 27 54 14 20 216
+* 3p - 4 11824 13112 14108 10888 49932 377 98 174 106 82 61 209 416 151 401 81 80 2 9 10 7 103 19 19 30 212
+* 3p - 5 11305 13617 13947 11067 49936 253 96 165 101 78 45 222 262 165 219 51 71 4 21 27 9 68 37 13 59 208
+* 3p - 6 11740 13521 13399 11248 49908 261 76 99 71 79 30 100 361 78 346 39 53 14 33 45 12 118 34 15 50 207
+* 3p - 7 11514 13610 13837 10992 49953 174 57 91 68 60 30 83 179 66 174 40 31 15 45 56 35 61 22 12 84 205
+* 3p - 8 11377 13689 14092 10843 50001 158 68 61 57 52 26 62 143 52 141 31 20 14 29 51 19 61 12 11 51 201
+* 3p - 9 11367 14038 13838 10832 50075 139 60 100 73 72 30 51 95 47 99 25 31 11 28 43 25 27 13 3 21 201
+* 3p - 10 11344 13839 14246 10650 50079 132 66 61 54 65 16 30 73 46 55 26 25 21 20 24 14 29 7 6 13 200
+* 3p - 11 11009 14070 14325 10701 50105 100 76 66 69 56 18 30 48 40 35 29 31 11 30 19 11 13 5 6 8 198
+* 3p - 12 11344 13919 13859 11001 50123 97 55 55 65 63 16 25 40 41 23 30 15 4 8 18 14 4 7 3 3 196
+* 3p - 13 11229 13682 14266 10936 50113 92 51 68 63 57 13 39 35 33 14 37 24 8 7 7 9 9 3 10 6 195
+* 3p - 14 11111 14090 14411 10503 50115 99 56 66 54 44 22 31 38 43 21 25 23 3 7 7 8 12 4 2 8 192
+* 3p - 15 11216 14135 13836 10931 50118 89 52 58 64 59 21 22 30 33 29 21 14 4 1 6 4 6 4 10 3 192
+* 3p - 16 11277 13731 14349 10765 50122 113 60 60 66 59 12 25 20 31 14 31 21 5 4 5 6 4 7 3 4 189
+* 3p - 17 11093 14038 14282 10715 50128 91 52 57 59 53 16 25 23 39 22 22 17 3 7 6 8 1 1 4 6 188
+* 3p - 18 10964 14309 13965 10886 50124 82 53 57 56 42 13 28 30 30 19 37 20 4 3 7 4 4 7 3 3 187
+* 3p - 19 11306 13896 14209 10710 50121 98 60 53 54 55 16 26 30 29 22 27 17 2 6 4 5 4 3 4 8 187
+* 3p - 20 10808 14281 14379 10649 50117 94 46 54 67 45 22 29 30 31 14 15 17 3 5 3 2 6 1 7 9 186
+* 3p - 21 10736 14264 14134 10983 50117 78 67 48 57 50 12 19 23 35 13 16 28 2 5 4 5 1 4 0 4 186
+* 3p - 22 11116 13825 14319 10839 50099 84 62 54 58 49 13 21 30 37 23 13 17 4 6 3 6 5 2 4 0 185
+* 3p - 23 11126 14146 14241 10569 50082 79 57 66 69 42 30 24 32 30 18 12 26 2 0 3 4 3 1 1 2 185
+* 3p - 24 10791 14400 13970 10904 50065 69 78 49 55 53 25 25 26 35 23 24 28 2 3 1 6 3 2 2 3 185
+* 3p - 25 10992 14154 14169 10745 50060 70 67 44 56 45 25 28 36 37 20 28 30 3 4 2 5 1 3 3 0 181
+* 3p - 26 10489 14152 14109 10763 49513 79 72 50 50 51 23 27 28 37 23 32 24 3 4 3 4 1 2 0 0 181
+* 3p - 27 10656 14030 13657 10647 48990 85 72 51 46 54 17 26 33 37 19 12 23 3 1 1 3 1 1 1 1 180
+* 3p - 28 10614 13783 13761 10408 48566 79 65 51 58 48 17 19 16 30 14 17 20 3 1 2 2 1 0 1 1 179
+* 3p - 29 10494 13435 13807 10425 48161 76 76 43 48 50 12 28 26 19 13 22 30 1 0 2 5 2 2 0 1 179
+* 3p - 30 10242 13685 13327 10575 47829 70 78 49 59 41 9 27 30 25 15 19 26 1 5 1 2 5 4 1 2 179
+* 3p - 31 10348 13365 13409 10385 47507 69 81 51 66 33 18 20 19 27 19 28 18 5 3 1 4 0 3 3 2 178
+* 3p - 32 10120 13548 13414 10108 47190 62 80 56 45 28 23 18 22 27 12 17 25 4 5 2 1 0 2 0 2 176
+* 3p - 33 10049 13373 13078 10356 46856 54 75 51 58 41 18 18 13 36 19 19 27 2 2 3 2 1 0 1 3 173
+* 3p - 34 10232 12957 13242 10076 46507 48 70 47 67 43 16 20 21 31 17 17 29 1 2 2 4 3 0 1 2 164
+* 3p - 35 9968 12972 13190 9980 46110 54 73 56 49 39 8 15 18 24 13 16 28 2 5 2 1 1 4 1 1 156
+* 3p - 36 9749 13215 12563 10120 45647 55 92 40 50 31 13 22 18 18 12 10 16 1 4 5 4 2 1 3 2 148
+* 3p - 37 9991 12751 12700 9761 45203 64 85 46 38 28 8 12 19 28 17 21 17 3 3 1 3 1 0 1 3 137
+* 3p - 38 9802 12532 12496 9860 44690 60 94 37 35 33 17 15 30 35 18 12 21 3 0 2 6 1 2 0 3 131
+* 3p - 39 9590 12522 12019 9964 44095 61 86 44 40 30 8 7 16 30 16 19 18 4 1 3 3 0 3 1 1 123
+* 3p - 40 9480 12107 12098 9767 43452 70 77 45 48 22 15 15 15 16 11 16 27 2 1 1 4 3 1 0 1 114
+* 3p - 41 9326 12006 11907 9484 42723 56 95 43 30 24 11 17 16 16 9 17 18 1 1 1 7 4 1 2 0 104
+* 3p - 42 9065 11869 11392 9636 41962 42 81 38 34 19 10 16 14 21 10 11 18 3 3 1 3 0 0 0 1 91
+* 3p - 43 9198 11223 11425 9261 41107 40 80 44 37 20 9 10 14 15 6 14 19 2 2 1 4 2 1 1 0 81
+* 3p - 44 8813 11215 11278 8956 40262 41 86 40 34 19 11 11 15 16 14 25 21 2 5 3 4 0 3 0 2 75
+* 3p - 45 8484 10947 10647 9309 39387 60 98 20 39 22 7 17 16 19 8 20 15 1 2 3 2 2 1 0 2 71
+* 3p - 46 8593 10634 10321 8845 38393 63 87 44 53 20 8 14 18 18 11 8 22 3 0 4 1 0 1 0 0 61
+* 3p - 47 8286 10570 10139 8435 37430 45 90 40 40 18 8 7 10 13 11 12 17 0 2 3 2 0 2 2 1 50
+* 3p - 48 7950 10241 9806 8440 36437 39 87 31 31 11 11 5 14 12 10 6 17 0 0 4 2 1 1 1 1 43
+* 3p - 49 7901 9615 9610 8231 35357 41 82 39 28 20 14 3 14 9 10 7 18 2 1 2 1 0 2 3 0 31
+* 3p - 50 7593 9469 9279 7955 34296 40 79 35 22 14 9 10 8 15 11 10 18 1 1 0 3 1 1 1 1 22
+* 3p - 51 7407 9340 8799 7729 33275 40 73 45 22 15 3 6 14 15 12 7 17 1 0 2 0 0 1 1 0 16
+* 3p - 52 7292 8817 8450 7590 32149 32 100 34 31 10 12 5 9 14 11 4 16 2 2 1 0 0 1 0 1 13
+* 3p - 53 6880 8629 8384 7161 31054 38 113 31 31 18 6 3 11 7 6 7 12 3 1 0 1 2 1 0 0 8
+* 3p - 54 6674 8155 8056 7009 29894 39 95 21 26 9 6 6 10 11 8 9 11 1 2 2 0 2 0 2 0 7
+* 3p - 55 6648 7810 7454 6833 28745 34 79 25 24 18 5 14 14 17 10 9 9 1 0 1 1 0 1 1 0 5
+* 3p - 56 6106 7665 7345 6552 27668 36 94 31 27 13 9 3 13 11 7 5 8 0 3 2 2 0 0 0 1 3
+* 3p - 57 6044 7207 6992 6353 26596 26 92 25 16 12 3 8 10 7 6 7 10 1 0 0 1 0 0 0 2 3
+* 3p - 58 6010 6918 6660 5928 25516 15 75 25 25 13 9 5 8 9 7 8 6 0 1 1 1 0 2 1 2 1
+* 3p - 59 5670 6608 6464 5705 24447 22 76 18 20 10 5 5 7 14 5 6 16 2 0 2 1 1 4 0 0 1
+* 3p - 60 5275 6460 6070 5601 23406 21 71 20 12 16 8 8 7 8 3 10 12 0 2 2 2 0 1 0 0 1
+* 3p - 61 5110 5986 5982 5374 22452 28 76 20 21 15 6 3 4 8 3 8 5 0 1 3 1 0 0 0 0 1
+* 3p - 62 4857 5976 5583 5047 21463 26 56 22 22 5 14 4 11 9 5 9 4 1 1 0 2 0 1 0 0 1
+* 3p - 63 4570 5561 5387 5008 20526 21 55 18 12 8 10 10 8 5 9 3 8 0 0 1 1 1 0 0 0 0
+* 3p - 64 4569 5247 5171 4678 19665 17 67 15 20 7 7 3 13 6 4 3 4 1 1 0 2 0 0 0 0 0
+* 3p - 65 4389 5039 4911 4511 18850 14 58 19 16 10 3 6 13 4 6 9 6 0 0 0 0 0 0 0 0 0
+* 3p - 66 4062 4997 4565 4392 18016 18 40 9 19 8 5 3 6 4 2 2 4 0 0 1 0 1 0 2 0 0
+* 3p - 67 3912 4610 4547 4236 17305 13 66 18 4 6 4 2 11 6 5 2 4 0 0 1 1 1 1 0 0 0
+* 3p - 68 3775 4466 4222 4044 16507 18 58 12 20 9 4 1 2 4 2 4 5 0 0 2 0 2 0 0 0 0
+* 3p - 69 3508 4210 3945 3935 15598 9 48 11 15 6 5 3 7 9 4 4 4 0 1 0 0 0 0 0 0 0
+* 3p - 70 3409 3898 3737 3588 14632 14 67 9 10 6 3 5 5 3 6 3 7 2 0 0 1 0 0 0 0 0
+* 5p + 1 10477 14480 11642 13253 49852 117 467 77 73 99 63 62 81 54 67 93 96 0 0 0 0 0 0 0 0 21
+* 5p + 2 10376 13236 13732 12506 49850 81 398 75 66 62 53 43 35 47 43 49 66 2 5 7 4 1 1 1 0 15
+* 5p + 3 10902 13936 13289 11705 49832 60 233 60 65 68 33 29 37 49 42 41 74 8 5 6 7 3 4 3 3 6
+* 5p + 4 10925 14315 12832 11760 49832 49 190 56 58 83 27 27 65 38 26 32 53 6 7 7 3 5 8 4 4 5
+* 5p + 5 10769 14037 13515 11505 49826 52 173 61 49 51 21 27 29 32 25 15 24 8 4 8 6 8 8 7 6 4
+* 5p + 6 11221 13570 13440 11606 49837 58 151 48 53 40 17 20 23 31 24 15 24 8 9 4 7 11 4 1 2 3
+* 5p + 7 10996 14093 13395 11351 49835 55 135 59 67 48 26 25 14 25 10 21 19 6 4 5 5 1 9 5 6 3
+* 5p + 8 10842 14091 13803 11100 49836 45 119 61 53 52 18 24 23 26 21 19 22 5 4 3 4 5 4 6 5 3
+* 5p + 9 11167 13894 13547 11225 49833 48 103 54 54 51 14 26 22 24 14 16 22 1 3 3 3 9 5 3 5 3
+* 5p + 10 10855 14204 13674 11106 49839 48 96 57 51 33 15 17 19 37 14 18 20 3 1 4 1 5 9 1 2 3
+* 5p + 11 10582 14176 13937 11145 49840 40 107 54 39 39 28 16 10 23 16 19 26 0 5 4 4 4 2 3 7 2
+* 5p + 12 10977 13763 13910 11182 49832 49 87 69 49 45 25 12 13 27 15 12 16 1 2 8 3 6 5 4 9 2
+* 5p + 13 10854 14168 13459 11353 49834 55 83 61 68 49 14 30 28 34 15 15 33 1 4 5 6 5 6 3 9 2
+* 5p + 14 10740 14147 13795 11161 49843 59 89 58 64 36 11 20 20 33 14 18 26 4 3 1 3 5 2 2 5 2
+* 5p + 15 10728 13998 13905 11211 49842 38 93 54 53 49 20 14 20 29 18 10 21 4 4 6 17 3 1 8 2 2
+* 5p + 16 10584 14339 13807 11096 49826 65 90 43 86 58 20 29 35 38 31 50 26 16 10 21 11 2 7 4 4 2
+* 5p + 17 10536 14423 14097 10791 49847 64 96 45 48 43 13 27 25 39 14 21 27 12 26 3 5 0 3 6 1 2
+* 5p + 18 10883 13933 14003 11029 49848 57 78 53 63 44 17 38 24 44 22 14 28 12 5 2 12 2 2 1 4 2
+* 5p + 19 10619 14321 13847 11064 49851 51 90 46 63 52 12 20 23 27 12 13 19 10 9 6 5 2 1 2 1 2
+* 5p + 20 10652 14500 13782 10911 49845 63 71 58 61 48 15 30 26 33 25 29 23 12 5 10 9 5 2 3 2 2
+* 5p + 21 11001 13920 13987 10926 49834 72 87 72 54 74 17 32 37 47 16 21 23 8 8 5 7 0 3 4 0 2
+* 5p + 22 10744 14398 13908 10769 49819 70 98 55 65 65 14 22 34 32 25 29 27 7 3 8 3 3 0 2 1 2
+* 5p + 23 10662 14444 13868 10834 49808 67 83 60 68 51 16 42 34 61 27 31 12 4 2 8 6 1 2 2 3 1
+* 5p + 24 10837 14133 14056 10769 49795 63 69 59 76 45 25 28 31 38 15 31 20 4 2 5 3 1 2 4 1 1
+* 5p + 25 10561 14369 13868 10984 49782 60 84 59 64 75 14 34 29 34 17 23 25 7 4 2 3 1 3 2 2 1
+* 5p + 26 10499 14149 13993 10621 49262 59 67 65 54 62 10 24 18 45 18 30 21 6 5 3 7 2 1 4 3 1
+* 5p + 27 10621 13784 13945 10383 48733 61 77 71 65 57 6 45 26 39 16 40 25 3 1 0 4 0 2 4 2 1
+* 5p + 28 10484 13802 13511 10548 48345 67 66 55 58 39 14 25 20 30 19 21 16 1 2 4 3 1 0 3 2 1
+* 5p + 29 10262 13724 13838 10133 47957 73 60 44 62 41 11 31 23 44 14 23 29 2 1 3 4 2 3 1 0 1
+* 5p + 30 10520 13378 13698 10046 47642 84 74 55 55 30 20 24 19 37 12 26 14 0 2 2 0 3 1 2 0 1
+* 5p + 31 10163 13551 13384 10250 47348 80 66 45 52 38 13 23 23 36 11 17 15 1 2 3 3 2 1 1 0 1
+* 5p + 32 10129 13426 13492 10007 47054 90 65 50 48 34 8 15 23 31 7 19 21 2 2 1 1 0 3 2 0 1
+* 5p + 33 10019 13040 13439 10230 46728 87 55 43 51 27 8 21 18 26 10 17 19 1 2 4 1 0 1 1 1 1
+* 5p + 34 9996 13198 12926 10265 46385 62 67 46 51 30 9 26 15 44 13 12 12 4 0 1 4 1 2 3 0 1
+* 5p + 35 10020 13024 13080 9867 45991 71 43 36 39 38 7 15 24 31 17 18 26 2 0 2 2 3 0 2 1 1
+* 5p + 36 10147 12737 12824 9814 45522 70 65 38 58 40 12 21 31 31 9 18 14 0 1 2 3 5 1 3 0 1
+* 5p + 37 9950 12738 12632 9747 45067 79 59 50 67 28 8 27 21 19 5 18 13 0 1 1 1 1 1 1 0 1
+* 5p + 38 9839 12549 12567 9606 44561 66 55 50 60 44 15 12 34 20 15 22 14 1 1 3 1 4 1 0 1 1
+* 5p + 39 9939 12062 12453 9505 43959 70 56 39 33 35 6 19 26 31 16 11 16 0 2 1 2 2 1 1 1 1
+* 5p + 40 9616 12029 11980 9662 43287 87 42 53 42 28 12 26 25 25 24 13 14 0 2 2 1 5 1 2 3 1
+* 5p + 41 9594 11838 12019 9123 42574 96 62 43 44 33 7 25 18 19 11 10 18 0 1 0 2 0 3 1 3 1
+* 5p + 42 9511 11467 11886 8979 41843 100 64 51 44 25 6 14 25 19 13 17 13 0 1 2 1 3 1 2 1 1
+* 5p + 43 9339 11494 11303 8873 41009 82 57 37 34 22 6 15 25 18 9 10 15 0 2 2 0 6 3 2 3 1
+* 5p + 44 9216 10997 11156 8821 40190 88 47 38 53 18 6 16 16 21 12 16 19 1 1 2 1 0 0 0 1 1
+* 5p + 45 9050 10560 10959 8716 39285 99 52 47 41 23 5 17 22 17 17 10 18 1 2 3 4 1 3 0 1 1
+* 5p + 46 8787 10459 10582 8517 38345 96 39 48 40 24 9 12 15 19 13 16 20 0 2 6 3 1 2 1 0 1
+* 5p + 47 8584 10071 10483 8242 37380 107 36 33 33 12 10 17 17 25 7 17 21 1 3 3 2 3 2 0 0 0
+* 5p + 48 8605 9768 10132 7834 36339 101 44 41 38 15 12 12 20 20 21 11 15 2 1 1 0 0 0 2 1 0
+* 5p + 49 8143 9518 9715 7899 35275 99 35 24 22 23 11 21 22 17 21 12 14 2 1 0 1 4 0 1 1 0
+* 5p + 50 7874 9249 9460 7608 34191 68 27 36 30 16 3 10 20 11 13 7 9 1 3 4 0 6 1 3 2 0
+* 5p + 51 7757 8876 9236 7191 33060 107 42 34 23 23 7 12 11 11 14 13 17 0 3 0 3 2 0 1 1 0
+* 5p + 52 7553 8584 8672 7139 31948 88 29 37 36 17 9 15 19 15 19 20 8 0 2 1 4 4 0 1 1 0
+* 5p + 53 7316 8334 8356 6844 30850 94 30 30 23 19 12 18 28 9 9 9 15 1 3 1 0 4 1 0 0 0
+* 5p + 54 7064 7878 8009 6745 29696 106 33 32 36 14 8 14 22 11 14 13 11 1 0 2 2 2 2 1 3 0
+* 5p + 55 6824 7565 7729 6458 28576 88 36 19 35 15 11 16 25 11 23 11 20 0 1 7 0 7 0 0 1 0
+* 5p + 56 6454 7317 7424 6187 27382 88 27 27 23 11 9 20 27 18 19 8 13 1 1 0 2 8 1 3 3 0
+* 5p + 57 6335 6809 7315 5790 26249 92 23 38 27 11 7 12 29 13 21 9 11 1 1 5 1 13 2 3 6 0
+* 5p + 58 5932 6705 6879 5637 25153 70 30 30 24 9 7 12 32 27 36 5 11 2 4 6 2 23 1 1 6 0
+* 5p + 59 5835 6395 6438 5438 24106 94 34 40 34 12 14 21 46 23 53 5 10 2 3 9 2 26 4 3 5 0
+* 5p + 60 5624 5980 6276 5186 23066 103 22 26 19 6 7 26 57 20 41 10 11 1 9 15 1 41 6 1 12 0
+* 5p + 61 5326 5818 5866 5038 22048 121 29 35 20 13 13 32 61 21 66 12 17 4 15 11 4 51 5 2 17 0
+* 5p + 62 5030 5541 5772 4761 21104 93 37 35 18 13 15 39 80 34 61 20 19 4 12 15 6 55 7 3 22 0
+* 5p + 63 4998 5213 5416 4573 20200 88 28 39 17 18 17 54 102 39 96 17 17 5 24 19 5 86 2 5 23 0
+* 5p + 64 4638 4964 5255 4455 19312 95 25 51 23 20 14 72 131 50 112 23 22 6 27 32 7 73 14 6 33 0
+* 5p + 65 4444 4772 5008 4276 18500 114 39 60 23 22 24 93 147 63 171 25 28 2 30 31 10 90 11 4 22 0
+* 5p + 66 4272 4582 4747 4094 17695 127 46 69 37 26 32 114 208 94 177 17 32 7 21 25 4 78 20 3 10 0
+* 5p + 67 4187 4337 4490 3961 16975 123 44 93 35 27 40 136 222 93 224 41 41 8 17 21 6 54 17 9 16 0
+* 5p + 68 3979 4302 4213 3787 16281 156 64 101 58 33 53 142 227 131 217 42 47 6 15 20 4 56 15 3 15 0
+* 5p + 69 3723 3906 4216 3547 15392 177 84 123 44 42 71 139 255 132 226 60 79 6 6 7 5 20 13 4 5 0
+* 5p + 70 3467 3707 3831 3393 14398 158 80 149 57 46 64 163 225 137 227 69 76 2 3 3 0 17 10 4 2 0
+* 5p - 1 10589 14587 11770 13190 50136 138 484 101 76 99 76 61 101 58 71 99 102 0 0 0 0 0 0 0 0 32
+* 5p - 2 10564 13477 13652 12441 50134 76 404 69 57 74 58 34 44 63 40 62 99 4 0 1 1 2 0 0 0 22
+* 5p - 3 10848 14060 13228 11985 50121 65 252 67 65 65 29 39 28 44 31 41 80 1 2 6 5 3 3 1 0 17
+* 5p - 4 11036 14175 13151 11769 50131 36 210 74 47 71 31 26 46 43 33 28 56 4 5 0 6 2 1 0 2 15
+* 5p - 5 10969 14121 13415 11612 50117 52 166 55 50 44 16 21 21 35 16 17 23 3 2 4 10 5 7 4 4 15
+* 5p - 6 11373 13491 13649 11606 50119 47 128 52 67 57 22 27 28 44 16 19 23 2 5 4 7 8 6 3 3 15
+* 5p - 7 10999 14103 13545 11480 50127 63 119 67 66 39 13 18 26 26 15 6 27 5 3 6 5 4 5 1 2 13
+* 5p - 8 10924 14176 13764 11256 50120 60 107 50 56 41 19 20 22 32 16 23 13 3 6 3 6 8 6 4 3 13
+* 5p - 9 11191 14127 13821 10990 50129 51 131 57 39 53 19 19 22 34 27 13 15 5 4 2 4 1 3 4 3 13
+* 5p - 10 10924 14190 13684 11327 50125 52 96 56 70 42 12 14 21 35 18 18 23 3 4 6 2 4 3 3 5 12
+* 5p - 11 10894 14287 13875 11069 50125 46 112 61 68 44 16 11 24 34 19 11 20 3 4 2 8 4 6 5 0 11
+* 5p - 12 11024 13875 14087 11137 50123 65 96 46 49 53 14 19 20 31 19 13 21 3 1 4 3 5 5 3 5 9
+* 5p - 13 10714 14535 13532 11345 50126 66 104 61 50 41 12 34 19 50 14 20 21 3 2 4 1 5 0 6 6 7
+* 5p - 14 10624 14271 13853 11380 50128 45 112 36 65 52 21 19 22 33 9 16 23 1 4 3 5 4 4 2 5 6
+* 5p - 15 10841 13990 14155 11134 50120 55 96 48 50 35 18 15 24 36 17 19 18 1 2 6 18 8 6 6 3 6
+* 5p - 16 10810 14427 13639 11232 50108 70 95 55 92 73 15 38 41 32 24 45 17 9 6 21 7 5 9 2 6 6
+* 5p - 17 10722 14481 13817 11111 50131 57 91 65 74 48 12 20 28 37 14 17 13 12 26 6 8 5 3 2 2 5
+* 5p - 18 10819 14264 14040 11011 50134 67 94 66 53 51 16 38 24 36 25 19 21 8 9 12 14 2 1 1 5 5
+* 5p - 19 11037 14365 13877 10847 50126 47 77 53 58 51 15 21 11 31 14 17 18 9 11 8 14 3 3 3 8 4
+* 5p - 20 10710 14445 14014 10963 50132 55 94 72 64 53 19 26 31 27 23 33 20 5 10 12 10 3 5 1 2 4
+* 5p - 21 10687 14177 14237 11020 50121 73 85 65 61 53 20 37 26 43 33 28 24 7 7 3 6 3 2 2 0 4
+* 5p - 22 10719 14337 14034 11012 50102 68 73 54 64 55 20 36 32 38 15 20 21 6 9 8 9 2 1 4 4 3
+* 5p - 23 10644 14484 14029 10928 50085 58 95 83 78 46 18 33 27 47 19 34 23 8 5 1 5 0 1 3 3 3
+* 5p - 24 10635 14271 14209 10952 50067 47 68 62 60 51 19 24 28 43 7 30 19 3 3 4 3 1 2 4 5 3
+* 5p - 25 10755 14329 13929 11051 50064 72 79 71 57 50 19 38 28 46 25 16 17 2 3 4 3 0 2 1 3 3
+* 5p - 26 10457 14371 14042 10642 49512 79 74 47 61 59 10 27 28 32 17 18 16 4 3 3 1 1 3 1 1 3
+* 5p - 27 10748 13747 13946 10552 48993 76 91 53 70 47 19 43 38 39 15 42 21 4 3 7 2 0 1 2 0 3
+* 5p - 28 10489 13793 13607 10671 48560 71 61 69 72 45 16 15 33 37 15 21 20 5 6 3 4 0 4 3 2 3
+* 5p - 29 10158 13939 13755 10302 48154 42 72 56 68 30 13 30 29 45 17 24 24 1 5 3 1 3 2 5 1 2
+* 5p - 30 10593 13611 13451 10177 47832 79 61 52 55 52 16 22 24 41 14 26 11 1 4 4 6 2 2 2 2 2
+* 5p - 31 10227 13383 13473 10423 47506 57 57 43 51 47 16 18 20 27 8 12 19 3 3 3 4 4 0 2 2 2
+* 5p - 32 10321 13430 13458 9976 47185 76 62 49 45 41 12 17 20 26 16 18 20 3 1 1 1 1 3 3 2 2
+* 5p - 33 10287 13159 13349 10061 46856 68 62 50 42 26 9 34 22 21 16 10 19 3 3 2 2 0 1 1 2 2
+* 5p - 34 10204 13135 12887 10277 46503 83 53 43 60 33 14 11 19 30 11 20 13 2 1 4 3 0 2 3 5 2
+* 5p - 35 9917 13207 13032 9950 46106 71 69 45 60 34 9 17 21 31 8 14 20 2 2 1 2 2 2 4 3 2
+* 5p - 36 10089 12872 13126 9559 45646 76 49 35 47 44 13 10 34 21 10 23 15 1 2 1 2 2 2 1 3 2
+* 5p - 37 10082 12705 12612 9802 45201 86 47 37 49 36 5 15 14 19 13 15 18 3 5 2 0 1 1 2 3 2
+* 5p - 38 9795 12665 12654 9573 44687 68 52 54 46 35 5 25 33 23 8 30 14 1 0 0 3 0 3 1 5 2
+* 5p - 39 9833 12069 12626 9567 44095 91 59 49 27 21 7 18 23 24 17 18 22 1 1 0 3 0 3 1 1 1
+* 5p - 40 9719 12135 12125 9473 43452 81 48 61 42 29 8 26 22 27 16 19 17 0 0 0 1 1 1 0 3 1
+* 5p - 41 9581 11914 12061 9171 42727 118 51 43 43 27 10 17 21 20 9 15 6 1 0 2 3 3 1 0 0 1
+* 5p - 42 9505 11452 11902 9101 41960 79 52 51 39 23 8 14 15 10 11 13 14 0 0 2 2 1 0 2 0 1
+* 5p - 43 9392 11228 11337 9153 41110 83 49 39 32 22 8 18 13 20 18 13 19 0 2 2 4 0 1 0 0 1
+* 5p - 44 9077 11218 11145 8823 40263 103 45 42 48 21 13 21 22 13 7 11 8 0 2 2 1 2 1 0 1 1
+* 5p - 45 9029 10734 10885 8737 39385 93 55 42 43 21 6 28 13 17 17 14 18 2 0 1 2 2 1 3 1 1
+* 5p - 46 8760 10521 10423 8687 38391 107 56 47 40 21 9 19 27 26 18 12 14 1 2 1 3 2 1 1 0 0
+* 5p - 47 8578 10077 10466 8301 37422 112 42 29 39 22 9 14 21 22 23 15 22 0 1 6 1 2 1 1 8 0
+* 5p - 48 8460 9656 10305 8008 36429 85 45 30 37 18 14 15 18 19 21 10 13 3 1 2 5 3 3 1 3 0
+* 5p - 49 8231 9703 9578 7842 35354 100 39 33 29 16 12 13 22 22 14 9 15 1 1 2 6 0 1 2 3 0
+* 5p - 50 8063 9235 9431 7558 34287 91 32 35 39 15 11 20 21 21 18 18 19 3 3 2 2 3 2 2 3 0
+* 5p - 51 7783 8884 9322 7275 33264 82 57 29 32 16 10 14 25 16 14 15 14 0 0 2 2 4 4 2 0 0
+* 5p - 52 7418 8609 8846 7269 32142 73 35 41 33 16 8 19 30 16 21 13 16 2 1 0 0 0 3 0 3 0
+* 5p - 53 7333 8316 8416 6986 31051 89 42 41 17 15 14 19 17 11 19 12 13 0 1 4 2 3 0 1 2 0
+* 5p - 54 7020 7993 8257 6623 29893 90 39 18 20 14 9 13 19 16 18 7 10 0 2 0 2 1 0 1 2 0
+* 5p - 55 6680 7715 7861 6486 28742 73 25 27 30 17 12 20 26 5 25 4 16 3 3 1 1 2 1 0 2 0
+* 5p - 56 6636 7221 7581 6225 27663 90 33 22 14 8 13 15 30 6 22 12 14 1 3 4 2 1 1 1 3 0
+* 5p - 57 6332 6925 7423 5910 26590 83 28 33 21 15 9 18 22 14 32 6 12 2 3 8 2 5 0 0 3 0
+* 5p - 58 6098 6686 6851 5860 25495 69 22 20 19 15 4 16 30 16 41 3 12 2 3 3 0 9 6 2 7 0
+* 5p - 59 5894 6400 6618 5506 24418 93 26 30 25 12 12 27 56 18 43 7 24 3 5 7 2 15 1 2 11 0
+* 5p - 60 5692 6138 6243 5287 23360 90 33 41 30 12 7 18 62 24 57 6 15 3 4 11 1 22 8 1 11 0
+* 5p - 61 5397 5929 5946 5130 22402 84 21 33 34 15 15 28 74 29 73 10 12 1 13 19 3 23 8 3 8 0
+* 5p - 62 5125 5655 5781 4837 21398 96 28 29 18 13 10 48 89 21 91 19 17 1 11 17 4 25 6 5 20 0
+* 5p - 63 5015 5274 5512 4631 20432 117 29 47 19 14 20 45 84 40 98 13 28 4 12 17 8 39 10 7 23 0
+* 5p - 64 4848 5076 5114 4510 19548 115 45 53 25 17 24 47 170 42 131 29 22 6 13 17 14 43 16 6 37 0
+* 5p - 65 4498 4962 4970 4266 18696 121 42 49 29 16 25 85 179 58 175 29 29 11 24 29 7 63 18 7 48 0
+* 5p - 66 4438 4620 4811 4027 17896 129 40 69 27 28 26 95 209 68 181 21 31 3 22 28 8 56 11 7 34 0
+* 5p - 67 4236 4524 4488 3926 17174 160 51 95 27 26 48 129 225 92 213 31 41 5 15 25 13 54 14 16 32 0
+* 5p - 68 3825 4261 4426 3874 16386 163 71 101 52 41 45 142 242 95 240 51 56 5 18 20 11 45 22 12 21 0
+* 5p - 69 3750 3925 4340 3478 15493 166 61 123 46 37 59 156 223 135 206 59 68 3 10 14 7 37 19 7 16 0
+* 5p - 70 3500 3730 3904 3426 14560 143 79 141 64 57 65 170 219 141 223 68 73 2 9 16 5 26 7 8 8 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.summary b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.summary
new file mode 100644
index 0000000..3193963
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1845A.summary
@@ -0,0 +1,186 @@
+# Command:
+# /home/mischu/bin/pypeline/bin/bam_pipeline run 000_makefile.yaml --max-threads 16 --bwa-max-threads 4
+#
+# Directory:
+# /net/franklin/disk/franklin/data/mischu/projects/2013_09_nature_protocols/FINAL/alignment
+#
+# Makefile:
+# Filename: 000_makefile.yaml
+# SHA1Sum: ee7644dd0ecfee606a2873441020247def0e2355
+# MTime: 2013-10-21 17:37:32.317334
+#
+# Genomes:
+# Name Label Contigs Size Prefix
+# Pi_mito mitochondrial 1 37922 000_prefixes/Pi_mito.fasta
+# Pi_nucl nuclear 4921 228543505 000_prefixes/Pi_nucl.fasta
+#
+# Regions Of Interest:
+# Genome ROI Size NFeatures NIntervals Path
+#
+#
+Target Sample Library Measure Value # Description
+Pi1845A * * lib_type * # SE, PE, or * (for both)
+Pi1845A * * seq_reads_se 1127949653 # Total number of single-ended reads
+Pi1845A * * seq_trash_se 130620377 # Total number of trashed reads
+Pi1845A * * seq_trash_se_frac 0.115803375313 # Fraction of SE reads trashed
+Pi1845A * * seq_reads_pairs 456319045 # Total number of reads
+Pi1845A * * seq_trash_pe_1 65106675 # Total number of reads
+Pi1845A * * seq_trash_pe_1_frac 0.142677969972 # Fraction of PE mate 1 reads trashed
+Pi1845A * * seq_trash_pe_2 125789164 # Total number of reads
+Pi1845A * * seq_trash_pe_2_frac 0.275660561132 # Fraction of PE mate 2 reads trashed
+Pi1845A * * seq_collapsed 325600335 # Total number of pairs collapsed into one read
+Pi1845A * * seq_collapsed_frac 0.713536589296 # Fraction of PE pairs collapsed into one read
+Pi1845A * * seq_retained_reads 1393471192 # Total number of retained reads
+Pi1845A * * seq_retained_nts 84685375638 # Total number of NTs in retained reads
+Pi1845A * * seq_retained_length 60.7729647546 # Average number of NTs in retained reads
+
+Pi1845A * * hits_raw(endogenous) 56020163 # Total number of hits against the nuclear and mitochondrial genome
+Pi1845A * * hits_raw_frac(endogenous) 0.0402018809729 # Total number of hits vs. total number of reads retained
+Pi1845A * * hits_clonality(endogenous) 0.902085879329 # Fraction of hits that were PCR duplicates
+Pi1845A * * hits_unique(endogenous) 5485165 # Total number of unique reads (PCR duplicates removed)
+Pi1845A * * hits_unique_frac(endogenous) 0.00393633182479 # Total number of unique hits vs. total number of reads retained
+Pi1845A * * hits_coverage(endogenous) 1.53489316085 # Estimated coverage from unique hits
+Pi1845A * * hits_length(endogenous) 63.9630838817 # Average number of aligned bases per unique hit
+Pi1845A * * ratio_reads(nuc,mito) 211.044417814 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1845A * * ratio_genome(mito,nuc) 45.8846688948 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1845A * * ratio_genome(nuc,mito) 0.0217937717344 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1845A * * hits_raw(mitochondrial) 434555 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A * * hits_raw_frac(mitochondrial) 0.000311850723929 # Total number of hits vs. total number of reads retained
+Pi1845A * * hits_clonality(mitochondrial) 0.940472437321 # Fraction of hits that were PCR duplicates
+Pi1845A * * hits_unique(mitochondrial) 25868 # Total number of hits (excluding any PCR duplicates)
+Pi1845A * * hits_unique_frac(mitochondrial) 1.85637135152e-05 # Total number of unique hits vs. total number of reads retained
+Pi1845A * * hits_coverage(mitochondrial) 35.086308739 # Estimated coverage from unique hits
+Pi1845A * * hits_length(mitochondrial) 51.4358667079 # Average number of aligned bases per unique hit
+
+Pi1845A * * hits_raw(nuclear) 55585608 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A * * hits_raw_frac(nuclear) 0.039890030249 # Total number of hits vs. total number of reads retained
+Pi1845A * * hits_clonality(nuclear) 0.901785782392 # Fraction of hits that were PCR duplicates
+Pi1845A * * hits_unique(nuclear) 5459297 # Total number of hits (excluding any PCR duplicates)
+Pi1845A * * hits_unique_frac(nuclear) 0.00391776811128 # Total number of unique hits vs. total number of reads retained
+Pi1845A * * hits_coverage(nuclear) 1.52932600732 # Estimated coverage from unique hits
+Pi1845A * * hits_length(nuclear) 64.0224420837 # Average number of aligned bases per unique hit
+
+
+Pi1845A Pi1845A * lib_type * # SE, PE, or * (for both)
+Pi1845A Pi1845A * seq_reads_se 1127949653 # Total number of single-ended reads
+Pi1845A Pi1845A * seq_trash_se 130620377 # Total number of trashed reads
+Pi1845A Pi1845A * seq_trash_se_frac 0.115803375313 # Fraction of SE reads trashed
+Pi1845A Pi1845A * seq_reads_pairs 456319045 # Total number of reads
+Pi1845A Pi1845A * seq_trash_pe_1 65106675 # Total number of reads
+Pi1845A Pi1845A * seq_trash_pe_1_frac 0.142677969972 # Fraction of PE mate 1 reads trashed
+Pi1845A Pi1845A * seq_trash_pe_2 125789164 # Total number of reads
+Pi1845A Pi1845A * seq_trash_pe_2_frac 0.275660561132 # Fraction of PE mate 2 reads trashed
+Pi1845A Pi1845A * seq_collapsed 325600335 # Total number of pairs collapsed into one read
+Pi1845A Pi1845A * seq_collapsed_frac 0.713536589296 # Fraction of PE pairs collapsed into one read
+Pi1845A Pi1845A * seq_retained_reads 1393471192 # Total number of retained reads
+Pi1845A Pi1845A * seq_retained_nts 84685375638 # Total number of NTs in retained reads
+Pi1845A Pi1845A * seq_retained_length 60.7729647546 # Average number of NTs in retained reads
+
+Pi1845A Pi1845A * hits_raw(endogenous) 56020163 # Total number of hits against the nuclear and mitochondrial genome
+Pi1845A Pi1845A * hits_raw_frac(endogenous) 0.0402018809729 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A * hits_clonality(endogenous) 0.902085879329 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A * hits_unique(endogenous) 5485165 # Total number of unique reads (PCR duplicates removed)
+Pi1845A Pi1845A * hits_unique_frac(endogenous) 0.00393633182479 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A * hits_coverage(endogenous) 1.53489316085 # Estimated coverage from unique hits
+Pi1845A Pi1845A * hits_length(endogenous) 63.9630838817 # Average number of aligned bases per unique hit
+Pi1845A Pi1845A * ratio_reads(nuc,mito) 211.044417814 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1845A Pi1845A * ratio_genome(mito,nuc) 45.8846688948 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1845A Pi1845A * ratio_genome(nuc,mito) 0.0217937717344 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1845A Pi1845A * hits_raw(mitochondrial) 434555 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A Pi1845A * hits_raw_frac(mitochondrial) 0.000311850723929 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A * hits_clonality(mitochondrial) 0.940472437321 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A * hits_unique(mitochondrial) 25868 # Total number of hits (excluding any PCR duplicates)
+Pi1845A Pi1845A * hits_unique_frac(mitochondrial) 1.85637135152e-05 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A * hits_coverage(mitochondrial) 35.086308739 # Estimated coverage from unique hits
+Pi1845A Pi1845A * hits_length(mitochondrial) 51.4358667079 # Average number of aligned bases per unique hit
+
+Pi1845A Pi1845A * hits_raw(nuclear) 55585608 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A Pi1845A * hits_raw_frac(nuclear) 0.039890030249 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A * hits_clonality(nuclear) 0.901785782392 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A * hits_unique(nuclear) 5459297 # Total number of hits (excluding any PCR duplicates)
+Pi1845A Pi1845A * hits_unique_frac(nuclear) 0.00391776811128 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A * hits_coverage(nuclear) 1.52932600732 # Estimated coverage from unique hits
+Pi1845A Pi1845A * hits_length(nuclear) 64.0224420837 # Average number of aligned bases per unique hit
+
+
+Pi1845A Pi1845A Pi1845A_id_CATAGA lib_type SE # SE, PE, or * (for both)
+Pi1845A Pi1845A Pi1845A_id_CATAGA seq_reads_se 1101784211 # Total number of single-ended reads
+Pi1845A Pi1845A Pi1845A_id_CATAGA seq_trash_se 125361465 # Total number of trashed reads
+Pi1845A Pi1845A Pi1845A_id_CATAGA seq_trash_se_frac 0.113780415211 # Fraction of SE reads trashed
+Pi1845A Pi1845A Pi1845A_id_CATAGA seq_retained_reads 976422746 # Total number of retained reads
+Pi1845A Pi1845A Pi1845A_id_CATAGA seq_retained_nts 62308984490 # Total number of NTs in retained reads
+Pi1845A Pi1845A Pi1845A_id_CATAGA seq_retained_length 63.8135323509 # Average number of NTs in retained reads
+
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_raw(endogenous) 39095254 # Total number of hits against the nuclear and mitochondrial genome
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_raw_frac(endogenous) 0.0400392700397 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_clonality(endogenous) 0.879599094049 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_unique(endogenous) 4707104 # Total number of unique reads (PCR duplicates removed)
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_unique_frac(endogenous) 0.00482076438641 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_coverage(endogenous) 1.33556949052 # Estimated coverage from unique hits
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_length(endogenous) 64.8565189977 # Average number of aligned bases per unique hit
+Pi1845A Pi1845A Pi1845A_id_CATAGA ratio_reads(nuc,mito) 250.488165839 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1845A Pi1845A Pi1845A_id_CATAGA ratio_genome(mito,nuc) 40.68211864 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1845A Pi1845A Pi1845A_id_CATAGA ratio_genome(nuc,mito) 0.0245808240433 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_raw(mitochondrial) 268180 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_raw_frac(mitochondrial) 0.000274655625444 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_clonality(mitochondrial) 0.930207323439 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_unique(mitochondrial) 18717 # Total number of hits (excluding any PCR duplicates)
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_unique_frac(mitochondrial) 1.91689512321e-05 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_coverage(mitochondrial) 27.0800063288 # Estimated coverage from unique hits
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_length(mitochondrial) 54.8660575947 # Average number of aligned bases per unique hit
+
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_raw(nuclear) 38827074 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_raw_frac(nuclear) 0.0397646144143 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_clonality(nuclear) 0.879249541184 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_unique(nuclear) 4688387 # Total number of hits (excluding any PCR duplicates)
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_unique_frac(nuclear) 0.00480159543518 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_coverage(nuclear) 1.33129774132 # Estimated coverage from unique hits
+Pi1845A Pi1845A Pi1845A_id_CATAGA hits_length(nuclear) 64.8964029633 # Average number of aligned bases per unique hit
+
+
+Pi1845A Pi1845A Pi1845A_id_CGCTAT lib_type * # SE, PE, or * (for both)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_reads_se 26165442 # Total number of single-ended reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_trash_se 5258912 # Total number of trashed reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_trash_se_frac 0.200986935363 # Fraction of SE reads trashed
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_reads_pairs 456319045 # Total number of reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_trash_pe_1 65106675 # Total number of reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_trash_pe_1_frac 0.142677969972 # Fraction of PE mate 1 reads trashed
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_trash_pe_2 125789164 # Total number of reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_trash_pe_2_frac 0.275660561132 # Fraction of PE mate 2 reads trashed
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_collapsed 325600335 # Total number of pairs collapsed into one read
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_collapsed_frac 0.713536589296 # Fraction of PE pairs collapsed into one read
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_retained_reads 417048446 # Total number of retained reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_retained_nts 22376391148 # Total number of NTs in retained reads
+Pi1845A Pi1845A Pi1845A_id_CGCTAT seq_retained_length 53.654177021 # Average number of NTs in retained reads
+
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_raw(endogenous) 16924909 # Total number of hits against the nuclear and mitochondrial genome
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_raw_frac(endogenous) 0.0405825969676 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_clonality(endogenous) 0.954028644999 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_unique(endogenous) 778061 # Total number of unique reads (PCR duplicates removed)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_unique_frac(endogenous) 0.00186563697207 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_coverage(endogenous) 0.199323670335 # Estimated coverage from unique hits
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_length(endogenous) 58.5579909544 # Average number of aligned bases per unique hit
+Pi1845A Pi1845A Pi1845A_id_CGCTAT ratio_reads(nuc,mito) 107.804502867 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT ratio_genome(mito,nuc) 80.8601981127 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1845A Pi1845A Pi1845A_id_CGCTAT ratio_genome(nuc,mito) 0.0123670238676 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_raw(mitochondrial) 166375 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_raw_frac(mitochondrial) 0.000398934468155 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_clonality(mitochondrial) 0.95701878287 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_unique(mitochondrial) 7151 # Total number of hits (excluding any PCR duplicates)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_unique_frac(mitochondrial) 1.71466889964e-05 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_coverage(mitochondrial) 8.00630241021 # Estimated coverage from unique hits
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_length(mitochondrial) 42.457698224 # Average number of aligned bases per unique hit
+
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_raw(nuclear) 16758534 # Total number of hits (prior to PCR duplicate filtering)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_raw_frac(nuclear) 0.0401836624995 # Total number of hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_clonality(nuclear) 0.953998959575 # Fraction of hits that were PCR duplicates
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_unique(nuclear) 770910 # Total number of hits (excluding any PCR duplicates)
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_unique_frac(nuclear) 0.00184849028307 # Total number of unique hits vs. total number of reads retained
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_coverage(nuclear) 0.198028265997 # Estimated coverage from unique hits
+Pi1845A Pi1845A Pi1845A_id_CGCTAT hits_length(nuclear) 58.707338081 # Average number of aligned bases per unique hit
+
+
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.coverage b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.coverage
new file mode 100644
index 0000000..89f5436
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.coverage
@@ -0,0 +1,34 @@
+# Timestamp: 2013-10-22T17:38:04.143987
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# Hits: Sum of SE, PE_1, and PE_2 hits
+# SE, PE_1, PE_2: Number of Single Ended, and Pair Ended (mate 1 and 2) hits overlapping
+# the current contig or intervals. Note that a hit may be counted multiple
+# times if it overlaps multiple intervals
+# Collapsed: Number of hits for PE pair collapsed into a single read
+# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases relative to references
+# Coverage: Average number of bases covering each position in the contig(s)/intervals(s).
+Name Sample Library Contig Size Hits SE PE_1 PE_2 Collapsed M I D Coverage
+Pi1889 * * * 37922 49824 18509 452 602 30261 4246529 1015 621 111.980618111
+Pi1889 * * gi|58012130|gb|AY894835.1| 37922 49824 18509 452 602 30261 4246529 1015 621 111.980618111
+#
+#
+Pi1889 Pi1889 * * 37922 49824 18509 452 602 30261 4246529 1015 621 111.980618111
+Pi1889 Pi1889 * gi|58012130|gb|AY894835.1| 37922 49824 18509 452 602 30261 4246529 1015 621 111.980618111
+#
+Pi1889 Pi1889 Pi1889_id_CTTGTA * 37922 14793 5499 53 98 9143 1221519 226 177 32.2113548863
+Pi1889 Pi1889 Pi1889_id_CTTGTA gi|58012130|gb|AY894835.1| 37922 14793 5499 53 98 9143 1221519 226 177 32.2113548863
+#
+Pi1889 Pi1889 Pi1889_id_GGCTAC * 37922 17835 6559 95 133 11048 1538959 341 212 40.5822214018
+Pi1889 Pi1889 Pi1889_id_GGCTAC gi|58012130|gb|AY894835.1| 37922 17835 6559 95 133 11048 1538959 341 212 40.5822214018
+#
+Pi1889 Pi1889 Pi1889_id_TAGCTT * 37922 17196 6451 304 371 10070 1486051 448 232 39.1870418227
+Pi1889 Pi1889 Pi1889_id_TAGCTT gi|58012130|gb|AY894835.1| 37922 17196 6451 304 371 10070 1486051 448 232 39.1870418227
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.depths b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.depths
new file mode 100644
index 0000000..89d110d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.depths
@@ -0,0 +1,32 @@
+# Timestamp: 2013-10-22T17:38:06.688600
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#
+Name Sample Library Contig Size MaxDepth MD_001 MD_002 MD_003 MD_004 MD_005 MD_006 MD_007 MD_008 MD_009 MD_010 MD_011 MD_012 MD_013 MD_014 MD_015 MD_016 MD_017 MD_018 MD_019 MD_020 MD_021 MD_022 MD_023 MD_024 MD_025 MD_026 MD_027 MD_028 MD_029 MD_030 MD_031 MD_032 MD_033 MD_034 MD_035 MD_036 MD_037 MD_038 MD_039 MD_040 M [...]
+Pi1889 * * * 37922 NA 0.9995 0.9987 0.9987 0.9983 0.9979 0.9955 0.9923 0.9855 0.9773 0.9657 0.9536 0.9426 0.9319 0.9211 0.9057 0.8920 0.8784 0.8626 0.8461 0.8280 0.8111 0.7943 0.7794 0.7630 0.7477 0.7305 0.7154 0.7006 0.6855 0.6706 0.6567 0.6440 0.6321 0.6204 0.6085 0.5965 0.5825 0.5681 0.5548 0.5413 0 [...]
+Pi1889 * * gi|58012130|gb|AY894835.1| 37922 NA 0.9995 0.9987 0.9987 0.9983 0.9979 0.9955 0.9923 0.9855 0.9773 0.9657 0.9536 0.9426 0.9319 0.9211 0.9057 0.8920 0.8784 0.8626 0.8461 0.8280 0.8111 0.7943 0.7794 0.7630 0.7477 0.7305 0.7154 0.7006 0.6855 0.6706 0.6567 0.6440 0.6321 0.6204 0.6085 0.5965 0.5825 0.5681 0.5548 0.5413 0 [...]
+#
+#
+Pi1889 Pi1889 * * 37922 NA 0.9995 0.9987 0.9987 0.9983 0.9979 0.9955 0.9923 0.9855 0.9773 0.9657 0.9536 0.9426 0.9319 0.9211 0.9057 0.8920 0.8784 0.8626 0.8461 0.8280 0.8111 0.7943 0.7794 0.7630 0.7477 0.7305 0.7154 0.7006 0.6855 0.6706 0.6567 0.6440 0.6321 0.6204 0.6085 0.5965 0.5825 0.5681 0.5548 0.5413 0 [...]
+Pi1889 Pi1889 * gi|58012130|gb|AY894835.1| 37922 NA 0.9995 0.9987 0.9987 0.9983 0.9979 0.9955 0.9923 0.9855 0.9773 0.9657 0.9536 0.9426 0.9319 0.9211 0.9057 0.8920 0.8784 0.8626 0.8461 0.8280 0.8111 0.7943 0.7794 0.7630 0.7477 0.7305 0.7154 0.7006 0.6855 0.6706 0.6567 0.6440 0.6321 0.6204 0.6085 0.5965 0.5825 0.5681 0.5548 0.5413 0 [...]
+#
+Pi1889 Pi1889 Pi1889_id_CTTGTA * 37922 NA 0.9841 0.9547 0.9197 0.8706 0.8202 0.7550 0.6993 0.6414 0.5868 0.5412 0.5021 0.4642 0.4345 0.4097 0.3837 0.3617 0.3420 0.3237 0.3080 0.2940 0.2810 0.2669 0.2545 0.2446 0.2359 0.2288 0.2221 0.2163 0.2115 0.2070 0.2006 0.1967 0.1926 0.1888 0.1859 0.1826 0.1790 0.1766 0.1733 0.1706 0 [...]
+Pi1889 Pi1889 Pi1889_id_CTTGTA gi|58012130|gb|AY894835.1| 37922 NA 0.9841 0.9547 0.9197 0.8706 0.8202 0.7550 0.6993 0.6414 0.5868 0.5412 0.5021 0.4642 0.4345 0.4097 0.3837 0.3617 0.3420 0.3237 0.3080 0.2940 0.2810 0.2669 0.2545 0.2446 0.2359 0.2288 0.2221 0.2163 0.2115 0.2070 0.2006 0.1967 0.1926 0.1888 0.1859 0.1826 0.1790 0.1766 0.1733 0.1706 0 [...]
+#
+Pi1889 Pi1889 Pi1889_id_GGCTAC * 37922 NA 0.9948 0.9835 0.9584 0.9253 0.8916 0.8462 0.8053 0.7619 0.7230 0.6879 0.6512 0.6183 0.5839 0.5541 0.5260 0.4985 0.4723 0.4505 0.4312 0.4123 0.3946 0.3774 0.3613 0.3457 0.3323 0.3186 0.3073 0.2956 0.2863 0.2764 0.2680 0.2586 0.2497 0.2418 0.2353 0.2280 0.2222 0.2182 0.2149 0.2110 0 [...]
+Pi1889 Pi1889 Pi1889_id_GGCTAC gi|58012130|gb|AY894835.1| 37922 NA 0.9948 0.9835 0.9584 0.9253 0.8916 0.8462 0.8053 0.7619 0.7230 0.6879 0.6512 0.6183 0.5839 0.5541 0.5260 0.4985 0.4723 0.4505 0.4312 0.4123 0.3946 0.3774 0.3613 0.3457 0.3323 0.3186 0.3073 0.2956 0.2863 0.2764 0.2680 0.2586 0.2497 0.2418 0.2353 0.2280 0.2222 0.2182 0.2149 0.2110 0 [...]
+#
+Pi1889 Pi1889 Pi1889_id_TAGCTT * 37922 NA 0.9965 0.9892 0.9759 0.9564 0.9298 0.9027 0.8678 0.8318 0.7971 0.7625 0.7280 0.6953 0.6645 0.6316 0.5999 0.5706 0.5401 0.5109 0.4849 0.4608 0.4433 0.4253 0.4052 0.3876 0.3707 0.3558 0.3433 0.3314 0.3196 0.3076 0.2982 0.2906 0.2835 0.2762 0.2699 0.2620 0.2534 0.2452 0.2373 0.2311 0 [...]
+Pi1889 Pi1889 Pi1889_id_TAGCTT gi|58012130|gb|AY894835.1| 37922 NA 0.9965 0.9892 0.9759 0.9564 0.9298 0.9027 0.8678 0.8318 0.7971 0.7625 0.7280 0.6953 0.6645 0.6316 0.5999 0.5706 0.5401 0.5109 0.4849 0.4608 0.4433 0.4253 0.4052 0.3876 0.3707 0.3558 0.3433 0.3314 0.3196 0.3076 0.2982 0.2906 0.2835 0.2762 0.2699 0.2620 0.2534 0.2452 0.2373 0.2311 0 [...]
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/3pGtoA_freq.txt
new file mode 100644
index 0000000..0d67951
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0161857846586911
+2 0.0349775784753363
+3 0.0166870166870167
+4 0.0143299767621998
+5 0.0131212723658052
+6 0.00681909346169274
+7 0.00775795190069822
+8 0.00539374325782093
+9 0.0054904831625183
+10 0.00401313389273988
+11 0.00283687943262411
+12 0.00482912332838039
+13 0.000359195402298851
+14 0.00529100529100529
+15 0.0025528811086798
+16 0.00191204588910134
+17 0.00178762960314623
+18 0.00407256571640133
+19 0.00144300144300144
+20 0.00391459074733096
+21 0.00414312617702448
+22 0.00142196942765731
+23 0.00178890876565295
+24 0.000751314800901578
+25 0.00497866287339972
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/5pCtoT_freq.txt
new file mode 100644
index 0000000..f719039
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.00404448938321537
+2 0.00460829493087558
+3 0.00551181102362205
+4 0.00301091456529921
+5 0.00193199381761978
+6 0.00400962309542903
+7 0.00502318392581144
+8 0.00353232073472271
+9 0.00111028867505551
+10 0.00254175744371823
+11 0.00191497510532363
+12 0.00147112909157779
+13 0.00285816362986781
+14 0.00183284457478006
+15 0.00188253012048193
+16 0.0035033086804204
+17 0.00307810696421701
+18 0.00518710633567988
+19 0.00256504213997801
+20 0.00219378427787934
+21 0.00265352539802881
+22 0.00245872848612575
+23 0.00180310133429499
+24 0.00186011904761905
+25 0.00290909090909091
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..807df9b
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Length_plot.pdf
new file mode 100644
index 0000000..38fb0f6
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Runtime_log.txt
new file mode 100644
index 0000000..2cc7d6b
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-22 17:13:03,768 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1889_id_CTTGTA' -i - -d /home/mischu/scratch/bam_pipeline/7ffcba73-ce42-48bb-9374-19a7fced9b39 -r 000_prefixes/Pi_mito.fasta --downsample 100000
+2013-10-22 17:13:08,531 DEBUG main: BAM read in 7.296812 seconds
+2013-10-22 17:13:09,379 INFO main: Successful run
+2013-10-22 17:13:09,380 DEBUG main: Run completed in 8.145572 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..39b417c
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.87227066911263,0.675437944126842
+"2",2,0.835115330424422,0.570100558635349
+"3",3,0.782548333330741,0.49407598741908
+"4",4,0.715873474177251,0.431788191935884
+"5",5,0.648951217889478,0.341288408551715
+"6",6,0.586418113517838,0.232678147499745
+"7",7,0.506104207930755,0.186361140772441
+"8",8,0.400343904278973,0.197907895625517
+"9",9,0.292053586793579,0.205855061168339
+"10",10,0.215491506506669,0.185165895372095
+"11",11,0.183591854947805,0.137856777393521
+"12",12,0.174790525140163,0.0831597687302719
+"13",-12,0.0826030567415252,0.166928894393074
+"14",-11,0.140394489183479,0.177180977266456
+"15",-10,0.18644377667245,0.211489654187489
+"16",-9,0.228850460275683,0.266678279296373
+"17",-8,0.288072986427762,0.326560068166496
+"18",-7,0.364370641446326,0.389775811483834
+"19",-6,0.455989094334581,0.453145415114076
+"20",-5,0.544657186129488,0.525579282333621
+"21",-4,0.619046336436815,0.610427244567712
+"22",-3,0.68501723561898,0.693232607186351
+"23",-2,0.754911910266033,0.756900311910626
+"24",-1,0.830161090617125,0.792903599302713
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..f8affa3
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..24a219b
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0268140459220387,0.000242597739047013,0.0732009399531436,0.286508434062607,0.753234428190458,-4310.57905127929
+"Std.",0.000660836927743235,0.000224349015531467,0.00674387574868724,0.0203169335945215,0.0225942909824433,1.70863685963672
+"Acceptance ratio",0.19058,0.1833,0.28142,0.23416,0.24116,0.72394
+"0%",0.0244185465271013,1.55564357783868e-08,0.0501215936698641,0.198159717222829,0.673806158892296,-4323.48454200937
+"2.5%",0.0255476042541874,6.01123436383136e-06,0.0606226329113106,0.247838082110728,0.710241950605932,-4314.73529813362
+"5%",0.0257387779964195,1.30556749623329e-05,0.0626252600907769,0.253779471153437,0.716793479138783,-4313.78706776615
+"7.5%",0.0258642789579861,1.96016992915799e-05,0.0637122890336663,0.257851990419167,0.721365770777893,-4313.20484822215
+"10%",0.0259543355071761,2.72610266731405e-05,0.0647722359423057,0.261216200431202,0.724440810528629,-4312.84516821523
+"12.5%",0.0260417261729408,3.42893945155736e-05,0.0655174299536157,0.263582027004416,0.727487411511646,-4312.53169976731
+"15%",0.0261150624719362,4.14329872185814e-05,0.0661920171285927,0.265871630192666,0.729849433460022,-4312.27193512936
+"17.5%",0.0261878800164485,4.89945986355141e-05,0.066884548102796,0.267743467168052,0.731984854569962,-4312.05416531621
+"20%",0.0262462632373179,5.73777547024901e-05,0.0674869605838197,0.269612910835128,0.734087392233355,-4311.86683512056
+"22.5%",0.0262977892918702,6.66888227439816e-05,0.0680024716159262,0.271241640272689,0.735908467586519,-4311.67818330158
+"25%",0.0263492679382374,7.52644546572826e-05,0.0685059274136403,0.272716127926712,0.737708539640076,-4311.51069367478
+"27.5%",0.0264135721520821,8.54104667772462e-05,0.0689837091928766,0.2741964377212,0.739451732679606,-4311.34544169736
+"30%",0.0264649267621316,9.49918538760777e-05,0.0694607794730739,0.275643194375174,0.741228402746265,-4311.19732151359
+"32.5%",0.0265165792662094,0.000104625787426926,0.0699342074046913,0.27706144025366,0.742785959722828,-4311.05712525501
+"35%",0.0265611988522615,0.000112948616201345,0.0704406485658998,0.278487411202633,0.74409245073808,-4310.92351100921
+"37.5%",0.0266020531999352,0.000122268347382365,0.0708751050966102,0.279889769379216,0.745613834270069,-4310.80292339523
+"40%",0.0266435900612021,0.000133004421142137,0.0713150175060689,0.281018703810351,0.747077097039091,-4310.68536009855
+"42.5%",0.02668706493615,0.000144029518443549,0.0717863734833155,0.282348503955832,0.748390737019985,-4310.57979058139
+"45%",0.0267302152043519,0.00015529110187665,0.0722468514919247,0.283513740737385,0.749870028050262,-4310.47769736369
+"47.5%",0.0267710850705783,0.000166909283906197,0.0726705993414875,0.284822221580622,0.75139710204868,-4310.37143470496
+"50%",0.0268127044117473,0.000179424494686417,0.0730656959593831,0.286022096020801,0.752797084375922,-4310.27179498147
+"52.5%",0.0268428558237263,0.000192245504035562,0.0734783720399611,0.287149067992165,0.754255397194863,-4310.17542394414
+"55%",0.0268856742560616,0.00020605341338299,0.073856937387925,0.288488620774536,0.755786657960714,-4310.07685713079
+"57.5%",0.0269350232218525,0.000221610573628565,0.0743000389953186,0.289773942653627,0.757333135648067,-4309.98061505648
+"60%",0.0269745857001084,0.000235821238926949,0.0746706910838597,0.291217550276846,0.758755040863718,-4309.88634305728
+"62.5%",0.0270218577481882,0.000249545369651732,0.0751169063049116,0.292535978213103,0.760133552443049,-4309.79542666813
+"65%",0.0270674976301386,0.00026389334787594,0.0756274144283812,0.293851540464696,0.761798223464551,-4309.70449044168
+"67.5%",0.027115160641004,0.000278258341101514,0.0760469910411475,0.295297572901171,0.76339048889084,-4309.60931713939
+"70%",0.0271726196141441,0.000297585871482697,0.0765744545995292,0.296716395364951,0.765091307048769,-4309.50801147821
+"72.5%",0.0272164186324126,0.000319513960314735,0.0771284390270906,0.298232637059589,0.766895283840802,-4309.41020779432
+"75%",0.0272680876007831,0.000339663964055282,0.0776450007727333,0.299839153989692,0.768499336468971,-4309.31830348941
+"77.5%",0.0273291779018472,0.000366179400632878,0.0782258159395851,0.301562592570482,0.770246412837234,-4309.22093428966
+"80%",0.0273860061659284,0.000392025824522153,0.0787857008213124,0.303365421619652,0.772101448417441,-4309.11958319017
+"82.5%",0.027444846824717,0.000423577596128624,0.0794536149210925,0.305298351952744,0.774307601525975,-4309.02528983414
+"85%",0.027511380755112,0.000456143436461369,0.0802179155947401,0.307294338340941,0.77670957588976,-4308.9231422332
+"87.5%",0.0275713421059801,0.000493628203070918,0.0810099263032816,0.309656729165185,0.779190954627463,-4308.81722782991
+"90%",0.0276542365558125,0.000546910787265105,0.0819169461989368,0.312512968139701,0.782426788790996,-4308.69866610216
+"92.5%",0.0277699892569635,0.000613065693946058,0.0829107356090555,0.315710799886008,0.786031512670717,-4308.56918146443
+"95%",0.0278882526344828,0.000698889402592154,0.0845191859373113,0.320016249822472,0.79092283533229,-4308.42305073776
+"97.5%",0.0280915013423115,0.000838665322901309,0.0868636583635092,0.327451378030451,0.798157460353975,-4308.22031716261
+"100%",0.0292101758647466,0.00232052350211329,0.107417689975156,0.378900019336768,0.8647733361315,-4307.63486253567
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..78fcbef
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..95ea1d3
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/dnacomp.txt
new file mode 100644
index 0000000..2084daf
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 1628 857 929 1605 5019
+* 3p + -69 1692 815 959 1640 5106
+* 3p + -68 1730 870 1053 1571 5224
+* 3p + -67 1710 883 1125 1597 5315
+* 3p + -66 1775 873 1094 1686 5428
+* 3p + -65 1785 906 1081 1749 5521
+* 3p + -64 1817 961 1063 1784 5625
+* 3p + -63 1953 860 1092 1805 5710
+* 3p + -62 1854 927 1167 1858 5806
+* 3p + -61 1993 932 1137 1832 5894
+* 3p + -60 1955 913 1169 1960 5997
+* 3p + -59 2001 950 1266 1870 6087
+* 3p + -58 2024 1078 1163 1912 6177
+* 3p + -57 2053 1009 1205 1988 6255
+* 3p + -56 2054 1052 1337 1892 6335
+* 3p + -55 2059 1047 1337 1989 6432
+* 3p + -54 2077 1081 1291 2066 6515
+* 3p + -53 1968 1118 1513 1990 6589
+* 3p + -52 2063 1091 1421 2081 6656
+* 3p + -51 2223 1082 1326 2085 6716
+* 3p + -50 2066 1140 1507 2060 6773
+* 3p + -49 2117 1204 1458 2055 6834
+* 3p + -48 2181 1032 1378 2293 6884
+* 3p + -47 2280 1081 1468 2099 6928
+* 3p + -46 2286 1162 1413 2106 6967
+* 3p + -45 2383 1074 1363 2201 7021
+* 3p + -44 2310 1152 1483 2110 7055
+* 3p + -43 2332 1179 1450 2126 7087
+* 3p + -42 2310 1116 1451 2246 7123
+* 3p + -41 2213 1176 1621 2135 7145
+* 3p + -40 2307 1241 1478 2149 7175
+* 3p + -39 2248 1103 1468 2380 7199
+* 3p + -38 2355 1182 1472 2213 7222
+* 3p + -37 2155 1288 1501 2302 7246
+* 3p + -36 2262 1149 1455 2387 7253
+* 3p + -35 2375 1097 1613 2173 7258
+* 3p + -34 2335 1120 1494 2321 7270
+* 3p + -33 2283 1153 1540 2304 7280
+* 3p + -32 2323 1132 1525 2307 7287
+* 3p + -31 2317 1221 1460 2301 7299
+* 3p + -30 2389 1160 1445 2305 7299
+* 3p + -29 2416 1133 1502 2254 7305
+* 3p + -28 2274 1207 1591 2240 7312
+* 3p + -27 2343 1133 1443 2399 7318
+* 3p + -26 2252 1182 1578 2314 7326
+* 3p + -25 2276 1105 1596 2358 7335
+* 3p + -24 2479 1075 1410 2372 7336
+* 3p + -23 2380 1192 1564 2201 7337
+* 3p + -22 2314 1212 1584 2225 7335
+* 3p + -21 2373 1164 1452 2347 7336
+* 3p + -20 2432 1170 1485 2251 7338
+* 3p + -19 2457 1210 1502 2169 7338
+* 3p + -18 2431 1118 1417 2372 7338
+* 3p + -17 2407 1136 1563 2232 7338
+* 3p + -16 2436 1147 1442 2313 7338
+* 3p + -15 2381 1079 1537 2278 7275
+* 3p + -14 2471 1041 1592 2234 7338
+* 3p + -13 2370 1183 1512 2273 7338
+* 3p + -12 2441 1101 1526 2270 7338
+* 3p + -11 2329 1087 1606 2316 7338
+* 3p + -10 2368 1229 1481 2260 7338
+* 3p + -9 2590 1060 1484 2204 7338
+* 3p + -8 2547 1048 1501 2242 7338
+* 3p + -7 2321 1258 1356 2403 7338
+* 3p + -6 2588 1082 1384 2284 7338
+* 3p + -5 2490 1148 1343 2357 7338
+* 3p + -4 2531 1038 1322 2447 7338
+* 3p + -3 2505 990 1387 2456 7338
+* 3p + -2 2502 1246 1276 2314 7338
+* 3p + -1 1319 1279 1575 3165 7338
+* 3p + 1 1260 1547 562 3969 7338
+* 3p + 2 2203 1257 1356 2522 7338
+* 3p + 3 2452 1105 1423 2358 7338
+* 3p + 4 2547 1055 1351 2385 7338
+* 3p + 5 2577 945 1410 2406 7338
+* 3p + 6 2603 1031 1342 2362 7338
+* 3p + 7 2557 985 1334 2462 7338
+* 3p + 8 2584 992 1351 2411 7338
+* 3p + 9 2543 974 1271 2549 7337
+* 3p + 10 2525 968 1217 2627 7337
+* 3p - -70 1576 974 818 1575 4943
+* 3p - -69 1740 934 752 1613 5039
+* 3p - -68 1716 1036 787 1628 5167
+* 3p - -67 1756 1046 857 1629 5288
+* 3p - -66 1736 1088 803 1786 5413
+* 3p - -65 1731 1072 934 1789 5526
+* 3p - -64 1761 1106 941 1822 5630
+* 3p - -63 1832 1137 936 1843 5748
+* 3p - -62 1833 1133 978 1928 5872
+* 3p - -61 1917 1198 915 1926 5956
+* 3p - -60 1999 1208 899 1948 6054
+* 3p - -59 1943 1286 872 2056 6157
+* 3p - -58 1898 1315 1014 2023 6250
+* 3p - -57 1927 1344 976 2079 6326
+* 3p - -56 2032 1354 1017 2008 6411
+* 3p - -55 1948 1486 1086 1976 6496
+* 3p - -54 2132 1371 999 2082 6584
+* 3p - -53 2213 1276 1079 2091 6659
+* 3p - -52 2046 1415 1128 2139 6728
+* 3p - -51 2155 1411 1096 2138 6800
+* 3p - -50 2207 1445 1108 2098 6858
+* 3p - -49 2096 1523 1087 2214 6920
+* 3p - -48 2138 1573 1019 2247 6977
+* 3p - -47 2287 1440 1128 2173 7028
+* 3p - -46 2245 1510 1174 2158 7087
+* 3p - -45 2323 1375 1133 2303 7134
+* 3p - -44 2201 1459 1224 2292 7176
+* 3p - -43 2275 1514 1123 2308 7220
+* 3p - -42 2415 1364 1136 2346 7261
+* 3p - -41 2266 1488 1144 2374 7272
+* 3p - -40 2238 1583 1125 2354 7300
+* 3p - -39 2248 1559 1146 2371 7324
+* 3p - -38 2288 1559 1173 2322 7342
+* 3p - -37 2302 1625 1196 2228 7351
+* 3p - -36 2400 1575 1089 2300 7364
+* 3p - -35 2394 1528 1167 2283 7372
+* 3p - -34 2357 1588 1177 2265 7387
+* 3p - -33 2337 1628 1129 2301 7395
+* 3p - -32 2282 1503 1192 2426 7403
+* 3p - -31 2349 1550 1250 2264 7413
+* 3p - -30 2406 1499 1237 2272 7414
+* 3p - -29 2374 1513 1308 2222 7417
+* 3p - -28 2326 1483 1298 2319 7426
+* 3p - -27 2437 1514 1153 2326 7430
+* 3p - -26 2281 1604 1241 2315 7441
+* 3p - -25 2263 1592 1219 2376 7450
+* 3p - -24 2372 1429 1268 2386 7455
+* 3p - -23 2345 1458 1227 2423 7453
+* 3p - -22 2303 1549 1232 2365 7449
+* 3p - -21 2393 1456 1219 2387 7455
+* 3p - -20 2436 1439 1304 2276 7455
+* 3p - -19 2399 1541 1263 2251 7454
+* 3p - -18 2391 1528 1283 2253 7455
+* 3p - -17 2413 1521 1232 2289 7455
+* 3p - -16 2366 1545 1171 2373 7455
+* 3p - -15 2457 1391 1220 2328 7396
+* 3p - -14 2359 1497 1249 2350 7455
+* 3p - -13 2387 1497 1279 2291 7454
+* 3p - -12 2357 1446 1172 2480 7455
+* 3p - -11 2458 1436 1224 2337 7455
+* 3p - -10 2393 1590 1268 2204 7455
+* 3p - -9 2542 1443 1249 2221 7455
+* 3p - -8 2520 1407 1292 2236 7455
+* 3p - -7 2277 1484 1235 2458 7454
+* 3p - -6 2583 1280 1104 2488 7455
+* 3p - -5 2420 1415 1180 2440 7455
+* 3p - -4 2408 1349 1267 2430 7454
+* 3p - -3 2548 1253 1069 2585 7455
+* 3p - -2 2649 1540 922 2344 7455
+* 3p - -1 1309 1755 1213 3178 7455
+* 3p - 1 1238 1912 463 3841 7454
+* 3p - 2 2265 1497 1098 2594 7454
+* 3p - 3 2508 1279 1189 2478 7454
+* 3p - 4 2595 1327 1058 2474 7454
+* 3p - 5 2698 1255 988 2513 7454
+* 3p - 6 2540 1247 1080 2587 7454
+* 3p - 7 2616 1191 1075 2572 7454
+* 3p - 8 2706 1202 1007 2539 7454
+* 3p - 9 2524 1253 1053 2624 7454
+* 3p - 10 2658 1177 1040 2579 7454
+* 5p + -10 2498 1014 1281 2543 7336
+* 5p + -9 2437 1058 1258 2583 7336
+* 5p + -8 2527 978 1287 2544 7336
+* 5p + -7 2547 977 1209 2603 7336
+* 5p + -6 2468 1073 1148 2647 7336
+* 5p + -5 2481 1019 1252 2584 7336
+* 5p + -4 2459 1044 1365 2468 7336
+* 5p + -3 2464 1112 1290 2470 7336
+* 5p + -2 2756 966 1514 2101 7337
+* 5p + -1 4188 292 2105 752 7337
+* 5p + 1 3290 1209 1557 1282 7338
+* 5p + 2 2260 870 1476 2732 7338
+* 5p + 3 2511 1170 1277 2380 7338
+* 5p + 4 2456 1248 1257 2377 7338
+* 5p + 5 2422 1164 1367 2385 7338
+* 5p + 6 2504 1163 1323 2348 7338
+* 5p + 7 2530 1140 1472 2196 7338
+* 5p + 8 2326 1245 1278 2489 7338
+* 5p + 9 2159 1231 1360 2588 7338
+* 5p + 10 2236 1243 1493 2366 7338
+* 5p + 11 2375 1033 1443 2487 7338
+* 5p + 12 2432 1168 1445 2293 7338
+* 5p + 13 2206 1239 1502 2391 7338
+* 5p + 14 2318 1180 1466 2373 7337
+* 5p + 15 2364 1175 1413 2381 7333
+* 5p + 16 2419 1131 1440 2348 7338
+* 5p + 17 2429 1160 1355 2394 7338
+* 5p + 18 2281 1175 1496 2386 7338
+* 5p + 19 2333 1205 1468 2332 7338
+* 5p + 20 2361 1190 1422 2365 7338
+* 5p + 21 2421 1206 1408 2303 7338
+* 5p + 22 2316 1257 1543 2222 7338
+* 5p + 23 2373 1201 1455 2309 7338
+* 5p + 24 2321 1210 1526 2281 7338
+* 5p + 25 2364 1201 1465 2308 7338
+* 5p + 26 2284 1176 1551 2315 7326
+* 5p + 27 2331 1101 1477 2409 7318
+* 5p + 28 2353 1249 1438 2271 7311
+* 5p + 29 2333 1208 1450 2314 7305
+* 5p + 30 2300 1247 1477 2279 7303
+* 5p + 31 2321 1262 1467 2249 7299
+* 5p + 32 2424 1176 1500 2187 7287
+* 5p + 33 2363 1171 1532 2214 7280
+* 5p + 34 2389 1191 1467 2223 7270
+* 5p + 35 2343 1077 1467 2371 7258
+* 5p + 36 2427 1226 1473 2127 7253
+* 5p + 37 2368 1211 1527 2140 7246
+* 5p + 38 2365 1172 1425 2260 7222
+* 5p + 39 2379 1162 1458 2200 7199
+* 5p + 40 2309 1117 1520 2229 7175
+* 5p + 41 2367 1141 1386 2251 7145
+* 5p + 42 2367 1116 1349 2291 7123
+* 5p + 43 2237 1128 1440 2282 7087
+* 5p + 44 2230 1156 1391 2278 7055
+* 5p + 45 2308 1166 1413 2134 7021
+* 5p + 46 2190 1154 1450 2173 6967
+* 5p + 47 2190 1124 1481 2133 6928
+* 5p + 48 2219 1038 1519 2108 6884
+* 5p + 49 2172 1101 1417 2144 6834
+* 5p + 50 2158 1051 1384 2180 6773
+* 5p + 51 2103 1092 1432 2089 6716
+* 5p + 52 2148 1038 1376 2094 6656
+* 5p + 53 2028 1076 1319 2166 6589
+* 5p + 54 1984 1029 1392 2110 6515
+* 5p + 55 2077 1060 1413 1882 6432
+* 5p + 56 2044 992 1308 1991 6335
+* 5p + 57 2099 996 1281 1879 6255
+* 5p + 58 1913 1037 1378 1849 6177
+* 5p + 59 1944 937 1195 2011 6087
+* 5p + 60 1920 955 1238 1884 5997
+* 5p + 61 1914 940 1207 1833 5894
+* 5p + 62 1922 904 1132 1848 5806
+* 5p + 63 1865 915 1129 1801 5710
+* 5p + 64 1845 972 1085 1723 5625
+* 5p + 65 1768 857 1128 1768 5521
+* 5p + 66 1763 825 1110 1730 5428
+* 5p + 67 1620 875 1075 1745 5315
+* 5p + 68 1713 775 988 1748 5224
+* 5p + 69 1639 778 1056 1623 5096
+* 5p + 70 1635 840 1027 1518 5020
+* 5p - -10 2738 1210 979 2528 7455
+* 5p - -9 2659 1349 1001 2446 7455
+* 5p - -8 2573 1315 1018 2549 7455
+* 5p - -7 2518 1309 1031 2597 7455
+* 5p - -6 2499 1302 1019 2635 7455
+* 5p - -5 2456 1371 949 2679 7455
+* 5p - -4 2375 1437 998 2645 7455
+* 5p - -3 2461 1398 1134 2462 7455
+* 5p - -2 2712 1258 1311 2174 7455
+* 5p - -1 4412 422 1877 744 7455
+* 5p - 1 3190 1750 1273 1242 7455
+* 5p - 2 2171 1310 1257 2717 7455
+* 5p - 3 2527 1384 981 2563 7455
+* 5p - 4 2346 1435 1072 2599 7452
+* 5p - 5 2417 1440 1057 2541 7455
+* 5p - 6 2416 1350 1111 2578 7455
+* 5p - 7 2426 1458 1235 2336 7455
+* 5p - 8 2231 1612 999 2613 7455
+* 5p - 9 2312 1492 1042 2609 7455
+* 5p - 10 2289 1523 1227 2416 7455
+* 5p - 11 2460 1601 1063 2331 7455
+* 5p - 12 2375 1569 1090 2421 7455
+* 5p - 13 2431 1576 1130 2318 7455
+* 5p - 14 2444 1571 1063 2377 7455
+* 5p - 15 2312 1505 1181 2451 7449
+* 5p - 16 2390 1452 1189 2424 7455
+* 5p - 17 2294 1439 1195 2527 7455
+* 5p - 18 2400 1531 1193 2331 7455
+* 5p - 19 2343 1537 1218 2357 7455
+* 5p - 20 2324 1550 1180 2401 7455
+* 5p - 21 2422 1455 1236 2342 7455
+* 5p - 22 2341 1611 1209 2294 7455
+* 5p - 23 2321 1592 1090 2451 7454
+* 5p - 24 2427 1508 1078 2442 7455
+* 5p - 25 2380 1565 1155 2355 7455
+* 5p - 26 2356 1648 1251 2187 7442
+* 5p - 27 2396 1468 1201 2365 7430
+* 5p - 28 2369 1630 1127 2300 7426
+* 5p - 29 2277 1581 1192 2368 7418
+* 5p - 30 2267 1561 1133 2454 7415
+* 5p - 31 2343 1591 1140 2340 7414
+* 5p - 32 2360 1463 1145 2436 7404
+* 5p - 33 2304 1545 1173 2373 7395
+* 5p - 34 2448 1502 1151 2286 7387
+* 5p - 35 2297 1566 1141 2368 7372
+* 5p - 36 2360 1550 1178 2276 7364
+* 5p - 37 2349 1530 1293 2179 7351
+* 5p - 38 2281 1435 1243 2384 7343
+* 5p - 39 2345 1528 1184 2266 7323
+* 5p - 40 2274 1457 1240 2328 7299
+* 5p - 41 2294 1548 1109 2321 7272
+* 5p - 42 2323 1530 1180 2228 7261
+* 5p - 43 2226 1543 1220 2231 7220
+* 5p - 44 2211 1555 1084 2326 7176
+* 5p - 45 2259 1463 1126 2287 7135
+* 5p - 46 2160 1413 1184 2329 7086
+* 5p - 47 2291 1479 1105 2153 7028
+* 5p - 48 2353 1394 1112 2118 6977
+* 5p - 49 2188 1463 1224 2046 6921
+* 5p - 50 2172 1473 1141 2072 6858
+* 5p - 51 2118 1318 1174 2190 6800
+* 5p - 52 2074 1518 1003 2133 6728
+* 5p - 53 2046 1492 1065 2056 6659
+* 5p - 54 2086 1322 1130 2046 6584
+* 5p - 55 1969 1428 1096 2003 6496
+* 5p - 56 2007 1311 1051 2042 6411
+* 5p - 57 2051 1240 1118 1917 6326
+* 5p - 58 2021 1200 1059 1970 6250
+* 5p - 59 1941 1277 942 1997 6157
+* 5p - 60 1864 1310 956 1924 6054
+* 5p - 61 1878 1248 954 1876 5956
+* 5p - 62 1865 1170 946 1891 5872
+* 5p - 63 1850 1119 905 1874 5748
+* 5p - 64 1750 1143 1014 1723 5630
+* 5p - 65 1741 1164 866 1755 5526
+* 5p - 66 1763 1067 867 1716 5413
+* 5p - 67 1619 1126 905 1638 5288
+* 5p - 68 1634 1065 809 1659 5167
+* 5p - 69 1626 971 840 1594 5031
+* 5p - 70 1581 937 842 1583 4943
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/dnacomp_genome.csv
new file mode 100644
index 0000000..94d15ba
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.388112441327,0.105690628131,0.117477981119,0.388718949422
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/lgdistribution.txt
new file mode 100644
index 0000000..8822c45
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/lgdistribution.txt
@@ -0,0 +1,317 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 24 1
++ 25 3
++ 26 4
++ 27 4
++ 28 4
++ 29 3
++ 30 3
++ 31 8
++ 32 9
++ 33 10
++ 34 12
++ 35 5
++ 36 7
++ 37 24
++ 38 23
++ 39 24
++ 40 30
++ 41 19
++ 42 37
++ 43 30
++ 44 33
++ 45 54
++ 46 39
++ 47 45
++ 48 50
++ 49 59
++ 50 57
++ 51 62
++ 52 65
++ 53 75
++ 54 82
++ 55 96
++ 56 78
++ 57 80
++ 58 90
++ 59 88
++ 60 105
++ 61 86
++ 62 96
++ 63 89
++ 64 103
++ 65 95
++ 66 108
++ 67 94
++ 68 108
++ 69 96
++ 70 98
++ 71 91
++ 72 96
++ 73 100
++ 74 111
++ 75 112
++ 76 99
++ 77 96
++ 78 84
++ 79 90
++ 80 96
++ 81 85
++ 82 95
++ 83 106
++ 84 85
++ 85 88
++ 86 102
++ 87 99
++ 88 81
++ 89 86
++ 90 81
++ 91 95
++ 92 129
++ 93 223
++ 94 1068
++ 95 61
++ 96 42
++ 97 41
++ 98 54
++ 99 34
++ 100 48
++ 101 47
++ 102 39
++ 103 41
++ 104 34
++ 105 30
++ 106 31
++ 107 45
++ 108 27
++ 109 42
++ 110 30
++ 111 21
++ 112 26
++ 113 19
++ 114 28
++ 115 31
++ 116 21
++ 117 23
++ 118 21
++ 119 28
++ 120 16
++ 121 21
++ 122 15
++ 123 23
++ 124 23
++ 125 20
++ 126 21
++ 127 20
++ 128 23
++ 129 23
++ 130 28
++ 131 15
++ 132 17
++ 133 22
++ 134 21
++ 135 13
++ 136 10
++ 137 11
++ 138 16
++ 139 11
++ 140 8
++ 141 14
++ 142 8
++ 143 17
++ 144 13
++ 145 11
++ 146 12
++ 147 10
++ 148 5
++ 149 9
++ 150 6
++ 151 14
++ 152 4
++ 153 7
++ 154 5
++ 155 3
++ 156 4
++ 157 7
++ 158 4
++ 159 3
++ 160 4
++ 161 5
++ 162 3
++ 163 7
++ 164 4
++ 165 4
++ 166 4
++ 167 5
++ 168 4
++ 169 3
++ 170 3
++ 171 2
++ 172 6
++ 173 7
++ 174 4
++ 175 2
++ 177 3
++ 181 2
++ 182 1
++ 183 2
++ 185 1
+- 25 3
+- 26 4
+- 27 1
+- 28 4
+- 29 2
+- 30 2
+- 31 9
+- 32 8
+- 33 7
+- 34 16
+- 35 8
+- 36 12
+- 37 8
+- 38 19
+- 39 22
+- 40 28
+- 41 12
+- 42 39
+- 43 43
+- 44 42
+- 45 48
+- 46 58
+- 47 48
+- 48 57
+- 49 64
+- 50 55
+- 51 76
+- 52 66
+- 53 76
+- 54 87
+- 55 87
+- 56 84
+- 57 81
+- 58 88
+- 59 104
+- 60 97
+- 61 82
+- 62 124
+- 63 120
+- 64 105
+- 65 111
+- 66 128
+- 67 121
+- 68 120
+- 69 104
+- 70 98
+- 71 108
+- 72 87
+- 73 104
+- 74 99
+- 75 90
+- 76 103
+- 77 99
+- 78 101
+- 79 107
+- 80 91
+- 81 90
+- 82 95
+- 83 102
+- 84 88
+- 85 91
+- 86 58
+- 87 90
+- 88 71
+- 89 81
+- 90 97
+- 91 91
+- 92 110
+- 93 268
+- 94 1045
+- 95 60
+- 96 41
+- 97 45
+- 98 57
+- 99 31
+- 100 26
+- 101 40
+- 102 36
+- 103 38
+- 104 42
+- 105 31
+- 106 31
+- 107 29
+- 108 27
+- 109 40
+- 110 34
+- 111 29
+- 112 31
+- 113 18
+- 114 30
+- 115 26
+- 116 20
+- 117 33
+- 118 22
+- 119 35
+- 120 26
+- 121 28
+- 122 18
+- 123 19
+- 124 29
+- 125 23
+- 126 17
+- 127 18
+- 128 17
+- 129 15
+- 130 16
+- 131 16
+- 132 20
+- 133 26
+- 134 14
+- 135 20
+- 136 16
+- 137 15
+- 138 6
+- 139 10
+- 140 13
+- 141 5
+- 142 8
+- 143 8
+- 144 13
+- 145 9
+- 146 7
+- 147 7
+- 148 6
+- 149 11
+- 150 10
+- 151 6
+- 152 7
+- 153 10
+- 154 6
+- 155 2
+- 156 9
+- 157 2
+- 158 6
+- 159 4
+- 160 3
+- 161 3
+- 162 7
+- 163 5
+- 164 3
+- 165 4
+- 166 2
+- 167 3
+- 168 4
+- 169 4
+- 170 5
+- 171 3
+- 173 5
+- 175 1
+- 176 1
+- 177 2
+- 178 4
+- 179 1
+- 181 3
+- 182 2
+- 185 1
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/misincorporation.txt
new file mode 100644
index 0000000..e8c6af0
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_CTTGTA/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 1345 1257 1595 3141 7338 24 44 13 75 37 142 15 43 18 99 12 30 0 0 0 0 0 0 0 0 0
+* 3p + 2 2350 1260 1305 2423 7338 40 23 18 46 10 68 10 44 25 164 7 35 0 0 0 0 0 0 0 0 0
+* 3p + 3 2412 1011 1383 2532 7338 19 23 22 34 11 66 4 40 19 133 3 19 0 0 2 0 0 0 0 0 0
+* 3p + 4 2433 1053 1331 2520 7337 16 18 13 32 5 36 6 28 10 105 2 18 0 2 2 2 1 0 0 0 0
+* 3p + 5 2428 1143 1343 2421 7335 23 16 9 32 7 26 4 16 12 60 1 4 1 2 0 0 0 0 1 2 0
+* 3p + 6 2551 1068 1386 2323 7328 9 6 11 27 6 16 1 15 3 37 1 7 0 0 0 0 6 1 2 1 0
+* 3p + 7 2309 1224 1352 2433 7318 9 9 9 32 8 16 0 5 6 24 1 2 0 3 0 0 7 6 6 1 0
+* 3p + 8 2535 1025 1495 2263 7318 6 5 5 26 7 8 3 4 2 20 0 2 3 2 0 0 9 9 0 2 0
+* 3p + 9 2563 1045 1484 2226 7318 7 2 4 24 2 7 3 4 5 8 1 1 0 6 2 3 13 4 1 2 0
+* 3p + 10 2354 1207 1481 2281 7323 8 5 6 27 0 5 1 1 4 7 2 2 0 5 2 0 11 3 1 0 0
+* 3p + 11 2324 1049 1602 2356 7331 8 4 5 41 5 4 3 1 3 3 1 0 1 1 1 0 5 2 0 0 0
+* 3p + 12 2444 1084 1521 2287 7336 5 1 8 16 2 9 2 1 3 7 0 1 0 0 0 0 2 0 0 0 0
+* 3p + 13 2369 1169 1507 2289 7334 1 3 10 19 0 3 0 2 0 2 3 1 0 0 1 0 2 2 0 0 0
+* 3p + 14 2475 1018 1589 2254 7336 6 0 11 23 3 1 0 2 2 2 2 3 0 1 1 0 2 0 0 0 0
+* 3p + 15 2381 1061 1532 2300 7274 5 2 8 18 1 1 0 0 4 3 0 2 0 0 0 0 0 1 0 0 0
+* 3p + 16 2436 1129 1441 2331 7337 1 1 2 16 2 0 0 0 1 3 0 2 0 1 0 0 1 0 0 0 0
+* 3p + 17 2417 1114 1559 2248 7338 1 1 4 24 3 1 1 0 0 2 0 1 0 1 0 0 0 0 0 0 0
+* 3p + 18 2423 1098 1420 2396 7337 7 2 3 17 2 2 0 1 2 2 2 0 1 3 1 0 0 0 0 1 0
+* 3p + 19 2462 1200 1503 2172 7337 2 2 3 14 1 2 0 0 4 6 0 4 0 0 1 0 1 0 0 0 0
+* 3p + 20 2430 1148 1483 2275 7336 6 3 5 19 1 1 1 1 0 4 0 0 0 1 0 1 2 0 0 0 0
+* 3p + 21 2361 1151 1452 2371 7335 6 4 4 16 2 0 0 0 1 5 1 1 0 1 0 0 1 0 0 0 0
+* 3p + 22 2313 1206 1582 2233 7334 2 5 7 16 0 1 0 3 1 2 0 1 0 2 0 0 1 0 0 0 0
+* 3p + 23 2387 1179 1561 2209 7336 2 2 10 16 1 0 2 2 1 1 1 6 0 0 0 0 0 1 0 0 0
+* 3p + 24 2477 1060 1404 2395 7336 1 4 6 20 0 1 0 2 4 2 1 1 1 0 0 0 0 0 0 0 0
+* 3p + 25 2276 1086 1594 2379 7335 8 1 8 22 0 1 0 0 2 2 0 0 0 1 0 0 0 0 0 0 0
+* 3p + 26 2248 1171 1574 2335 7328 6 0 4 12 0 2 1 1 0 1 0 0 0 1 0 0 0 0 1 0 0
+* 3p + 27 2341 1116 1448 2412 7317 5 2 1 16 2 0 1 0 2 1 0 3 0 1 0 0 1 0 0 0 0
+* 3p + 28 2268 1201 1580 2261 7310 6 3 8 16 0 0 1 5 7 1 0 1 0 2 0 0 2 0 0 0 0
+* 3p + 29 2413 1107 1499 2286 7305 6 0 3 23 5 0 0 3 5 4 1 1 0 0 0 0 0 0 0 0 0
+* 3p + 30 2389 1150 1448 2312 7299 7 3 5 14 3 0 0 4 3 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 31 2315 1205 1462 2317 7299 4 4 8 16 0 0 0 2 2 1 2 1 0 0 0 0 0 0 0 0 0
+* 3p + 32 2327 1116 1519 2327 7289 1 5 3 17 3 3 0 3 3 2 0 0 0 1 0 0 0 0 0 0 0
+* 3p + 33 2290 1145 1527 2318 7280 4 1 14 15 1 1 1 3 1 1 0 2 1 3 0 0 0 0 0 0 0
+* 3p + 34 2334 1108 1490 2337 7269 1 2 5 18 3 0 0 1 3 0 0 3 0 0 0 0 1 0 0 0 0
+* 3p + 35 2381 1083 1607 2187 7258 3 2 6 16 0 1 0 4 3 1 1 3 2 0 0 0 0 0 0 0 0
+* 3p + 36 2264 1137 1463 2389 7253 3 3 5 12 3 1 0 1 0 2 1 3 0 0 0 0 0 0 0 0 0
+* 3p + 37 2153 1281 1502 2310 7246 7 2 9 11 0 0 0 0 2 0 0 1 0 0 0 1 0 0 0 0 0
+* 3p + 38 2367 1165 1469 2221 7222 4 0 6 10 1 0 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0
+* 3p + 39 2237 1094 1464 2404 7199 6 0 6 19 0 1 0 5 4 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 40 2292 1233 1485 2165 7175 9 3 4 12 1 2 1 3 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 41 2230 1163 1613 2139 7145 4 3 11 12 2 1 0 6 1 0 1 1 0 0 0 0 0 0 0 0 0
+* 3p + 42 2307 1109 1447 2260 7123 5 1 5 12 3 0 0 5 0 3 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 43 2332 1177 1453 2125 7087 0 3 3 6 1 1 0 0 0 1 0 1 0 1 0 0 0 0 0 0 0
+* 3p + 44 2300 1143 1488 2124 7055 11 4 3 14 1 0 0 1 2 1 0 1 1 1 0 0 0 0 0 0 0
+* 3p + 45 2390 1058 1355 2218 7021 12 5 14 14 7 2 1 0 3 1 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 46 2289 1156 1424 2098 6967 3 4 6 12 4 0 0 10 0 1 1 4 0 0 0 0 0 0 0 0 0
+* 3p + 47 2277 1073 1463 2115 6928 3 3 7 13 3 0 0 11 0 1 0 4 0 0 0 0 0 0 0 0 0
+* 3p + 48 2176 1032 1371 2305 6884 1 2 4 15 0 0 0 2 0 2 0 2 1 0 0 0 0 0 0 0 0
+* 3p + 49 2113 1195 1459 2067 6834 3 3 5 9 2 0 0 3 1 2 1 0 0 1 0 0 0 0 0 0 0
+* 3p + 50 2073 1124 1497 2081 6775 1 1 6 13 3 0 0 0 3 1 0 0 0 2 0 0 0 0 0 0 0
+* 3p + 51 2215 1086 1328 2088 6717 2 5 4 4 0 1 0 4 0 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 52 2075 1085 1422 2073 6655 1 2 5 5 5 0 1 0 0 1 0 0 0 0 0 0 1 0 0 0 0
+* 3p + 53 1977 1113 1501 1998 6589 1 4 9 5 4 0 0 0 0 4 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 54 2080 1062 1284 2089 6515 0 1 5 16 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 55 2046 1041 1338 2007 6432 1 2 3 16 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 56 2070 1051 1333 1881 6335 1 3 5 5 2 1 0 0 1 2 0 3 0 1 0 0 0 0 0 0 0
+* 3p + 57 2057 999 1206 1995 6257 4 2 8 11 0 0 0 0 2 1 1 2 0 0 0 0 0 0 0 0 0
+* 3p + 58 2028 1073 1158 1918 6177 1 9 7 13 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 59 2002 943 1254 1888 6087 1 3 2 8 0 0 0 1 2 0 1 0 0 2 0 0 0 0 0 0 0
+* 3p + 60 1961 898 1169 1971 5999 1 1 3 9 1 0 1 0 1 1 1 0 1 1 0 0 0 0 0 0 0
+* 3p + 61 1997 937 1135 1825 5894 1 2 6 3 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 62 1855 926 1164 1863 5808 1 5 6 6 1 1 0 0 0 3 0 3 0 1 0 0 0 0 0 0 0
+* 3p + 63 1961 855 1086 1810 5712 1 4 7 11 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 64 1819 956 1058 1792 5625 0 2 5 10 2 1 0 1 0 0 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 65 1796 891 1089 1745 5521 1 5 2 10 1 0 0 0 0 0 1 3 0 0 0 0 0 0 0 0 0
+* 3p + 66 1778 865 1083 1702 5428 2 1 7 6 1 0 0 0 1 1 0 1 1 1 0 0 0 0 0 0 0
+* 3p + 67 1719 887 1116 1597 5319 0 1 7 6 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 68 1736 857 1054 1579 5226 0 1 5 9 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0
+* 3p + 69 1696 812 949 1650 5107 1 4 4 5 3 1 0 1 1 0 1 1 0 0 0 0 0 0 0 0 0
+* 3p + 70 1634 853 938 1595 5020 0 2 4 5 1 1 0 0 0 0 2 4 1 0 0 0 0 0 0 0 0
+* 3p - 1 1381 1715 1247 3112 7455 22 37 21 57 37 153 8 29 13 88 20 34 0 0 0 0 0 0 0 0 1
+* 3p - 2 2519 1543 925 2468 7455 38 21 24 49 17 69 8 43 25 159 3 19 0 0 0 0 0 0 0 0 1
+* 3p - 3 2438 1232 1074 2703 7447 22 14 21 45 16 49 3 31 16 141 8 16 0 1 1 0 2 5 1 0 1
+* 3p - 4 2332 1331 1251 2534 7448 21 15 30 32 12 29 4 12 17 108 4 12 0 2 0 0 4 0 1 1 0
+* 3p - 5 2385 1410 1172 2480 7447 10 23 17 33 8 26 1 14 12 59 1 10 1 3 0 1 4 2 0 2 0
+* 3p - 6 2538 1253 1107 2546 7444 8 10 6 30 10 16 0 7 6 56 2 6 1 2 0 1 5 3 1 2 0
+* 3p - 7 2237 1461 1226 2514 7438 11 4 13 27 9 8 1 8 6 36 1 2 0 6 0 0 9 3 1 3 0
+* 3p - 8 2501 1394 1286 2259 7440 9 7 10 19 2 7 0 3 6 15 1 2 0 1 0 1 9 3 0 3 0
+* 3p - 9 2527 1427 1248 2247 7449 8 4 10 20 0 1 2 1 5 15 2 6 0 2 0 2 2 2 1 1 0
+* 3p - 10 2397 1572 1260 2224 7453 3 3 11 19 0 3 0 0 2 6 1 1 2 5 0 1 2 0 0 0 0
+* 3p - 11 2456 1429 1218 2351 7454 0 9 7 14 4 0 2 0 3 8 0 4 1 2 0 1 1 0 0 0 0
+* 3p - 12 2361 1432 1171 2490 7454 8 1 8 13 3 3 0 1 1 6 0 4 0 0 0 1 0 0 0 1 0
+* 3p - 13 2382 1482 1277 2312 7453 0 2 5 12 2 1 0 2 4 6 1 1 0 2 0 2 1 0 0 0 0
+* 3p - 14 2368 1489 1246 2352 7455 9 4 13 10 3 2 0 0 0 4 1 0 2 0 2 0 0 0 0 0 0
+* 3p - 15 2454 1378 1210 2354 7396 2 2 11 15 3 0 2 0 1 1 1 6 1 0 0 0 0 0 0 0 0
+* 3p - 16 2368 1536 1174 2375 7453 4 1 7 15 0 1 2 3 1 1 0 4 0 1 0 0 1 1 0 0 0
+* 3p - 17 2418 1504 1238 2294 7454 4 6 5 21 1 0 0 0 2 4 1 10 0 0 0 0 0 1 0 0 0
+* 3p - 18 2388 1505 1281 2281 7455 4 4 4 24 0 0 0 2 2 4 1 2 2 0 0 0 0 0 0 0 0
+* 3p - 19 2402 1528 1269 2255 7454 2 8 7 26 0 3 2 0 1 1 0 10 0 0 0 0 0 0 0 0 0
+* 3p - 20 2429 1435 1327 2263 7454 5 1 4 9 0 0 0 1 3 2 0 24 0 0 0 0 0 1 0 0 0
+* 3p - 21 2405 1437 1203 2404 7449 5 11 8 23 1 0 0 0 3 2 0 2 1 2 0 0 0 5 0 1 0
+* 3p - 22 2298 1555 1231 2362 7446 2 23 7 20 1 1 0 1 4 1 0 2 0 1 0 0 0 2 1 0 0
+* 3p - 23 2348 1429 1234 2442 7453 3 1 3 24 1 3 1 0 1 3 0 4 0 0 0 0 0 0 0 0 0
+* 3p - 24 2383 1415 1258 2399 7455 1 4 4 18 0 0 1 0 1 3 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 25 2259 1580 1218 2394 7451 6 5 10 15 5 0 0 2 2 6 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 26 2287 1602 1237 2312 7438 1 2 11 8 1 0 2 0 2 4 0 5 0 1 0 0 0 2 0 0 0
+* 3p - 27 2440 1496 1146 2346 7428 2 1 9 18 0 2 0 0 1 2 0 4 0 0 0 0 0 2 0 0 0
+* 3p - 28 2336 1481 1290 2319 7426 1 6 3 14 0 3 2 0 4 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 29 2371 1502 1304 2240 7417 3 4 4 14 0 1 1 0 4 4 1 2 0 0 0 0 0 0 0 0 0
+* 3p - 30 2414 1487 1228 2285 7414 1 7 10 13 1 0 0 1 5 4 1 1 0 1 0 0 0 0 0 0 0
+* 3p - 31 2342 1539 1248 2284 7413 6 0 6 10 2 1 0 0 1 5 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 32 2300 1491 1176 2436 7403 4 1 14 16 0 3 0 0 6 2 0 1 0 1 0 0 0 0 0 0 0
+* 3p - 33 2338 1605 1123 2329 7395 1 0 6 22 1 2 0 0 5 1 0 4 0 1 0 0 0 0 0 0 0
+* 3p - 34 2362 1571 1170 2284 7387 1 1 12 16 1 0 0 0 1 0 0 2 0 1 0 0 0 0 0 0 0
+* 3p - 35 2408 1511 1153 2300 7372 0 0 10 16 0 2 1 0 3 3 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 36 2410 1554 1081 2319 7364 0 3 10 19 0 2 0 0 0 6 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 37 2316 1619 1181 2235 7351 1 4 9 15 2 3 0 0 2 0 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 38 2296 1544 1171 2331 7342 4 2 10 16 1 3 1 1 1 1 2 1 0 1 0 0 0 0 0 0 0
+* 3p - 39 2260 1546 1134 2384 7324 1 5 12 14 0 1 1 1 1 3 0 0 1 1 0 0 0 0 0 0 0
+* 3p - 40 2233 1572 1124 2372 7301 2 3 9 18 3 1 0 2 0 4 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 41 2269 1475 1135 2393 7272 6 1 7 13 2 2 1 0 2 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 42 2432 1354 1133 2342 7261 8 5 10 14 3 4 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 43 2282 1498 1120 2320 7220 1 2 9 19 1 2 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 44 2200 1439 1218 2318 7175 1 1 5 19 0 2 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0
+* 3p - 45 2338 1367 1111 2318 7134 0 2 10 8 4 0 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 46 2246 1502 1184 2155 7087 5 1 7 5 1 3 0 0 2 3 1 2 0 0 0 0 0 0 0 0 0
+* 3p - 47 2292 1431 1124 2180 7027 2 1 6 12 0 5 0 0 1 4 0 1 0 0 0 0 1 0 0 0 0
+* 3p - 48 2148 1563 1009 2258 6978 1 0 12 8 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0
+* 3p - 49 2099 1511 1081 2229 6920 2 2 11 9 0 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 50 2221 1449 1106 2082 6858 1 5 3 5 0 2 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0
+* 3p - 51 2176 1398 1081 2147 6802 1 0 9 6 3 0 0 0 1 0 0 2 1 0 0 0 0 0 0 0 0
+* 3p - 52 2047 1412 1132 2138 6729 2 2 6 11 1 2 0 0 0 0 0 4 0 0 0 0 0 0 0 0 0
+* 3p - 53 2213 1264 1076 2106 6659 0 2 10 16 0 0 0 0 0 1 0 2 0 0 0 0 0 2 0 0 0
+* 3p - 54 2143 1367 990 2084 6584 0 4 7 8 0 0 0 0 0 0 0 2 0 1 0 0 0 1 0 0 0
+* 3p - 55 1951 1483 1083 1980 6497 2 2 5 5 0 0 0 0 1 1 0 2 0 1 0 0 0 0 0 0 0
+* 3p - 56 2030 1339 1024 2019 6412 2 1 1 13 0 0 0 0 1 0 2 1 0 0 0 0 0 0 0 0 0
+* 3p - 57 1938 1330 965 2093 6326 2 0 9 8 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 58 1893 1317 1020 2021 6251 6 3 6 6 0 2 1 0 2 0 2 2 0 0 0 2 0 0 0 0 0
+* 3p - 59 1951 1277 868 2063 6159 1 2 4 8 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 60 2008 1205 891 1951 6055 0 2 5 5 2 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 61 1918 1195 914 1929 5956 3 4 4 10 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 62 1834 1118 981 1941 5874 1 5 2 11 1 1 0 0 2 0 2 1 0 0 0 0 1 0 0 0 0
+* 3p - 63 1837 1128 928 1858 5751 0 1 3 13 0 0 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 64 1766 1098 936 1831 5631 1 5 4 11 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 65 1731 1073 935 1787 5526 1 3 3 3 0 0 0 0 0 1 0 0 0 2 0 0 0 0 0 0 0
+* 3p - 66 1730 1086 802 1797 5415 3 2 2 6 0 1 0 0 3 0 0 0 1 2 0 0 1 0 0 0 0
+* 3p - 67 1764 1038 851 1635 5288 1 2 7 6 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 68 1723 1032 783 1632 5170 0 0 2 8 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 69 1737 933 749 1620 5039 0 1 5 3 0 2 0 0 1 3 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 70 1584 964 820 1575 4943 2 7 2 8 0 1 0 0 0 1 1 3 0 0 0 0 0 0 0 0 0
+* 5p + 1 3300 1210 1545 1283 7338 2 5 13 3 1 1 1 0 2 3 1 1 0 0 0 0 0 0 0 0 0
+* 5p + 2 2281 871 1457 2729 7338 1 9 19 8 0 5 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 3 2536 1165 1254 2382 7337 0 8 22 10 5 1 2 3 3 1 2 2 1 1 0 0 0 1 0 0 0
+* 5p + 4 2461 1236 1255 2386 7338 2 5 4 18 1 2 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 5 2425 1158 1365 2390 7338 3 2 6 7 2 3 0 1 1 2 1 1 0 0 1 0 0 0 0 0 0
+* 5p + 6 2513 1154 1307 2363 7337 1 5 14 13 2 0 0 4 3 3 2 0 0 1 0 0 0 0 1 0 0
+* 5p + 7 2536 1134 1469 2198 7337 0 7 7 10 4 0 0 2 1 0 1 1 0 0 0 1 1 0 0 0 0
+* 5p + 8 2332 1231 1273 2500 7336 3 7 8 20 2 0 0 0 1 0 0 2 0 0 1 0 1 0 0 1 0
+* 5p + 9 2171 1221 1348 2598 7338 1 2 14 12 1 0 0 2 0 0 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 10 2233 1233 1491 2380 7337 2 2 3 14 0 0 0 1 2 3 0 1 0 0 0 0 0 1 0 0 0
+* 5p + 11 2374 1021 1442 2500 7337 5 3 6 16 3 2 2 2 3 4 0 4 0 0 0 0 0 1 0 0 0
+* 5p + 12 2431 1165 1436 2306 7338 1 1 7 9 2 0 0 7 5 2 0 3 0 0 0 0 0 0 0 0 0
+* 5p + 13 2215 1230 1498 2395 7338 2 1 7 10 0 4 0 0 1 0 0 2 1 0 1 0 0 0 0 0 0
+* 5p + 14 2319 1168 1463 2387 7337 9 3 11 17 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 0
+* 5p + 15 2365 1160 1406 2401 7332 1 2 8 19 4 0 0 6 3 1 0 3 0 2 0 0 1 0 0 0 0
+* 5p + 16 2420 1125 1433 2359 7337 1 3 7 11 3 0 0 5 2 2 0 1 1 0 0 0 0 1 0 0 0
+* 5p + 17 2423 1164 1352 2397 7336 6 5 7 8 1 0 0 8 0 1 0 0 0 0 0 0 1 0 1 0 0
+* 5p + 18 2282 1172 1495 2389 7338 6 3 11 7 0 2 0 2 0 5 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 19 2322 1198 1476 2341 7337 7 1 1 14 3 1 0 10 1 0 0 3 0 1 0 0 0 0 1 0 0
+* 5p + 20 2350 1188 1416 2384 7338 2 0 5 15 5 1 0 17 2 3 0 0 0 1 0 0 0 0 0 0 0
+* 5p + 21 2421 1191 1418 2308 7338 9 3 4 16 2 3 0 0 1 0 0 5 0 2 0 0 0 0 0 0 0
+* 5p + 22 2308 1243 1550 2237 7338 16 2 8 16 0 2 0 2 1 1 1 0 0 0 0 0 0 0 0 0 0
+* 5p + 23 2377 1193 1449 2319 7338 1 3 8 12 2 1 0 3 2 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 24 2325 1204 1518 2290 7337 2 3 7 9 0 0 1 0 1 0 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 25 2367 1190 1461 2320 7338 4 2 9 16 1 3 0 3 0 3 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 26 2289 1163 1548 2329 7329 0 0 5 16 0 2 1 1 1 1 0 3 0 0 0 0 0 0 0 0 0
+* 5p + 27 2324 1088 1483 2423 7318 7 2 2 16 1 0 0 5 2 0 1 2 0 0 0 0 0 0 0 0 0
+* 5p + 28 2356 1240 1432 2283 7311 1 1 9 10 2 0 0 2 0 1 2 1 0 0 0 0 0 0 0 0 0
+* 5p + 29 2337 1192 1447 2329 7305 5 2 11 16 1 0 0 1 1 3 1 1 0 1 0 0 0 0 0 0 0
+* 5p + 30 2304 1230 1474 2295 7303 3 3 4 16 5 1 0 1 2 3 0 0 1 0 0 0 0 0 0 0 0
+* 5p + 31 2335 1251 1457 2256 7299 0 1 11 11 1 1 0 0 1 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 32 2437 1153 1491 2208 7289 3 3 12 22 4 2 0 1 0 3 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 33 2365 1163 1524 2226 7278 0 1 7 10 1 2 0 1 4 3 0 3 2 0 0 0 0 2 0 0 0
+* 5p + 34 2399 1176 1458 2237 7270 4 5 9 19 2 1 0 1 4 1 0 2 1 0 0 0 0 0 0 0 0
+* 5p + 35 2346 1063 1466 2382 7257 4 4 5 15 2 4 1 0 1 5 0 3 0 0 0 0 1 0 0 0 0
+* 5p + 36 2431 1203 1463 2156 7253 0 0 8 25 1 1 1 2 2 2 1 1 1 0 0 0 0 0 0 0 0
+* 5p + 37 2375 1204 1522 2145 7246 0 2 7 12 2 1 0 3 1 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 38 2368 1160 1418 2276 7222 2 5 6 18 0 3 1 0 2 2 0 3 0 0 0 0 0 0 0 0 0
+* 5p + 39 2384 1146 1455 2213 7198 2 3 8 17 0 0 0 1 1 1 0 2 2 1 0 0 0 0 0 1 0
+* 5p + 40 2314 1111 1512 2237 7174 0 2 4 9 1 0 2 1 0 2 0 1 0 1 0 0 1 0 0 0 0
+* 5p + 41 2358 1132 1390 2264 7144 1 4 1 18 0 0 0 2 1 4 1 2 0 2 0 0 1 0 0 0 0
+* 5p + 42 2361 1109 1343 2309 7122 3 9 3 16 1 1 1 2 3 3 0 1 0 2 0 0 0 0 0 1 0
+* 5p + 43 2233 1112 1443 2298 7086 3 3 2 19 2 0 0 2 2 7 0 1 0 0 0 1 0 1 0 0 0
+* 5p + 44 2224 1145 1395 2291 7055 4 1 5 13 2 1 1 3 1 3 2 3 0 0 0 0 0 0 0 0 0
+* 5p + 45 2316 1138 1407 2160 7021 5 3 10 27 4 2 0 2 1 5 1 2 0 0 0 0 0 0 0 0 0
+* 5p + 46 2180 1135 1450 2201 6966 2 1 1 19 0 3 0 1 1 10 0 0 0 0 0 0 1 0 0 0 0
+* 5p + 47 2195 1115 1476 2140 6926 0 6 6 17 2 3 1 1 3 2 0 3 0 0 0 0 1 0 1 0 0
+* 5p + 48 2210 1028 1517 2129 6884 5 3 4 14 1 0 1 3 1 8 0 2 1 0 0 0 0 0 0 0 0
+* 5p + 49 2180 1087 1416 2151 6834 1 5 7 11 5 1 0 0 0 4 1 1 0 0 0 0 0 0 0 0 0
+* 5p + 50 2150 1044 1384 2197 6775 7 3 4 12 2 3 1 1 1 9 0 0 3 1 0 0 0 0 0 0 0
+* 5p + 51 2094 1085 1441 2097 6717 3 4 1 16 1 3 0 4 0 9 0 5 0 3 0 0 0 0 0 0 0
+* 5p + 52 2146 1020 1367 2122 6655 2 2 7 15 3 7 0 0 2 16 0 2 0 1 0 0 1 0 0 0 0
+* 5p + 53 2014 1060 1324 2189 6587 6 0 5 18 2 4 0 2 2 12 0 1 0 0 0 0 2 0 0 0 0
+* 5p + 54 1971 1023 1390 2130 6514 3 3 0 10 4 9 0 4 2 11 1 3 0 1 1 0 1 0 0 0 0
+* 5p + 55 2078 1052 1412 1890 6432 5 6 6 13 6 5 0 8 3 12 0 3 0 0 1 0 0 0 0 0 0
+* 5p + 56 2050 984 1309 1991 6334 2 3 6 11 4 5 1 1 1 5 0 3 0 0 0 0 1 0 0 0 0
+* 5p + 57 2093 988 1276 1898 6255 4 4 6 8 4 7 0 2 1 10 0 0 0 0 0 0 2 0 0 0 0
+* 5p + 58 1916 1020 1375 1864 6175 1 1 7 16 1 9 0 3 3 11 1 3 0 0 0 0 0 1 1 0 0
+* 5p + 59 1932 935 1207 2011 6085 7 7 4 12 1 5 0 3 2 10 3 9 0 0 0 0 2 0 0 0 0
+* 5p + 60 1909 936 1242 1908 5995 2 1 1 19 2 5 0 4 0 13 0 2 1 2 0 0 4 0 0 0 0
+* 5p + 61 1915 936 1204 1835 5890 2 5 6 12 2 6 4 2 4 4 2 5 0 0 0 0 1 2 1 0 0
+* 5p + 62 1912 887 1134 1874 5807 5 3 1 20 2 5 1 5 4 13 1 4 0 0 0 0 0 0 1 0 0
+* 5p + 63 1860 904 1130 1816 5710 3 4 3 16 2 10 1 0 2 12 0 5 0 0 0 0 1 1 0 0 0
+* 5p + 64 1838 960 1087 1736 5621 6 2 6 12 2 8 0 6 3 11 2 2 0 0 0 0 1 0 2 1 0
+* 5p + 65 1751 853 1129 1784 5517 5 5 3 17 1 11 3 5 4 19 1 3 0 0 0 0 2 2 0 0 0
+* 5p + 66 1763 813 1111 1740 5427 8 5 4 10 5 11 0 2 5 11 1 3 1 1 0 1 1 0 0 0 0
+* 5p + 67 1612 864 1073 1770 5319 5 2 7 16 2 4 2 4 1 14 2 3 0 0 0 0 0 0 0 0 0
+* 5p + 68 1715 757 995 1757 5224 5 3 1 15 7 14 1 0 5 16 2 3 1 0 0 0 2 0 0 0 0
+* 5p + 69 1622 779 1058 1633 5092 5 5 0 5 0 8 0 2 5 9 0 4 0 0 0 0 3 0 2 0 0
+* 5p + 70 1633 827 1032 1529 5021 7 3 2 14 6 8 5 1 1 12 2 7 0 0 0 0 0 0 0 0 0
+* 5p - 1 3205 1757 1257 1236 7455 3 7 15 2 1 5 3 0 1 3 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 2 2195 1299 1241 2720 7455 1 1 18 12 0 8 0 0 1 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 3 2534 1375 978 2568 7455 3 6 6 13 1 6 0 0 1 3 1 0 0 0 0 0 0 0 0 0 0
+* 5p - 4 2362 1421 1056 2613 7452 1 3 16 15 4 4 1 1 2 6 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 5 2420 1430 1046 2557 7453 0 3 9 13 1 0 2 0 1 7 0 2 0 0 0 0 0 0 1 1 0
+* 5p - 6 2424 1340 1102 2589 7455 0 5 9 15 0 0 0 0 2 1 0 1 0 0 0 1 0 0 0 0 0
+* 5p - 7 2442 1454 1219 2340 7455 1 6 17 10 0 1 0 0 1 1 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 8 2238 1600 992 2625 7455 0 3 8 14 1 1 0 0 1 1 0 2 0 1 0 0 0 0 0 0 0
+* 5p - 9 2331 1481 1023 2620 7455 1 1 19 11 1 1 0 0 1 0 0 0 0 1 0 0 0 0 0 0 0
+* 5p - 10 2294 1521 1221 2419 7455 8 5 12 7 0 1 0 0 2 2 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 11 2477 1590 1046 2342 7455 0 2 15 14 0 3 0 1 2 0 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 12 2385 1554 1080 2436 7455 0 3 10 18 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 13 2443 1569 1117 2325 7454 0 7 13 11 2 1 0 0 1 2 1 0 0 1 0 0 1 0 0 0 0
+* 5p - 14 2456 1560 1055 2384 7455 2 2 10 10 1 4 0 0 0 1 0 1 0 2 0 0 0 0 0 0 0
+* 5p - 15 2315 1496 1171 2467 7449 2 3 6 14 0 0 0 0 5 2 0 0 0 1 0 0 0 0 0 0 0
+* 5p - 16 2401 1444 1182 2428 7455 1 6 8 11 0 4 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 17 2300 1435 1186 2534 7455 2 3 9 12 0 3 1 1 3 1 0 2 0 1 0 0 0 0 0 0 0
+* 5p - 18 2405 1527 1181 2342 7455 0 11 11 15 0 1 1 0 2 2 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 19 2356 1531 1208 2359 7454 1 6 11 10 0 3 0 0 1 3 0 1 0 0 0 0 0 0 1 0 0
+* 5p - 20 2328 1547 1168 2410 7453 6 6 12 8 0 1 0 0 6 4 0 0 0 0 0 0 0 2 0 0 0
+* 5p - 21 2430 1447 1224 2352 7453 1 4 9 14 1 1 2 0 1 1 0 0 0 4 0 0 0 2 0 0 0
+* 5p - 22 2340 1604 1205 2306 7455 3 5 2 14 1 2 0 1 5 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 23 2317 1580 1089 2468 7454 1 2 6 15 1 0 1 1 0 7 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 24 2440 1484 1070 2461 7455 3 2 11 22 1 2 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 25 2397 1560 1137 2361 7455 2 6 20 13 1 1 1 1 0 1 0 1 1 0 0 0 0 0 0 0 0
+* 5p - 26 2374 1637 1237 2193 7441 0 6 16 15 3 1 0 0 1 0 0 2 0 2 0 0 0 1 0 0 0
+* 5p - 27 2407 1459 1194 2369 7429 0 1 11 8 1 1 0 1 2 1 0 6 1 0 0 0 0 1 0 0 0
+* 5p - 28 2379 1613 1126 2308 7426 2 4 8 17 3 1 0 0 2 1 0 6 0 0 0 0 0 0 0 0 0
+* 5p - 29 2278 1574 1184 2381 7417 3 9 8 18 2 1 1 1 3 4 0 2 0 1 0 1 0 1 0 0 0
+* 5p - 30 2281 1551 1127 2456 7415 0 5 10 16 1 2 1 1 2 0 1 7 0 0 0 0 0 0 0 0 0
+* 5p - 31 2348 1582 1135 2349 7414 2 5 8 12 0 1 0 0 5 2 1 4 0 0 0 0 0 0 0 0 0
+* 5p - 32 2365 1448 1134 2457 7404 2 1 9 15 0 2 0 0 4 2 0 1 0 1 0 0 0 0 0 0 0
+* 5p - 33 2306 1540 1173 2372 7391 6 6 5 13 2 1 1 0 1 1 0 3 0 0 0 0 2 2 0 0 0
+* 5p - 34 2444 1497 1144 2300 7385 1 2 5 8 0 1 1 1 2 4 0 0 0 1 0 0 0 2 0 0 0
+* 5p - 35 2303 1551 1143 2375 7372 3 3 9 12 3 0 0 0 2 4 1 7 0 0 0 0 0 0 0 0 0
+* 5p - 36 2366 1543 1175 2279 7363 4 3 7 11 1 1 0 0 2 1 0 1 0 0 0 0 0 0 0 1 0
+* 5p - 37 2353 1522 1297 2178 7350 7 6 4 11 2 3 0 0 1 0 1 2 0 0 0 0 0 1 0 0 0
+* 5p - 38 2282 1423 1241 2396 7342 4 5 10 14 1 0 0 0 1 2 1 2 0 1 0 1 0 1 0 0 0
+* 5p - 39 2350 1512 1186 2274 7322 3 6 7 16 1 1 0 0 2 4 1 7 0 0 0 0 0 1 0 0 0
+* 5p - 40 2279 1453 1233 2335 7300 2 3 6 10 1 2 1 0 9 2 0 8 0 0 0 0 0 0 0 0 0
+* 5p - 41 2295 1535 1113 2329 7272 5 4 5 15 0 0 0 0 1 3 0 5 0 0 0 0 0 0 0 0 0
+* 5p - 42 2323 1516 1182 2238 7259 6 5 8 16 3 1 0 0 3 2 0 6 0 1 0 0 1 1 0 0 0
+* 5p - 43 2210 1542 1224 2244 7220 1 4 1 9 0 0 0 3 1 5 2 3 0 0 0 0 0 0 0 0 0
+* 5p - 44 2217 1550 1080 2328 7175 7 12 11 14 2 1 0 2 1 6 1 4 1 1 0 0 0 0 1 0 0
+* 5p - 45 2257 1454 1128 2296 7135 5 15 6 18 4 6 0 2 3 11 1 2 0 0 0 0 0 0 0 0 0
+* 5p - 46 2157 1405 1189 2333 7084 5 6 8 15 2 8 1 1 5 11 1 12 1 2 0 0 0 2 0 0 0
+* 5p - 47 2291 1469 1114 2151 7025 5 8 3 17 0 6 0 2 2 5 1 11 0 0 0 0 0 3 0 0 0
+* 5p - 48 2350 1380 1110 2138 6978 3 4 5 16 4 0 0 1 6 8 0 4 0 0 0 0 0 0 0 0 0
+* 5p - 49 2180 1459 1235 2045 6919 7 10 2 14 1 4 1 0 1 9 1 11 1 2 0 0 2 0 0 0 0
+* 5p - 50 2166 1462 1139 2088 6855 3 0 7 10 5 7 1 5 2 11 2 1 0 1 0 0 0 2 1 0 0
+* 5p - 51 2117 1318 1177 2188 6800 6 4 7 5 0 10 1 5 3 5 3 4 2 1 0 0 2 0 0 0 0
+* 5p - 52 2066 1514 1007 2142 6729 7 3 7 10 0 3 0 4 1 10 0 4 1 0 0 0 0 0 0 0 0
+* 5p - 53 2043 1476 1061 2081 6661 6 4 6 23 3 5 1 1 2 13 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 54 2092 1305 1124 2064 6585 4 1 7 18 1 11 1 0 4 8 1 0 0 1 0 1 0 0 0 0 0
+* 5p - 55 1955 1421 1090 2029 6495 3 2 6 11 0 4 0 4 4 15 0 2 1 1 0 0 2 0 0 0 0
+* 5p - 56 1996 1295 1045 2074 6410 2 6 4 22 4 2 1 1 5 14 0 2 0 0 0 0 0 2 0 0 0
+* 5p - 57 2045 1227 1127 1925 6324 4 4 2 10 3 9 0 1 3 12 3 2 0 0 0 0 2 0 0 0 0
+* 5p - 58 2014 1198 1058 1980 6250 5 8 4 15 1 2 2 7 0 12 2 3 1 0 0 0 0 0 0 1 0
+* 5p - 59 1942 1256 945 2013 6156 7 6 6 22 4 13 0 1 4 13 3 1 0 0 0 0 1 2 0 0 0
+* 5p - 60 1855 1289 961 1949 6054 6 1 5 20 3 7 0 2 2 11 1 4 0 2 0 0 1 0 0 0 0
+* 5p - 61 1877 1230 953 1896 5956 7 3 6 17 0 7 0 1 6 9 4 5 0 0 0 0 0 0 0 0 0
+* 5p - 62 1865 1148 956 1905 5874 3 1 2 19 2 16 0 2 3 17 4 4 0 1 0 0 1 0 0 0 0
+* 5p - 63 1841 1116 902 1890 5749 0 10 4 14 1 10 2 0 3 21 1 6 0 0 0 0 2 0 0 0 0
+* 5p - 64 1746 1136 1003 1744 5629 2 5 8 10 2 7 1 5 7 13 1 3 0 1 0 0 0 2 0 0 0
+* 5p - 65 1725 1171 868 1758 5522 6 3 6 7 2 4 1 7 2 14 0 5 0 1 0 0 2 2 0 0 0
+* 5p - 66 1768 1052 863 1731 5414 2 3 9 13 0 9 0 0 4 9 4 2 0 1 0 0 2 0 0 0 0
+* 5p - 67 1621 1117 895 1654 5287 4 2 9 13 3 6 2 3 2 9 0 2 0 0 0 0 0 0 1 0 0
+* 5p - 68 1617 1057 821 1675 5170 12 6 1 12 2 5 2 0 2 16 0 3 1 0 0 0 0 0 0 0 0
+* 5p - 69 1619 963 839 1609 5030 3 4 5 14 4 4 0 4 1 11 0 2 0 0 0 0 1 0 0 0 0
+* 5p - 70 1572 931 844 1595 4942 5 4 7 10 4 3 0 4 1 10 2 1 0 2 0 1 1 0 0 0 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/3pGtoA_freq.txt
new file mode 100644
index 0000000..7aea3a0
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0193509973206311
+2 0.0416184971098266
+3 0.02856159669649
+4 0.0152621101526211
+5 0.0121662723893207
+6 0.00805322128851541
+7 0.00863557858376511
+8 0.00728770595690748
+9 0.00611719253058596
+10 0.00295469468154957
+11 0.00281954887218045
+12 0.00516462233699161
+13 0.00458575359217365
+14 0.00236826524570752
+15 0.00446713465220166
+16 0.00193986420950533
+17 0.00247908273938643
+18 0.00161917098445596
+19 0.00379987333755541
+20 0.00187852222917971
+21 0.00419354838709677
+22 0.00211992731677771
+23 0.00310462589257994
+24 0.00163185378590078
+25 0.0030441400304414
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/5pCtoT_freq.txt
new file mode 100644
index 0000000..d4d1b23
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.0032602252519265
+2 0.00356435643564356
+3 0.00370744860128076
+4 0.00166944908180301
+5 0.00383408853258975
+6 0.00174764068507515
+7 0.00240137221269297
+8 0.00161655350792111
+9 0.00297816015883521
+10 0.00224143451809158
+11 0.000315756236185665
+12 0.0016926201760325
+13 0.00278637770897833
+14 0.00309885342423303
+15 0.00131233595800525
+16 0.00274725274725275
+17 0.00125984251968504
+18 0.00257898130238556
+19 0.00190779014308426
+20 0.0028772378516624
+21 0.00282574568288854
+22 0.000941028858218319
+23 0.002195045468799
+24 0.00168690958164642
+25 0.0037359900373599
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..e8c1ffd
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Length_plot.pdf
new file mode 100644
index 0000000..9650046
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Runtime_log.txt
new file mode 100644
index 0000000..2f4c317
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-22 17:17:49,766 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1889_id_GGCTAC' -i - -d /home/mischu/scratch/bam_pipeline/fe516ea7-4e41-4794-858e-3c1d1ecc1d00 -r 000_prefixes/Pi_mito.fasta --downsample 100000
+2013-10-22 17:17:55,146 DEBUG main: BAM read in 8.035370 seconds
+2013-10-22 17:17:55,915 INFO main: Successful run
+2013-10-22 17:17:55,915 DEBUG main: Run completed in 8.804709 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..bbd5ee6
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.905965147979896,0.851317575572406
+"2",2,0.867460479996637,0.816338766279527
+"3",3,0.817973015757612,0.771786652684647
+"4",4,0.768385003671509,0.70219825147599
+"5",5,0.730009197977233,0.586752332607376
+"6",6,0.691724682833541,0.434020728527498
+"7",7,0.646986757526749,0.271355229864189
+"8",8,0.573577194777417,0.209654345718612
+"9",9,0.46908412610797,0.225500953336785
+"10",10,0.345507498627502,0.249252897222653
+"11",11,0.257166196120877,0.227327216244992
+"12",12,0.231834905127787,0.15894250401124
+"13",-12,0.13648397466852,0.241332291323199
+"14",-11,0.18178836541215,0.288474232562927
+"15",-10,0.236610059052078,0.345579716875271
+"16",-9,0.311504541321792,0.404975095932622
+"17",-8,0.405511145290554,0.46388090755715
+"18",-7,0.502760264843111,0.527229799638339
+"19",-6,0.596037221668338,0.593479266966717
+"20",-5,0.672379055875483,0.667405152706348
+"21",-4,0.727311141495191,0.746403883350606
+"22",-3,0.776178321931103,0.81150118755555
+"23",-2,0.835893862373825,0.851872184647484
+"24",-1,0.895449473284666,0.869933342105769
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..68e233e
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..40444a3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0205161884355125,0.000217338297947547,0.0978693431282681,0.283717983858262,1.1897927272809,-6414.43407814038
+"Std.",0.000524232548061831,0.000204087542428916,0.00700313518340658,0.0151763348893451,0.0346226804150627,1.67557227883458
+"Acceptance ratio",0.25316,0.16714,0.27792,0.16824,0.16512,0.68682
+"0%",0.0184652921662282,2.68332083447158e-08,0.0774861888924155,0.232908896826712,1.0620208745594,-6426.97352037773
+"2.5%",0.0194864730718651,5.98616191734854e-06,0.0845800413837175,0.254608325817271,1.12377177508404,-6418.44592809611
+"5%",0.0196617745017828,1.1400713178174e-05,0.0865765823677068,0.258543873032978,1.13338118359316,-6417.55659684848
+"7.5%",0.0197731292071563,1.74741154780339e-05,0.0877993329906973,0.261731720260186,1.14003233149101,-6417.04351138843
+"10%",0.0198521026806183,2.26828315759433e-05,0.0888270396500504,0.264456229531059,1.14583163400737,-6416.66286224342
+"12.5%",0.0199203084774116,2.91266246704717e-05,0.0897931775469507,0.266483266815511,1.14996834762385,-6416.35807853826
+"15%",0.0199752235672,3.52112891197411e-05,0.0906199266624736,0.267878650528616,1.15412969105962,-6416.10754620254
+"17.5%",0.0200274736347861,4.16028151723156e-05,0.0913299497145145,0.269389354770449,1.15797526956607,-6415.87994855439
+"20%",0.0200794654003875,4.99604756880719e-05,0.0919237572228352,0.271018006941138,1.16082657497433,-6415.67777722978
+"22.5%",0.0201178101027408,5.77357866605223e-05,0.0924554049760004,0.272258247011929,1.16350048282079,-6415.49158172124
+"25%",0.0201635185638628,6.64929417037029e-05,0.0930178404011588,0.273457696134894,1.16631992142566,-6415.31860165032
+"27.5%",0.0202032811808304,7.42046743645067e-05,0.0935525853164335,0.274519115033599,1.16915367479191,-6415.17338298519
+"30%",0.0202401302191239,8.1939496408924e-05,0.0940719312647253,0.275747424519762,1.17195993381932,-6415.03464605372
+"32.5%",0.0202802113543834,9.06801119292253e-05,0.0945078305210888,0.276705721530609,1.174216670404,-6414.90312884779
+"35%",0.0203174967083556,9.76412045968827e-05,0.0949662892898128,0.277740747640362,1.17629199848308,-6414.78178072557
+"37.5%",0.0203473718288057,0.000107697882783618,0.095430055980412,0.278800752290352,1.17837295662974,-6414.66089116107
+"40%",0.0203825567742259,0.000117093833941146,0.0958758191106723,0.279648389881463,1.18072083139856,-6414.5490756114
+"42.5%",0.0204164211401381,0.000127847981834778,0.0963444534882712,0.280596393076576,1.18270540934206,-6414.43923523098
+"45%",0.020448156276053,0.000138634214768741,0.0968034536792386,0.281523173591913,1.18451599542915,-6414.32753994843
+"47.5%",0.0204766687662758,0.000149156657475876,0.0972584663171536,0.282625760498764,1.18652091286537,-6414.22456176835
+"50%",0.0205115918766186,0.000157916620995483,0.0977203980144479,0.283699397531429,1.1886161123303,-6414.13113463303
+"52.5%",0.0205439586080537,0.000169309229224945,0.098170434109583,0.284609927388761,1.19072287670455,-6414.02771609562
+"55%",0.0205788053045507,0.000181544835178489,0.0986288413763532,0.285616440528215,1.19291810658572,-6413.93392711309
+"57.5%",0.02061205545735,0.000197693266816523,0.0991139548888438,0.286417430813922,1.19542089801714,-6413.83894330637
+"60%",0.0206457236528958,0.000210486032310795,0.099523773050982,0.287473810641915,1.19764046175828,-6413.74772264986
+"62.5%",0.020680708043155,0.000224090713328748,0.0999918394685343,0.288533007603875,1.20029896866791,-6413.65614212244
+"65%",0.0207147358909581,0.000235681823928401,0.100451038626812,0.289444333302273,1.20265513070446,-6413.56935076401
+"67.5%",0.0207501550776515,0.000250538541646508,0.101007824884019,0.290432262318096,1.20512621854582,-6413.47960817108
+"70%",0.0207863331519099,0.000269100716069799,0.101543404338247,0.291616488827715,1.20763174840714,-6413.38924070616
+"72.5%",0.0208257544421896,0.000288549459151361,0.102095612227125,0.292678798749594,1.21013373926005,-6413.29382964648
+"75%",0.0208667266475668,0.000308840910354313,0.102581141131299,0.293738516277511,1.21291246030208,-6413.20693314712
+"77.5%",0.0209071338598845,0.000328602363180549,0.103102281882723,0.294837892074003,1.21591756614114,-6413.12192057722
+"80%",0.0209560004047132,0.000355054662916934,0.10378732749111,0.2961825409853,1.21892256737293,-6413.01947788728
+"82.5%",0.0210031200595306,0.000381078261185367,0.104501100693847,0.297563303524118,1.22203320654086,-6412.91666666559
+"85%",0.0210635252561187,0.000410832396959081,0.105238680242157,0.299241126164095,1.22577247430464,-6412.81960099963
+"87.5%",0.0211203554391525,0.000442783686967416,0.105997829528016,0.301290419673096,1.2294809136525,-6412.71335408889
+"90%",0.021200518396601,0.000484784851187943,0.106925764289224,0.30321804136332,1.23448473028421,-6412.59079110914
+"92.5%",0.0212850325880741,0.000548022561381344,0.108139693880442,0.305912093991245,1.23980898489583,-6412.45634119677
+"95%",0.0213815770744961,0.000626011552755668,0.109464234359763,0.309246324646528,1.24735972597727,-6412.31109307082
+"97.5%",0.0215421398290373,0.00076057428456335,0.111821956961311,0.31403547360917,1.25919374744679,-6412.10683039801
+"100%",0.0225463280985628,0.00163422665574628,0.127914467590743,0.343310316445305,1.33948556805895,-6411.54613181036
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..cacd7ad
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..a82938d
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/dnacomp.txt
new file mode 100644
index 0000000..1a0dc1d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 2169 1053 1214 2000 6436
+* 3p + -69 2173 970 1259 2123 6525
+* 3p + -68 2245 1026 1327 2033 6631
+* 3p + -67 2179 1084 1341 2156 6760
+* 3p + -66 2233 1051 1336 2258 6878
+* 3p + -65 2306 1084 1339 2254 6983
+* 3p + -64 2224 1197 1321 2350 7092
+* 3p + -63 2487 1059 1276 2365 7187
+* 3p + -62 2415 1071 1455 2337 7278
+* 3p + -61 2506 1094 1437 2351 7388
+* 3p + -60 2548 1116 1404 2437 7505
+* 3p + -59 2583 1046 1508 2460 7597
+* 3p + -58 2513 1266 1473 2444 7696
+* 3p + -57 2476 1196 1518 2597 7787
+* 3p + -56 2633 1256 1515 2471 7875
+* 3p + -55 2597 1223 1650 2490 7960
+* 3p + -54 2654 1217 1524 2649 8044
+* 3p + -53 2573 1289 1722 2538 8122
+* 3p + -52 2616 1268 1718 2583 8185
+* 3p + -51 2671 1306 1614 2667 8258
+* 3p + -50 2643 1331 1738 2605 8317
+* 3p + -49 2657 1431 1672 2610 8370
+* 3p + -48 2758 1312 1609 2757 8436
+* 3p + -47 2769 1249 1727 2729 8474
+* 3p + -46 2699 1402 1752 2670 8523
+* 3p + -45 2848 1349 1657 2704 8558
+* 3p + -44 2872 1315 1793 2612 8592
+* 3p + -43 2860 1404 1761 2605 8630
+* 3p + -42 2818 1400 1661 2789 8668
+* 3p + -41 2801 1314 1845 2741 8701
+* 3p + -40 2763 1419 1718 2826 8726
+* 3p + -39 2881 1360 1683 2816 8740
+* 3p + -38 2951 1365 1703 2740 8759
+* 3p + -37 2732 1502 1737 2809 8780
+* 3p + -36 2877 1334 1798 2784 8793
+* 3p + -35 2950 1268 1788 2796 8802
+* 3p + -34 2933 1337 1735 2808 8813
+* 3p + -33 2945 1346 1726 2803 8820
+* 3p + -32 2823 1410 1791 2800 8824
+* 3p + -31 2839 1384 1826 2784 8833
+* 3p + -30 2962 1356 1723 2796 8837
+* 3p + -29 2941 1348 1768 2786 8843
+* 3p + -28 2835 1442 1881 2692 8850
+* 3p + -27 2965 1408 1668 2816 8857
+* 3p + -26 2801 1407 1815 2840 8863
+* 3p + -25 2849 1336 1779 2910 8874
+* 3p + -24 2849 1294 1727 3009 8879
+* 3p + -23 2986 1350 1771 2773 8880
+* 3p + -22 2803 1441 1823 2807 8874
+* 3p + -21 2888 1398 1675 2918 8879
+* 3p + -20 2963 1364 1756 2797 8880
+* 3p + -19 2999 1357 1741 2783 8880
+* 3p + -18 2901 1329 1750 2899 8879
+* 3p + -17 2958 1357 1819 2746 8880
+* 3p + -16 3034 1321 1715 2809 8879
+* 3p + -15 2981 1316 1725 2770 8792
+* 3p + -14 2918 1246 1892 2822 8878
+* 3p + -13 2907 1309 1792 2871 8879
+* 3p + -12 3022 1284 1755 2819 8880
+* 3p + -11 2837 1223 1889 2931 8880
+* 3p + -10 2950 1432 1654 2843 8879
+* 3p + -9 3209 1269 1704 2698 8880
+* 3p + -8 3230 1207 1701 2742 8880
+* 3p + -7 3007 1390 1637 2846 8880
+* 3p + -6 3216 1269 1565 2830 8880
+* 3p + -5 3103 1236 1599 2942 8880
+* 3p + -4 3127 1220 1605 2928 8880
+* 3p + -3 3144 1217 1566 2953 8880
+* 3p + -2 3091 1428 1517 2844 8880
+* 3p + -1 1585 1533 1881 3881 8880
+* 3p + 1 1654 1839 674 4713 8880
+* 3p + 2 2870 1385 1608 3017 8880
+* 3p + 3 3006 1327 1683 2864 8880
+* 3p + 4 3141 1190 1565 2984 8880
+* 3p + 5 3147 1143 1566 3024 8880
+* 3p + 6 3119 1211 1566 2983 8879
+* 3p + 7 3128 1244 1569 2938 8879
+* 3p + 8 3207 1221 1486 2965 8879
+* 3p + 9 3096 1195 1517 3071 8879
+* 3p + 10 3108 1184 1436 3151 8879
+* 3p - -70 2081 1186 911 2197 6375
+* 3p - -69 2111 1237 1000 2139 6487
+* 3p - -68 2317 1207 1023 2071 6618
+* 3p - -67 2195 1324 1091 2116 6726
+* 3p - -66 2264 1261 1061 2233 6819
+* 3p - -65 2318 1306 1053 2273 6950
+* 3p - -64 2339 1328 1089 2307 7063
+* 3p - -63 2366 1285 1150 2375 7176
+* 3p - -62 2345 1400 1151 2374 7270
+* 3p - -61 2333 1396 1123 2523 7375
+* 3p - -60 2482 1426 1077 2503 7488
+* 3p - -59 2505 1445 1100 2535 7585
+* 3p - -58 2448 1550 1252 2438 7688
+* 3p - -57 2486 1503 1183 2612 7784
+* 3p - -56 2500 1577 1200 2602 7879
+* 3p - -55 2467 1645 1224 2629 7965
+* 3p - -54 2595 1734 1166 2564 8059
+* 3p - -53 2659 1526 1268 2684 8137
+* 3p - -52 2666 1562 1365 2615 8208
+* 3p - -51 2699 1734 1281 2577 8291
+* 3p - -50 2791 1678 1255 2642 8366
+* 3p - -49 2622 1727 1342 2736 8427
+* 3p - -48 2729 1780 1190 2791 8490
+* 3p - -47 2828 1751 1317 2647 8543
+* 3p - -46 2740 1707 1356 2795 8598
+* 3p - -45 2875 1573 1311 2882 8641
+* 3p - -44 2801 1715 1352 2816 8684
+* 3p - -43 2799 1670 1433 2811 8713
+* 3p - -42 2850 1582 1318 2988 8738
+* 3p - -41 2829 1734 1324 2874 8761
+* 3p - -40 2706 1895 1303 2880 8784
+* 3p - -39 2802 1740 1328 2941 8811
+* 3p - -38 2878 1654 1423 2883 8838
+* 3p - -37 2798 1826 1393 2836 8853
+* 3p - -36 2840 1798 1285 2943 8866
+* 3p - -35 2857 1761 1348 2908 8874
+* 3p - -34 2843 1755 1342 2943 8883
+* 3p - -33 2807 1818 1290 2973 8888
+* 3p - -32 2774 1756 1349 3015 8894
+* 3p - -31 2908 1793 1443 2759 8903
+* 3p - -30 2967 1710 1410 2813 8900
+* 3p - -29 2989 1702 1448 2775 8914
+* 3p - -28 2799 1744 1489 2887 8919
+* 3p - -27 2904 1650 1349 3027 8930
+* 3p - -26 2898 1797 1410 2835 8940
+* 3p - -25 2816 1719 1512 2905 8952
+* 3p - -24 3010 1691 1339 2913 8953
+* 3p - -23 2863 1726 1442 2923 8954
+* 3p - -22 2806 1769 1498 2879 8952
+* 3p - -21 2910 1610 1437 2996 8953
+* 3p - -20 3038 1664 1428 2824 8954
+* 3p - -19 2949 1803 1398 2805 8955
+* 3p - -18 3014 1725 1336 2880 8955
+* 3p - -17 2949 1625 1418 2963 8955
+* 3p - -16 2934 1665 1384 2972 8955
+* 3p - -15 2900 1669 1394 2893 8856
+* 3p - -14 2969 1681 1475 2830 8955
+* 3p - -13 2895 1755 1471 2831 8952
+* 3p - -12 2937 1576 1351 3090 8954
+* 3p - -11 3078 1666 1308 2902 8954
+* 3p - -10 2961 1726 1405 2862 8954
+* 3p - -9 3191 1610 1391 2763 8955
+* 3p - -8 3219 1508 1455 2773 8955
+* 3p - -7 2980 1657 1269 3048 8954
+* 3p - -6 3108 1517 1295 3035 8955
+* 3p - -5 2992 1600 1368 2995 8955
+* 3p - -4 3011 1543 1405 2996 8955
+* 3p - -3 3108 1429 1308 3110 8955
+* 3p - -2 3252 1735 1027 2941 8955
+* 3p - -1 1715 1924 1406 3910 8955
+* 3p - 1 1715 2218 543 4479 8955
+* 3p - 2 2803 1720 1259 3173 8955
+* 3p - 3 3155 1484 1373 2943 8955
+* 3p - 4 3234 1444 1207 3070 8955
+* 3p - 5 3192 1424 1277 3062 8955
+* 3p - 6 3116 1458 1286 3095 8955
+* 3p - 7 3177 1343 1246 3189 8955
+* 3p - 8 3185 1471 1111 3188 8955
+* 3p - 9 3042 1509 1266 3138 8955
+* 3p - 10 3118 1483 1192 3162 8955
+* 5p + -10 3262 1183 1408 3027 8880
+* 5p + -9 3035 1174 1570 3101 8880
+* 5p + -8 3081 1111 1456 3232 8880
+* 5p + -7 3151 1091 1462 3176 8880
+* 5p + -6 3082 1252 1443 3103 8880
+* 5p + -5 3088 1215 1418 3159 8880
+* 5p + -4 3027 1222 1480 3151 8880
+* 5p + -3 2997 1350 1462 3071 8880
+* 5p + -2 3393 1142 1800 2545 8880
+* 5p + -1 5153 358 2414 955 8880
+* 5p + 1 3990 1461 1832 1597 8880
+* 5p + 2 2793 1032 1685 3370 8880
+* 5p + 3 3099 1360 1387 3034 8880
+* 5p + 4 2949 1454 1557 2917 8877
+* 5p + 5 3001 1293 1592 2993 8879
+* 5p + 6 3064 1256 1529 3031 8880
+* 5p + 7 2991 1294 1669 2926 8880
+* 5p + 8 2721 1430 1508 3221 8880
+* 5p + 9 2747 1369 1558 3206 8880
+* 5p + 10 2866 1368 1707 2939 8880
+* 5p + 11 2997 1377 1594 2912 8880
+* 5p + 12 3073 1290 1603 2914 8880
+* 5p + 13 2780 1460 1738 2902 8880
+* 5p + 14 2890 1461 1620 2909 8880
+* 5p + 15 2989 1318 1619 2948 8874
+* 5p + 16 3005 1295 1698 2882 8880
+* 5p + 17 2985 1411 1622 2862 8880
+* 5p + 18 2991 1377 1611 2901 8880
+* 5p + 19 2853 1426 1703 2898 8880
+* 5p + 20 2886 1452 1583 2959 8880
+* 5p + 21 3119 1464 1557 2740 8880
+* 5p + 22 3001 1410 1728 2741 8880
+* 5p + 23 3013 1384 1654 2829 8880
+* 5p + 24 2933 1315 1690 2942 8880
+* 5p + 25 2918 1423 1777 2762 8880
+* 5p + 26 2889 1380 1742 2852 8863
+* 5p + 27 2935 1257 1698 2967 8857
+* 5p + 28 2906 1455 1602 2887 8850
+* 5p + 29 2884 1484 1661 2814 8843
+* 5p + 30 2882 1428 1684 2847 8841
+* 5p + 31 2800 1445 1772 2816 8833
+* 5p + 32 2982 1320 1678 2844 8824
+* 5p + 33 2901 1385 1711 2823 8820
+* 5p + 34 2913 1354 1755 2791 8813
+* 5p + 35 2984 1310 1700 2808 8802
+* 5p + 36 3036 1313 1769 2675 8793
+* 5p + 37 2887 1387 1826 2680 8780
+* 5p + 38 2939 1367 1694 2759 8759
+* 5p + 39 2870 1426 1683 2761 8740
+* 5p + 40 2821 1276 1814 2815 8726
+* 5p + 41 2961 1339 1667 2734 8701
+* 5p + 42 2951 1322 1629 2766 8668
+* 5p + 43 2793 1410 1665 2762 8630
+* 5p + 44 2843 1330 1610 2809 8592
+* 5p + 45 2840 1426 1609 2683 8558
+* 5p + 46 2843 1330 1644 2706 8523
+* 5p + 47 2684 1325 1739 2726 8474
+* 5p + 48 2813 1219 1775 2629 8436
+* 5p + 49 2728 1347 1715 2580 8370
+* 5p + 50 2636 1328 1630 2723 8317
+* 5p + 51 2651 1308 1713 2586 8258
+* 5p + 52 2689 1303 1632 2561 8185
+* 5p + 53 2656 1222 1620 2624 8122
+* 5p + 54 2518 1188 1725 2613 8044
+* 5p + 55 2559 1289 1689 2423 7960
+* 5p + 56 2584 1188 1556 2547 7875
+* 5p + 57 2589 1129 1648 2421 7787
+* 5p + 58 2467 1211 1634 2384 7696
+* 5p + 59 2622 1090 1435 2450 7597
+* 5p + 60 2508 1082 1480 2435 7505
+* 5p + 61 2532 1153 1385 2318 7388
+* 5p + 62 2428 1134 1338 2378 7278
+* 5p + 63 2387 1066 1304 2431 7188
+* 5p + 64 2398 1128 1307 2259 7092
+* 5p + 65 2337 1049 1317 2280 6983
+* 5p + 66 2299 1063 1312 2204 6878
+* 5p + 67 2119 1082 1356 2203 6760
+* 5p + 68 2176 1036 1244 2175 6631
+* 5p + 69 2147 1062 1264 2038 6511
+* 5p + 70 2152 1043 1224 2017 6436
+* 5p - -10 3219 1318 1192 3226 8955
+* 5p - -9 3161 1496 1213 3085 8955
+* 5p - -8 3089 1497 1164 3205 8955
+* 5p - -7 2971 1534 1187 3263 8955
+* 5p - -6 2993 1584 1212 3166 8955
+* 5p - -5 3031 1561 1070 3293 8955
+* 5p - -4 2937 1594 1182 3242 8955
+* 5p - -3 2866 1660 1305 3124 8955
+* 5p - -2 3336 1499 1521 2599 8955
+* 5p - -1 5324 532 2084 1015 8955
+* 5p - 1 4006 1916 1441 1592 8955
+* 5p - 2 2800 1504 1477 3174 8955
+* 5p - 3 3062 1624 1182 3087 8955
+* 5p - 4 2937 1556 1213 3247 8953
+* 5p - 5 2960 1583 1288 3124 8955
+* 5p - 6 2916 1618 1257 3164 8955
+* 5p - 7 2981 1623 1360 2991 8955
+* 5p - 8 2744 1675 1159 3377 8955
+* 5p - 9 2750 1660 1255 3290 8955
+* 5p - 10 2896 1762 1330 2967 8955
+* 5p - 11 2997 1809 1241 2908 8955
+* 5p - 12 2909 1684 1352 3010 8955
+* 5p - 13 2946 1778 1317 2914 8955
+* 5p - 14 2978 1780 1230 2967 8955
+* 5p - 15 2903 1744 1271 3029 8947
+* 5p - 16 3036 1627 1324 2968 8955
+* 5p - 17 2772 1777 1440 2966 8955
+* 5p - 18 2934 1731 1345 2945 8955
+* 5p - 19 2846 1726 1403 2980 8955
+* 5p - 20 2886 1682 1364 3023 8955
+* 5p - 21 2964 1739 1336 2916 8955
+* 5p - 22 2814 1797 1400 2944 8955
+* 5p - 23 2865 1814 1344 2932 8955
+* 5p - 24 3030 1657 1271 2997 8955
+* 5p - 25 2952 1787 1345 2871 8955
+* 5p - 26 2873 1819 1388 2860 8940
+* 5p - 27 3013 1702 1386 2829 8930
+* 5p - 28 2818 1869 1348 2884 8919
+* 5p - 29 2884 1778 1355 2897 8914
+* 5p - 30 2829 1812 1344 2920 8905
+* 5p - 31 2821 1838 1393 2851 8903
+* 5p - 32 2879 1727 1404 2884 8894
+* 5p - 33 2910 1758 1363 2857 8888
+* 5p - 34 2868 1785 1381 2849 8883
+* 5p - 35 2861 1771 1409 2833 8874
+* 5p - 36 2876 1775 1318 2897 8866
+* 5p - 37 2923 1741 1509 2680 8853
+* 5p - 38 2816 1722 1387 2913 8838
+* 5p - 39 2794 1721 1426 2870 8811
+* 5p - 40 2750 1806 1384 2844 8784
+* 5p - 41 2793 1827 1260 2881 8761
+* 5p - 42 2800 1648 1484 2806 8738
+* 5p - 43 2728 1739 1417 2829 8713
+* 5p - 44 2747 1835 1286 2816 8684
+* 5p - 45 2769 1679 1312 2882 8642
+* 5p - 46 2715 1671 1404 2808 8598
+* 5p - 47 2787 1680 1293 2783 8543
+* 5p - 48 2782 1665 1291 2752 8490
+* 5p - 49 2632 1743 1386 2666 8427
+* 5p - 50 2707 1675 1297 2686 8365
+* 5p - 51 2671 1618 1355 2647 8291
+* 5p - 52 2635 1667 1273 2633 8208
+* 5p - 53 2571 1704 1272 2590 8137
+* 5p - 54 2695 1521 1237 2606 8059
+* 5p - 55 2465 1567 1242 2691 7965
+* 5p - 56 2485 1561 1209 2624 7879
+* 5p - 57 2572 1493 1211 2508 7784
+* 5p - 58 2556 1384 1247 2501 7688
+* 5p - 59 2555 1518 1085 2427 7585
+* 5p - 60 2488 1488 1105 2407 7488
+* 5p - 61 2402 1436 1174 2363 7375
+* 5p - 62 2436 1366 1109 2359 7270
+* 5p - 63 2419 1369 1081 2307 7176
+* 5p - 64 2326 1384 1152 2201 7063
+* 5p - 65 2258 1353 1046 2293 6950
+* 5p - 66 2205 1363 1058 2193 6819
+* 5p - 67 2167 1366 1064 2129 6726
+* 5p - 68 2069 1289 1032 2228 6618
+* 5p - 69 2173 1257 975 2075 6480
+* 5p - 70 2106 1184 978 2107 6375
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/dnacomp_genome.csv
new file mode 100644
index 0000000..94d15ba
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.388112441327,0.105690628131,0.117477981119,0.388718949422
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/lgdistribution.txt
new file mode 100644
index 0000000..17cdeeb
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/lgdistribution.txt
@@ -0,0 +1,325 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 24 2
++ 25 5
++ 26 2
++ 27 3
++ 28 5
++ 29 3
++ 30 5
++ 31 7
++ 32 5
++ 33 5
++ 34 13
++ 35 9
++ 36 12
++ 37 22
++ 38 17
++ 39 15
++ 40 23
++ 41 34
++ 42 38
++ 43 35
++ 44 34
++ 45 34
++ 46 48
++ 47 38
++ 48 64
++ 49 53
++ 50 58
++ 51 72
++ 52 63
++ 53 74
++ 54 85
++ 55 88
++ 56 90
++ 57 89
++ 58 99
++ 59 93
++ 60 115
++ 61 112
++ 62 91
++ 63 93
++ 64 110
++ 65 108
++ 66 112
++ 67 135
++ 68 96
++ 69 97
++ 70 128
++ 71 117
++ 72 111
++ 73 124
++ 74 102
++ 75 106
++ 76 102
++ 77 114
++ 78 109
++ 79 124
++ 80 89
++ 81 107
++ 82 121
++ 83 107
++ 84 105
++ 85 86
++ 86 95
++ 87 91
++ 88 89
++ 89 112
++ 90 130
++ 91 117
++ 92 146
++ 93 276
++ 94 1415
++ 95 62
++ 96 64
++ 97 45
++ 98 56
++ 99 65
++ 100 62
++ 101 42
++ 102 45
++ 103 44
++ 104 46
++ 105 36
++ 106 55
++ 107 40
++ 108 46
++ 109 37
++ 110 57
++ 111 38
++ 112 36
++ 113 42
++ 114 24
++ 115 44
++ 116 49
++ 117 41
++ 118 37
++ 119 30
++ 120 28
++ 121 41
++ 122 31
++ 123 39
++ 124 31
++ 125 28
++ 126 35
++ 127 34
++ 128 28
++ 129 30
++ 130 29
++ 131 28
++ 132 31
++ 133 25
++ 134 21
++ 135 28
++ 136 25
++ 137 24
++ 138 25
++ 139 21
++ 140 12
++ 141 26
++ 142 19
++ 143 16
++ 144 23
++ 145 16
++ 146 11
++ 147 21
++ 148 9
++ 149 18
++ 150 25
++ 151 9
++ 152 15
++ 153 10
++ 154 17
++ 155 12
++ 156 13
++ 157 14
++ 158 11
++ 159 9
++ 160 11
++ 161 6
++ 162 6
++ 163 7
++ 164 5
++ 165 7
++ 166 2
++ 167 1
++ 168 5
++ 169 12
++ 170 3
++ 171 5
++ 172 3
++ 173 6
++ 174 5
++ 175 5
++ 176 4
++ 177 4
++ 178 2
++ 179 1
++ 180 2
++ 182 1
++ 183 1
++ 184 5
+- 25 2
+- 26 3
+- 27 5
+- 28 3
+- 29 7
+- 30 2
+- 31 8
+- 32 6
+- 33 5
+- 34 9
+- 35 8
+- 36 11
+- 37 13
+- 38 25
+- 39 27
+- 40 21
+- 41 22
+- 42 25
+- 43 29
+- 44 44
+- 45 43
+- 46 54
+- 47 53
+- 48 64
+- 49 59
+- 50 73
+- 51 85
+- 52 68
+- 53 83
+- 54 93
+- 55 90
+- 56 93
+- 57 93
+- 58 101
+- 59 93
+- 60 113
+- 61 103
+- 62 103
+- 63 114
+- 64 110
+- 65 134
+- 66 89
+- 67 108
+- 68 120
+- 69 126
+- 70 113
+- 71 110
+- 72 92
+- 73 120
+- 74 113
+- 75 118
+- 76 108
+- 77 118
+- 78 117
+- 79 118
+- 80 118
+- 81 92
+- 82 94
+- 83 103
+- 84 101
+- 85 98
+- 86 88
+- 87 103
+- 88 131
+- 89 105
+- 90 102
+- 91 108
+- 92 133
+- 93 294
+- 94 1385
+- 95 54
+- 96 50
+- 97 49
+- 98 66
+- 99 51
+- 100 56
+- 101 39
+- 102 36
+- 103 36
+- 104 48
+- 105 38
+- 106 45
+- 107 43
+- 108 33
+- 109 39
+- 110 49
+- 111 49
+- 112 42
+- 113 48
+- 114 39
+- 115 32
+- 116 46
+- 117 44
+- 118 32
+- 119 44
+- 120 45
+- 121 42
+- 122 33
+- 123 33
+- 124 27
+- 125 26
+- 126 25
+- 127 32
+- 128 29
+- 129 27
+- 130 33
+- 131 19
+- 132 26
+- 133 18
+- 134 23
+- 135 26
+- 136 21
+- 137 18
+- 138 28
+- 139 11
+- 140 19
+- 141 30
+- 142 20
+- 143 14
+- 144 25
+- 145 14
+- 146 13
+- 147 13
+- 148 8
+- 149 14
+- 150 15
+- 151 17
+- 152 16
+- 153 17
+- 154 8
+- 155 20
+- 156 23
+- 157 7
+- 158 11
+- 159 7
+- 160 15
+- 161 12
+- 162 9
+- 163 11
+- 164 3
+- 165 9
+- 166 17
+- 167 2
+- 168 4
+- 169 3
+- 170 4
+- 171 10
+- 172 8
+- 173 5
+- 174 4
+- 175 7
+- 176 3
+- 177 5
+- 178 2
+- 179 4
+- 180 3
+- 181 1
+- 182 4
+- 183 3
+- 184 1
+- 185 1
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/misincorporation.txt
new file mode 100644
index 0000000..088f433
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_GGCTAC/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 1681 1507 1916 3776 8880 31 68 18 68 41 231 12 25 24 138 22 36 0 0 0 0 0 0 0 0 0
+* 3p + 2 2882 1465 1554 2979 8880 61 38 26 50 18 110 4 67 24 235 4 26 0 0 0 0 0 0 0 0 0
+* 3p + 3 2995 1264 1593 3026 8878 45 36 21 37 16 90 3 60 18 167 3 21 0 0 4 0 2 0 0 0 0
+* 3p + 4 2973 1215 1617 3070 8875 25 23 15 41 18 54 6 28 17 186 3 23 0 2 0 1 2 1 0 2 0
+* 3p + 5 3012 1248 1595 3020 8875 18 27 11 36 8 44 5 27 17 104 2 10 0 1 0 0 1 2 1 1 0
+* 3p + 6 3139 1263 1566 2895 8863 11 9 8 24 5 22 9 9 12 79 4 15 0 0 0 0 11 6 0 0 0
+* 3p + 7 2944 1375 1637 2901 8857 13 8 5 27 3 18 2 9 10 53 0 5 0 3 1 2 14 4 3 2 0
+* 3p + 8 3200 1193 1706 2756 8855 16 4 9 22 4 19 0 7 10 25 1 6 1 1 0 4 18 6 0 1 0
+* 3p + 9 3179 1253 1703 2728 8863 7 4 2 22 5 9 3 4 5 18 1 1 2 3 1 1 13 2 0 2 0
+* 3p + 10 2936 1415 1649 2868 8868 4 3 2 23 0 5 0 1 7 16 0 3 3 0 5 1 6 3 1 1 0
+* 3p + 11 2826 1209 1891 2943 8869 4 4 6 25 2 3 1 2 1 9 1 1 2 2 7 0 6 3 2 0 0
+* 3p + 12 3011 1264 1742 2860 8877 7 3 6 21 2 3 1 1 8 11 1 0 6 0 0 0 2 0 1 0 0
+* 3p + 13 2901 1287 1799 2888 8875 10 1 7 20 0 3 0 1 3 1 2 4 1 1 1 0 2 2 0 0 0
+* 3p + 14 2910 1231 1898 2838 8877 2 4 3 19 2 2 0 2 0 5 0 1 0 2 1 0 1 0 0 0 0
+* 3p + 15 2976 1310 1732 2770 8788 7 1 1 7 2 0 1 0 0 1 0 4 0 0 0 0 1 3 0 0 0
+* 3p + 16 3026 1300 1716 2834 8876 2 1 3 17 1 0 0 0 1 6 1 3 0 0 0 0 2 1 0 0 0
+* 3p + 17 2963 1341 1809 2765 8878 6 1 6 19 1 3 2 0 3 1 1 0 0 0 0 0 1 1 0 0 0
+* 3p + 18 2905 1301 1751 2922 8879 4 0 4 25 4 1 2 2 3 1 1 1 0 0 0 0 0 0 0 0 0
+* 3p + 19 3001 1351 1747 2780 8879 8 9 3 17 0 2 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0
+* 3p + 20 2952 1352 1763 2813 8880 4 2 0 13 1 0 1 0 1 1 2 2 1 0 0 0 0 0 0 0 0
+* 3p + 21 2889 1379 1672 2938 8878 7 3 11 22 4 1 0 2 1 3 0 0 1 0 0 0 0 1 0 0 0
+* 3p + 22 2802 1429 1816 2827 8874 5 0 4 12 2 1 1 0 2 3 1 1 0 1 0 0 0 0 0 0 0
+* 3p + 23 2981 1335 1771 2793 8880 4 1 5 12 3 1 1 2 1 4 0 0 2 1 0 0 0 0 0 0 0
+* 3p + 24 2862 1273 1728 3015 8878 2 5 4 24 0 2 0 0 0 0 0 3 1 0 1 0 0 1 0 0 0
+* 3p + 25 2851 1303 1779 2940 8873 4 2 5 36 3 1 0 2 2 2 1 1 0 0 0 0 0 1 0 0 0
+* 3p + 26 2791 1393 1822 2857 8863 5 0 3 12 3 1 0 0 1 0 0 5 0 2 0 0 1 0 0 0 0
+* 3p + 27 2969 1398 1659 2831 8857 4 1 6 12 2 3 1 2 3 2 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 28 2842 1429 1879 2700 8850 7 1 3 11 3 2 0 1 6 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 29 2930 1336 1774 2803 8843 5 5 4 15 2 1 0 1 3 1 0 1 1 1 0 0 0 0 0 0 0
+* 3p + 30 2964 1339 1721 2813 8837 3 2 2 22 1 3 1 3 1 4 0 3 0 0 0 0 0 0 0 0 0
+* 3p + 31 2840 1374 1829 2790 8833 5 1 1 13 3 1 0 5 3 2 1 2 0 0 0 0 0 0 0 0 0
+* 3p + 32 2811 1390 1793 2831 8825 1 0 3 14 1 2 0 0 1 4 0 3 0 0 0 0 0 0 0 0 0
+* 3p + 33 2946 1334 1733 2807 8820 8 1 1 13 3 2 1 2 0 0 0 1 0 1 0 0 0 0 0 0 0
+* 3p + 34 2946 1331 1725 2813 8815 1 1 4 12 1 0 0 1 2 1 0 1 2 0 0 0 0 0 0 0 0
+* 3p + 35 2949 1259 1783 2811 8802 0 6 4 17 1 1 1 5 2 2 0 3 0 1 0 0 0 0 0 0 0
+* 3p + 36 2874 1320 1797 2800 8791 2 6 3 18 1 1 0 0 3 0 1 2 0 0 0 0 2 0 0 0 0
+* 3p + 37 2722 1493 1741 2821 8777 10 0 2 10 0 1 1 1 2 1 0 1 0 1 0 0 3 0 0 0 0
+* 3p + 38 2947 1355 1704 2752 8758 6 3 8 16 1 1 0 0 1 3 1 1 0 0 0 0 0 1 0 0 0
+* 3p + 39 2888 1344 1683 2825 8740 4 1 7 17 3 2 0 4 1 4 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 40 2761 1402 1719 2844 8726 3 2 3 16 6 1 0 3 0 3 1 2 0 0 0 0 0 0 0 0 0
+* 3p + 41 2788 1311 1853 2749 8701 2 5 0 7 4 2 2 4 2 0 1 3 0 0 0 0 0 0 0 0 0
+* 3p + 42 2828 1386 1662 2792 8668 2 4 5 16 1 0 0 2 0 0 0 3 0 2 0 0 0 0 0 0 0
+* 3p + 43 2862 1391 1759 2618 8630 2 3 2 11 1 1 0 1 1 0 1 1 0 1 0 0 0 0 0 0 0
+* 3p + 44 2857 1313 1796 2626 8592 13 3 6 12 1 0 1 2 1 2 0 4 0 1 0 0 0 0 0 0 0
+* 3p + 45 2857 1343 1659 2699 8558 8 8 8 18 0 4 2 0 1 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 46 2692 1404 1750 2677 8523 0 4 4 7 3 2 0 11 3 1 0 3 0 0 0 0 0 0 0 0 0
+* 3p + 47 2758 1251 1733 2732 8474 10 2 5 8 0 0 0 7 1 1 1 3 2 0 0 0 0 0 0 0 0
+* 3p + 48 2756 1300 1611 2770 8437 0 3 5 15 0 0 0 1 1 0 0 3 0 0 0 0 0 0 0 0 0
+* 3p + 49 2664 1422 1660 2624 8370 0 2 1 15 3 1 0 10 4 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 50 2642 1319 1738 2618 8317 6 1 4 10 1 2 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 51 2665 1297 1624 2673 8259 4 2 1 11 3 0 1 0 1 2 1 4 0 1 0 0 0 0 0 0 0
+* 3p + 52 2619 1264 1723 2579 8185 4 2 4 12 1 1 1 1 0 1 1 4 0 0 0 0 0 0 0 0 0
+* 3p + 53 2567 1281 1718 2555 8121 1 4 0 11 0 1 0 2 1 1 0 2 1 2 0 0 1 0 0 0 0
+* 3p + 54 2657 1209 1530 2651 8047 2 2 4 7 1 2 0 0 1 1 1 3 0 0 0 0 1 0 0 0 0
+* 3p + 55 2601 1216 1654 2492 7963 1 3 3 10 1 0 0 0 0 0 0 3 0 1 0 0 0 0 0 0 0
+* 3p + 56 2629 1249 1512 2486 7876 3 0 5 9 0 1 1 0 1 1 1 1 0 0 0 0 1 0 0 0 0
+* 3p + 57 2483 1186 1519 2599 7787 1 1 4 7 2 5 0 0 0 1 2 0 0 2 0 0 0 0 0 0 0
+* 3p + 58 2524 1268 1460 2444 7696 2 4 6 6 0 0 2 3 2 1 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 59 2580 1041 1512 2464 7597 0 2 2 6 1 0 0 1 0 3 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 60 2546 1114 1403 2445 7508 1 1 4 3 0 1 0 2 0 1 1 1 1 0 0 0 0 0 0 0 0
+* 3p + 61 2507 1093 1437 2353 7390 1 5 5 6 0 0 0 0 1 5 0 4 0 0 0 0 0 0 0 0 0
+* 3p + 62 2426 1070 1450 2332 7278 0 10 4 7 0 1 0 1 2 2 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 63 2483 1060 1272 2372 7187 2 1 4 6 0 1 0 0 2 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 64 2230 1186 1317 2363 7096 0 2 4 9 0 0 0 0 2 0 2 3 0 0 0 0 0 0 0 0 0
+* 3p + 65 2306 1080 1341 2256 6983 2 2 2 11 0 0 0 0 1 2 0 2 0 1 0 0 0 0 0 0 0
+* 3p + 66 2231 1045 1334 2268 6878 1 1 3 5 0 2 0 0 2 0 0 4 0 0 0 0 0 0 0 0 0
+* 3p + 67 2181 1075 1347 2159 6762 1 1 3 7 0 2 0 0 0 1 1 2 0 0 0 0 0 0 0 0 0
+* 3p + 68 2244 1029 1324 2035 6632 1 3 2 3 0 0 0 1 0 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 69 2179 970 1256 2123 6528 2 0 5 0 1 0 1 0 2 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 70 2169 1051 1216 2000 6436 0 4 0 5 1 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 1 1866 1863 1443 3783 8955 34 47 30 77 51 265 8 22 22 139 10 53 0 0 0 0 0 0 0 0 1
+* 3p - 2 3078 1730 1041 3103 8952 47 20 18 53 27 124 12 49 37 245 6 28 0 0 0 1 1 1 0 1 0
+* 3p - 3 2994 1421 1313 3219 8947 38 13 28 58 13 97 13 39 21 172 1 32 0 4 0 0 1 4 1 2 0
+* 3p - 4 2865 1534 1397 3141 8937 21 14 21 33 12 53 4 21 25 183 2 17 0 2 0 2 8 8 0 2 0
+* 3p - 5 2915 1601 1364 3061 8941 18 15 20 24 9 41 3 20 13 96 2 14 0 5 2 0 7 1 4 2 0
+* 3p - 6 3029 1527 1290 3087 8933 12 12 12 23 2 25 6 21 15 72 1 14 1 3 2 2 12 6 3 1 0
+* 3p - 7 2913 1652 1258 3105 8928 12 10 7 15 4 13 3 6 9 54 1 4 1 11 2 0 14 3 6 3 0
+* 3p - 8 3189 1507 1450 2782 8928 7 13 9 18 3 11 3 6 3 26 1 3 1 5 3 0 12 8 3 4 0
+* 3p - 9 3161 1583 1403 2791 8938 12 5 4 22 9 3 0 3 2 25 3 4 0 5 0 1 6 6 1 4 0
+* 3p - 10 2960 1702 1397 2886 8945 5 1 17 19 5 1 1 3 3 8 2 7 1 6 0 5 5 3 1 0 0
+* 3p - 11 3060 1644 1301 2941 8946 5 1 5 13 4 2 2 0 4 16 1 2 1 2 0 4 2 0 2 4 0
+* 3p - 12 2931 1562 1356 3100 8949 9 3 8 13 5 4 1 0 1 10 3 2 0 3 0 0 3 0 2 0 0
+* 3p - 13 2902 1736 1472 2836 8946 5 0 8 15 4 3 0 2 0 2 2 1 3 2 0 0 3 1 0 2 0
+* 3p - 14 2973 1658 1480 2841 8952 6 5 9 23 7 3 0 2 3 1 2 5 0 1 0 1 1 2 0 0 0
+* 3p - 15 2904 1642 1402 2909 8857 7 1 5 19 7 1 0 0 3 3 1 9 1 1 0 0 0 0 0 0 0
+* 3p - 16 2951 1651 1377 2975 8954 4 5 16 17 3 1 1 1 1 1 1 5 0 2 0 0 0 0 0 0 0
+* 3p - 17 2964 1612 1418 2961 8955 2 12 9 26 4 3 0 3 1 1 0 12 0 1 1 0 0 0 0 0 0
+* 3p - 18 2998 1708 1337 2911 8954 1 4 6 16 0 0 0 1 0 5 0 3 1 0 0 1 1 0 0 0 0
+* 3p - 19 2953 1798 1411 2791 8953 4 9 5 12 7 1 0 1 3 0 0 13 0 1 0 0 1 1 0 0 0
+* 3p - 20 3050 1643 1431 2829 8953 2 2 4 19 2 2 0 0 2 0 0 14 1 0 0 1 0 1 0 0 0
+* 3p - 21 2924 1595 1428 3002 8949 6 12 9 23 2 5 3 1 1 1 1 0 0 1 0 0 0 4 0 0 0
+* 3p - 22 2819 1765 1486 2879 8949 2 16 13 20 2 2 0 3 7 1 0 3 0 0 0 0 1 2 0 0 0
+* 3p - 23 2852 1718 1450 2934 8954 6 2 1 14 3 3 2 1 0 3 0 1 0 1 0 0 0 0 0 0 0
+* 3p - 24 2993 1688 1336 2936 8953 3 4 1 8 1 1 0 1 0 4 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 25 2829 1716 1506 2899 8950 6 3 10 9 2 1 1 2 4 3 1 4 0 0 0 0 0 2 0 0 0
+* 3p - 26 2893 1785 1423 2838 8939 6 2 3 11 2 1 1 0 1 3 0 3 0 0 0 0 0 3 0 0 0
+* 3p - 27 2915 1637 1351 3027 8930 2 3 6 14 0 2 0 0 1 2 1 5 0 0 0 0 0 0 0 0 0
+* 3p - 28 2801 1724 1475 2919 8919 5 0 5 18 1 0 0 0 2 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 29 2988 1692 1441 2793 8914 2 7 6 17 1 2 1 0 10 0 1 3 0 0 0 0 0 0 0 0 0
+* 3p - 30 2967 1697 1411 2826 8901 4 0 3 12 0 0 0 0 2 5 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 31 2907 1786 1446 2762 8901 6 4 4 7 2 2 0 0 1 6 0 2 0 1 0 0 0 2 0 0 0
+* 3p - 32 2784 1738 1347 3023 8892 1 1 9 14 2 1 1 0 1 3 2 1 0 0 0 0 0 2 0 0 0
+* 3p - 33 2806 1808 1287 2987 8888 1 0 3 12 3 3 1 0 1 2 3 1 0 0 0 0 0 0 0 0 0
+* 3p - 34 2853 1743 1339 2948 8883 1 1 6 8 2 1 0 1 3 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 35 2857 1742 1342 2933 8874 1 2 6 19 2 0 0 1 0 1 0 0 1 2 0 0 0 0 0 0 0
+* 3p - 36 2849 1785 1285 2947 8866 0 2 3 20 2 2 1 0 0 3 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 37 2805 1811 1389 2848 8853 5 0 10 8 1 0 0 1 1 0 0 3 0 3 0 0 0 0 0 0 0
+* 3p - 38 2878 1644 1411 2905 8838 4 3 10 11 1 0 0 0 0 3 2 0 0 0 0 0 0 0 0 0 0
+* 3p - 39 2814 1720 1320 2958 8812 1 0 7 15 2 2 0 0 4 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 40 2701 1887 1307 2889 8784 4 1 5 9 3 2 1 0 1 3 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 41 2844 1725 1318 2874 8761 3 1 7 9 2 2 0 0 0 0 1 2 1 0 0 0 0 0 0 0 0
+* 3p - 42 2847 1577 1313 3001 8738 4 2 7 10 0 3 2 0 0 4 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 43 2815 1651 1421 2826 8713 0 3 5 16 2 3 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 44 2797 1710 1355 2822 8684 3 2 5 9 2 3 0 0 1 1 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 45 2877 1565 1315 2884 8641 3 1 3 13 2 6 1 1 0 2 1 3 0 0 0 0 0 0 0 0 0
+* 3p - 46 2759 1694 1347 2798 8598 0 1 5 8 0 5 0 0 1 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 47 2821 1745 1325 2652 8543 6 1 1 5 0 1 0 1 2 1 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 48 2723 1767 1189 2811 8490 2 1 5 16 0 0 0 0 1 3 1 1 0 0 0 0 0 0 0 0 0
+* 3p - 49 2640 1707 1342 2738 8427 2 0 5 14 1 0 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 50 2779 1682 1258 2649 8368 3 3 1 6 1 1 0 0 0 1 1 2 1 0 0 0 0 0 0 0 0
+* 3p - 51 2699 1722 1280 2592 8293 3 6 0 16 1 0 0 0 2 0 0 1 1 0 0 0 0 0 0 0 0
+* 3p - 52 2676 1554 1356 2623 8209 0 0 7 9 1 1 0 2 1 0 1 2 0 0 0 0 0 0 0 0 0
+* 3p - 53 2659 1522 1268 2691 8140 1 4 5 7 2 1 0 1 1 3 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 54 2597 1718 1165 2580 8060 2 1 4 10 1 2 0 0 1 3 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 55 2466 1639 1227 2633 7965 4 3 6 8 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 56 2505 1572 1195 2607 7879 4 2 9 9 1 1 0 0 0 0 0 2 1 0 0 1 0 0 0 0 0
+* 3p - 57 2498 1499 1179 2608 7784 1 2 6 7 3 0 0 0 1 1 0 0 0 0 0 1 0 0 0 0 0
+* 3p - 58 2455 1537 1254 2442 7688 2 1 1 9 1 1 0 0 0 0 0 2 0 0 0 0 0 1 0 0 0
+* 3p - 59 2501 1438 1102 2543 7584 2 1 1 3 2 1 0 1 0 2 0 0 0 1 0 0 0 1 0 0 0
+* 3p - 60 2490 1426 1079 2496 7491 2 3 3 4 1 3 0 1 0 0 0 1 0 0 0 0 1 0 0 0 0
+* 3p - 61 2333 1399 1119 2528 7379 0 4 1 4 1 0 1 0 2 0 1 3 0 0 0 0 0 0 0 0 0
+* 3p - 62 2344 1394 1148 2389 7275 1 1 2 8 1 0 1 2 0 0 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 63 2370 1280 1150 2377 7177 0 0 1 5 0 0 0 0 0 0 0 3 0 0 0 0 0 0 0 0 0
+* 3p - 64 2349 1332 1083 2299 7063 0 4 6 3 2 0 2 0 0 0 1 1 0 0 0 0 0 0 0 0 0
+* 3p - 65 2315 1298 1061 2276 6950 3 0 2 3 1 2 0 1 0 3 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 66 2268 1261 1055 2235 6819 1 1 1 6 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 67 2200 1317 1083 2127 6727 0 2 6 3 0 1 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 68 2315 1211 1017 2078 6621 0 3 5 3 0 0 0 0 3 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 69 2119 1233 1004 2132 6488 3 3 4 5 0 2 1 0 3 4 3 1 0 0 0 0 0 0 0 0 0
+* 3p - 70 2087 1183 907 2199 6376 2 3 3 4 1 0 1 0 1 2 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 1 4003 1453 1823 1601 8880 1 2 12 8 2 3 1 0 1 3 1 3 0 0 0 0 0 0 0 0 1
+* 5p + 2 2805 1032 1668 3374 8879 3 7 14 11 0 5 1 3 5 1 0 0 0 0 0 0 0 1 0 0 0
+* 5p + 3 3115 1351 1368 3046 8880 1 3 21 13 0 2 0 2 3 4 1 3 0 0 0 0 0 0 0 0 0
+* 5p + 4 2945 1450 1555 2927 8877 2 1 3 9 0 2 2 2 2 5 0 3 1 1 0 0 0 0 0 0 0
+* 5p + 5 3004 1286 1593 2994 8877 3 5 8 13 0 0 0 1 2 0 0 8 0 0 0 0 1 1 0 0 0
+* 5p + 6 3068 1253 1528 3030 8879 3 3 6 4 3 2 0 1 2 3 0 4 0 0 0 0 0 1 0 0 0
+* 5p + 7 2998 1294 1662 2925 8879 0 4 7 4 1 3 1 1 1 3 1 1 1 0 0 0 1 0 0 0 0
+* 5p + 8 2719 1424 1506 3231 8880 1 4 2 11 1 2 0 2 3 4 0 2 0 2 0 0 0 0 0 0 0
+* 5p + 9 2741 1365 1560 3213 8879 1 6 0 9 0 1 0 1 1 4 0 2 1 2 0 0 1 0 0 0 0
+* 5p + 10 2871 1359 1703 2947 8880 1 2 5 9 2 0 0 0 1 1 0 1 1 0 0 0 0 0 0 0 0
+* 5p + 11 3001 1362 1594 2923 8880 2 0 4 10 5 1 0 1 4 3 1 6 0 0 0 0 0 0 0 0 0
+* 5p + 12 3075 1285 1599 2921 8880 0 3 6 13 0 1 1 4 1 1 0 4 0 0 0 0 0 0 0 0 0
+* 5p + 13 2780 1448 1735 2917 8880 3 1 1 15 3 1 2 1 2 1 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 14 2885 1449 1627 2919 8880 6 2 1 13 0 1 1 0 1 1 1 0 0 0 0 0 0 0 0 0 0
+* 5p + 15 2979 1304 1616 2972 8871 3 0 2 18 1 0 0 5 2 2 0 0 0 0 0 0 1 1 1 0 0
+* 5p + 16 3001 1288 1702 2888 8879 3 7 2 15 1 0 0 4 0 0 0 1 0 1 0 0 0 1 0 0 0
+* 5p + 17 2971 1406 1627 2876 8880 5 1 1 15 0 0 0 10 1 0 1 1 0 0 0 0 0 0 0 0 0
+* 5p + 18 2990 1373 1609 2908 8880 4 4 2 9 0 3 0 0 4 5 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 19 2831 1422 1707 2920 8880 10 0 4 14 1 1 0 10 2 4 0 0 0 2 0 0 0 0 0 0 0
+* 5p + 20 2878 1448 1583 2971 8880 3 3 3 15 2 1 0 12 3 3 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 21 3107 1455 1566 2752 8880 10 5 3 14 1 0 0 2 0 1 1 1 2 0 0 0 0 0 0 0 0
+* 5p + 22 2998 1399 1738 2744 8879 12 1 2 9 5 1 0 1 1 1 0 3 0 0 0 0 0 1 0 0 0
+* 5p + 23 3015 1376 1654 2835 8880 4 2 5 11 2 1 2 3 2 0 0 4 0 0 0 0 0 0 0 0 0
+* 5p + 24 2933 1310 1694 2943 8880 3 1 3 7 2 3 0 3 0 1 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 25 2915 1425 1769 2770 8879 4 5 7 8 1 4 1 4 3 4 0 1 0 0 0 0 1 0 0 0 0
+* 5p + 26 2890 1376 1743 2852 8861 3 5 6 8 1 2 1 3 1 0 1 3 0 2 0 0 3 0 0 0 0
+* 5p + 27 2928 1255 1693 2979 8855 7 1 8 10 1 0 1 3 0 2 1 0 0 0 0 1 2 0 0 0 0
+* 5p + 28 2913 1443 1602 2892 8850 3 3 4 15 1 5 0 2 2 0 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 29 2890 1471 1661 2821 8843 4 1 5 10 4 1 1 0 5 0 1 2 0 0 0 0 0 0 0 0 0
+* 5p + 30 2885 1423 1679 2854 8841 2 5 5 7 4 0 1 1 4 1 1 2 1 0 0 0 0 0 0 0 0
+* 5p + 31 2803 1434 1775 2819 8831 3 3 5 15 0 2 0 0 0 2 0 6 1 0 0 0 0 2 0 0 0
+* 5p + 32 2989 1312 1670 2852 8823 0 4 1 9 1 2 0 0 4 1 0 0 0 0 0 0 2 0 0 0 0
+* 5p + 33 2899 1375 1707 2837 8818 0 2 4 12 0 3 0 1 1 1 1 1 0 0 0 0 2 0 0 0 0
+* 5p + 34 2921 1344 1747 2802 8814 2 2 6 11 0 2 0 0 5 3 0 1 0 0 0 0 0 1 0 0 0
+* 5p + 35 2982 1293 1703 2824 8802 2 2 2 16 3 1 0 1 3 0 1 1 0 0 0 0 0 0 0 0 0
+* 5p + 36 3042 1307 1766 2678 8793 4 4 6 10 0 4 0 0 2 1 0 2 1 0 0 0 0 0 0 0 0
+* 5p + 37 2885 1378 1821 2694 8778 1 4 4 12 0 1 0 0 5 3 2 2 0 1 0 0 0 0 0 2 0
+* 5p + 38 2942 1344 1692 2780 8758 4 1 10 18 0 6 0 0 0 8 3 1 0 1 0 0 1 0 0 0 0
+* 5p + 39 2879 1414 1675 2770 8738 1 1 7 10 2 3 0 0 4 3 1 1 1 0 0 0 0 2 0 0 0
+* 5p + 40 2827 1276 1805 2818 8726 0 3 7 6 0 2 0 1 4 4 0 3 0 0 0 0 0 0 0 0 0
+* 5p + 41 2963 1332 1662 2744 8701 4 5 6 10 2 2 1 1 3 6 0 3 0 1 0 0 0 0 0 0 0
+* 5p + 42 2946 1313 1629 2780 8668 0 6 2 16 1 1 2 1 1 5 1 1 0 0 0 0 0 0 0 0 0
+* 5p + 43 2793 1406 1669 2762 8630 0 2 3 10 0 3 0 4 0 6 0 8 0 0 0 0 0 0 0 0 0
+* 5p + 44 2839 1320 1599 2834 8592 3 1 8 15 3 3 2 3 3 9 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 45 2840 1416 1605 2697 8558 1 2 4 12 1 0 1 0 4 5 0 2 0 1 0 0 0 0 0 0 0
+* 5p + 46 2826 1307 1648 2740 8521 5 1 1 27 1 2 1 1 2 10 0 4 0 3 0 0 0 2 0 0 0
+* 5p + 47 2672 1316 1743 2743 8474 4 1 2 14 1 3 0 5 1 10 0 1 0 0 0 1 0 0 0 0 0
+* 5p + 48 2825 1202 1772 2636 8435 1 4 5 12 4 8 0 0 1 9 0 4 1 0 0 0 2 0 0 0 0
+* 5p + 49 2719 1350 1713 2588 8370 2 4 4 9 0 0 1 3 3 6 0 4 0 1 0 0 0 0 0 0 0
+* 5p + 50 2635 1318 1624 2740 8317 2 2 8 14 3 9 1 3 1 11 0 1 2 0 0 0 0 0 0 0 0
+* 5p + 51 2644 1293 1721 2599 8257 6 2 2 12 1 3 0 0 0 11 1 4 0 1 2 0 1 1 0 0 0
+* 5p + 52 2673 1295 1636 2581 8185 4 1 2 14 2 4 0 3 0 10 1 2 0 0 0 0 0 0 0 0 0
+* 5p + 53 2647 1206 1618 2649 8120 4 2 1 19 1 6 0 3 4 14 0 1 0 0 0 0 2 0 0 0 0
+* 5p + 54 2521 1171 1732 2624 8048 4 5 5 21 3 11 0 3 1 10 1 6 0 1 0 0 0 0 0 0 0
+* 5p + 55 2551 1276 1696 2436 7959 3 2 5 12 2 5 0 5 2 11 3 4 0 1 0 0 2 2 0 0 0
+* 5p + 56 2569 1176 1561 2568 7874 7 6 4 17 6 5 1 3 3 17 0 5 0 0 1 0 2 1 0 0 0
+* 5p + 57 2583 1128 1650 2425 7786 6 10 3 11 4 5 2 7 2 9 2 3 2 1 0 0 1 0 0 0 0
+* 5p + 58 2459 1210 1641 2384 7694 0 6 1 9 1 7 0 2 2 12 2 5 0 1 0 0 1 1 0 0 0
+* 5p + 59 2608 1093 1432 2462 7595 1 7 2 10 1 11 1 6 4 18 3 1 0 0 0 0 1 1 0 0 0
+* 5p + 60 2506 1064 1480 2456 7506 8 0 3 18 1 16 0 4 6 13 1 3 0 0 0 0 1 1 0 0 0
+* 5p + 61 2509 1154 1389 2335 7387 8 4 5 12 0 4 4 5 5 15 3 7 0 0 0 0 3 0 0 0 0
+* 5p + 62 2390 1125 1349 2412 7276 9 6 1 16 4 8 2 9 0 29 1 4 2 0 0 2 1 0 1 0 0
+* 5p + 63 2376 1057 1306 2448 7187 2 5 1 10 4 5 2 2 6 20 2 5 0 3 0 0 1 0 0 0 0
+* 5p + 64 2393 1115 1311 2273 7092 6 4 7 14 9 12 0 6 1 18 2 1 0 0 0 0 4 0 0 0 0
+* 5p + 65 2318 1039 1313 2308 6978 9 5 6 18 1 13 4 0 5 27 0 7 1 0 0 0 1 4 0 0 0
+* 5p + 66 2274 1058 1322 2220 6874 14 4 1 12 4 18 4 2 7 28 0 4 2 0 0 0 4 0 0 0 0
+* 5p + 67 2103 1075 1361 2219 6758 8 5 2 15 6 16 3 3 1 23 0 4 2 1 1 0 2 1 0 1 0
+* 5p + 68 2169 1024 1246 2190 6629 5 4 3 19 2 8 3 4 3 14 3 5 0 0 1 0 1 2 0 0 0
+* 5p + 69 2142 1052 1272 2046 6512 7 8 2 14 2 15 1 3 5 14 3 3 0 0 0 0 0 0 2 0 0
+* 5p + 70 2138 1031 1223 2041 6433 9 4 3 18 6 8 3 2 4 18 0 2 0 2 0 0 2 0 0 1 0
+* 5p - 1 4008 1921 1438 1588 8955 6 9 13 1 1 2 0 0 1 8 2 3 0 0 0 0 0 0 0 0 0
+* 5p - 2 2807 1493 1470 3185 8955 4 2 12 10 2 5 1 0 0 8 2 0 0 0 0 0 0 0 0 0 0
+* 5p - 3 3073 1616 1181 3085 8955 7 8 8 11 6 9 1 0 2 5 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 4 2941 1545 1207 3259 8952 4 4 8 14 3 4 1 1 4 5 0 3 0 0 0 0 1 0 0 0 0
+* 5p - 5 2964 1583 1280 3127 8954 1 6 6 8 0 3 2 0 4 4 0 2 0 0 0 1 0 1 0 0 0
+* 5p - 6 2924 1608 1253 3170 8955 1 2 6 9 2 2 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0
+* 5p - 7 2996 1621 1345 2993 8955 0 3 16 4 2 0 0 0 0 2 0 1 0 0 1 0 0 0 0 0 0
+* 5p - 8 2751 1669 1153 3380 8953 0 1 8 8 1 4 0 3 1 2 0 3 0 0 0 0 2 0 0 0 0
+* 5p - 9 2762 1657 1249 3287 8955 0 3 8 5 1 3 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 10 2904 1764 1323 2964 8955 4 5 12 2 2 4 0 0 0 5 0 2 0 1 0 0 0 0 0 0 0
+* 5p - 11 3003 1805 1237 2910 8955 5 1 8 4 0 2 1 0 2 1 1 0 0 0 0 1 0 0 0 0 0
+* 5p - 12 2917 1669 1347 3022 8955 1 2 8 13 1 1 0 1 1 1 1 0 2 0 0 1 0 0 0 0 0
+* 5p - 13 2952 1782 1313 2907 8954 1 8 7 9 0 3 1 1 0 2 0 3 0 0 0 0 0 1 0 0 0
+* 5p - 14 2979 1778 1223 2974 8954 0 8 6 11 0 0 0 0 4 4 0 5 1 0 0 0 0 0 0 1 0
+* 5p - 15 2909 1744 1261 3029 8943 0 4 5 5 1 0 1 1 2 0 0 1 0 0 0 0 1 2 0 1 0
+* 5p - 16 3037 1624 1324 2969 8954 0 1 4 3 0 2 0 0 0 3 0 1 0 0 0 0 0 0 1 0 0
+* 5p - 17 2777 1769 1439 2970 8955 4 3 4 7 1 2 0 0 1 0 0 1 1 0 0 0 0 0 0 0 0
+* 5p - 18 2942 1729 1340 2943 8954 0 4 6 8 1 2 1 1 2 1 0 3 0 0 0 0 0 1 0 0 0
+* 5p - 19 2845 1723 1402 2984 8954 4 6 6 9 2 1 0 1 1 1 0 2 0 1 0 0 0 1 0 0 0
+* 5p - 20 2883 1680 1365 3027 8955 4 6 3 9 0 1 1 1 1 2 0 2 0 0 0 0 0 0 0 0 0
+* 5p - 21 2969 1730 1331 2925 8955 0 4 2 14 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 0 0
+* 5p - 22 2819 1789 1394 2953 8955 0 2 4 11 0 2 0 1 2 0 0 1 1 0 0 0 0 0 0 0 0
+* 5p - 23 2865 1813 1343 2933 8954 1 5 6 5 0 1 0 0 0 2 1 2 0 0 0 0 0 0 1 0 0
+* 5p - 24 3044 1654 1260 2997 8955 5 4 15 6 1 1 0 0 3 1 0 4 0 0 0 0 0 0 0 0 0
+* 5p - 25 2964 1787 1338 2865 8954 3 7 15 4 2 1 1 1 2 2 1 4 0 0 0 0 1 0 0 0 0
+* 5p - 26 2879 1820 1381 2861 8941 1 7 7 9 2 1 2 0 2 3 0 4 0 0 0 0 0 1 0 0 0
+* 5p - 27 3021 1697 1379 2831 8928 2 0 10 5 0 0 0 0 2 1 0 3 1 0 0 0 0 2 0 0 0
+* 5p - 28 2822 1867 1352 2878 8919 2 6 3 7 3 2 1 2 1 1 0 6 0 0 0 0 0 0 0 0 0
+* 5p - 29 2890 1762 1349 2913 8914 1 3 7 14 3 0 0 0 3 0 1 1 0 0 0 0 0 0 0 0 0
+* 5p - 30 2834 1794 1342 2936 8906 3 2 4 15 2 3 2 0 3 3 1 2 0 0 0 1 0 0 0 0 0
+* 5p - 31 2829 1826 1392 2856 8903 3 4 6 13 5 1 0 0 1 0 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 32 2884 1728 1398 2884 8894 1 1 7 2 1 0 2 0 2 2 2 4 0 0 0 0 0 0 0 0 0
+* 5p - 33 2915 1749 1366 2858 8888 1 5 4 12 1 2 2 0 0 1 2 4 0 0 0 0 0 0 0 0 0
+* 5p - 34 2874 1774 1376 2859 8883 0 3 4 14 0 1 0 1 3 0 1 2 0 0 0 0 0 0 0 0 0
+* 5p - 35 2864 1764 1411 2835 8874 7 2 9 9 3 0 0 2 2 0 0 6 0 0 0 0 0 0 0 0 0
+* 5p - 36 2867 1769 1321 2909 8866 6 3 3 9 0 0 1 0 1 3 1 0 0 0 0 0 0 0 0 0 0
+* 5p - 37 2929 1736 1502 2686 8853 0 8 7 11 0 0 2 0 1 3 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 38 2823 1716 1384 2915 8838 1 7 7 13 2 0 1 0 1 1 1 4 0 0 0 0 0 0 0 0 0
+* 5p - 39 2796 1714 1426 2876 8812 4 6 9 12 3 1 0 0 1 4 0 7 0 2 0 0 0 0 0 0 0
+* 5p - 40 2752 1798 1384 2850 8784 5 7 9 15 1 0 1 0 3 4 0 8 1 0 0 0 0 0 0 0 0
+* 5p - 41 2785 1817 1267 2890 8759 7 1 4 12 1 0 0 1 1 2 0 4 0 1 0 0 0 2 0 0 0
+* 5p - 42 2806 1633 1482 2814 8735 7 3 8 13 2 1 1 1 4 3 1 5 0 0 0 0 0 2 1 0 0
+* 5p - 43 2720 1733 1420 2839 8712 1 2 1 10 0 2 0 0 2 6 1 3 0 0 0 1 1 0 0 0 0
+* 5p - 44 2743 1824 1282 2834 8683 6 9 8 20 2 3 2 3 1 7 1 4 1 0 0 0 1 0 0 0 0
+* 5p - 45 2767 1680 1311 2884 8642 4 11 6 10 1 4 0 2 4 8 2 3 0 0 0 0 0 0 0 0 0
+* 5p - 46 2707 1665 1418 2806 8596 6 6 4 8 1 2 0 0 1 7 0 8 0 2 0 0 1 1 0 0 0
+* 5p - 47 2785 1670 1302 2783 8540 2 2 1 11 2 5 0 2 3 6 5 9 0 1 0 0 0 3 0 0 0
+* 5p - 48 2780 1653 1287 2770 8490 3 2 5 11 6 5 1 2 2 7 0 1 0 2 0 0 0 0 0 0 0
+* 5p - 49 2626 1742 1386 2672 8426 3 7 1 6 2 5 1 1 6 6 1 3 0 3 0 0 1 0 0 0 0
+* 5p - 50 2713 1654 1290 2706 8363 2 2 8 20 0 2 0 0 3 6 0 3 1 0 0 0 3 1 0 0 0
+* 5p - 51 2663 1613 1359 2656 8291 9 1 2 10 4 6 2 2 5 11 0 4 1 0 0 0 0 2 0 0 0
+* 5p - 52 2642 1660 1262 2643 8207 2 11 6 16 2 9 1 3 10 10 1 2 0 0 0 0 0 1 0 1 0
+* 5p - 53 2563 1696 1269 2608 8136 4 7 8 12 0 6 0 3 4 16 1 5 1 1 0 0 2 2 0 0 0
+* 5p - 54 2686 1507 1244 2619 8056 7 1 1 15 3 1 0 0 3 5 1 3 0 1 0 0 0 4 0 0 0
+* 5p - 55 2450 1559 1236 2720 7965 5 2 2 13 4 7 2 2 6 21 0 0 0 0 0 0 0 0 0 0 0
+* 5p - 56 2475 1551 1213 2640 7879 1 9 2 12 2 5 1 2 1 16 3 3 1 1 0 0 0 0 0 0 0
+* 5p - 57 2560 1485 1216 2520 7781 7 3 5 11 0 12 0 0 3 17 0 4 0 0 0 0 2 1 0 0 0
+* 5p - 58 2537 1376 1248 2525 7686 7 4 8 15 3 9 2 2 0 22 1 7 0 7 0 0 1 1 1 0 0
+* 5p - 59 2544 1499 1086 2447 7576 3 2 4 21 1 10 1 5 5 23 2 4 0 0 0 0 2 5 2 0 0
+* 5p - 60 2463 1474 1116 2433 7486 8 2 4 18 3 10 0 3 4 27 0 10 0 0 0 1 3 3 0 0 0
+* 5p - 61 2398 1427 1178 2376 7379 6 1 5 10 2 21 1 2 7 23 0 11 0 1 0 0 0 0 0 0 0
+* 5p - 62 2410 1366 1109 2388 7273 9 4 4 8 4 15 1 7 10 31 1 2 0 2 0 0 1 1 0 0 0
+* 5p - 63 2395 1366 1077 2332 7170 4 10 4 16 6 12 1 10 4 33 0 5 0 0 0 0 6 0 1 0 0
+* 5p - 64 2313 1376 1152 2216 7057 6 4 0 12 3 22 0 6 8 23 1 2 0 0 0 0 2 2 0 2 0
+* 5p - 65 2233 1337 1060 2316 6946 10 0 1 11 3 14 0 0 5 26 2 6 0 1 0 0 1 2 0 1 0
+* 5p - 66 2203 1349 1055 2211 6818 7 3 5 15 6 17 0 5 5 21 0 2 0 2 0 0 1 0 0 0 0
+* 5p - 67 2154 1359 1069 2141 6723 11 6 6 16 6 14 1 4 6 20 0 6 0 0 0 1 3 0 0 1 0
+* 5p - 68 2063 1276 1035 2243 6617 4 0 8 9 7 16 1 3 1 23 0 6 1 0 0 3 3 1 0 0 0
+* 5p - 69 2159 1247 977 2092 6475 4 6 3 14 8 6 0 7 1 18 0 3 1 0 0 2 2 1 0 3 0
+* 5p - 70 2111 1184 983 2094 6372 5 10 4 12 1 16 3 2 2 10 1 10 0 0 0 0 2 0 1 1 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/3pGtoA_freq.txt
new file mode 100644
index 0000000..eb25710
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.0262458471760797
+2 0.0510510510510511
+3 0.03003003003003
+4 0.0262172284644195
+5 0.0168869309838473
+6 0.0094876660341556
+7 0.0111390585698886
+8 0.00541455160744501
+9 0.00467794170564951
+10 0.00392156862745098
+11 0.00551533953809031
+12 0.00552486187845304
+13 0.00236167341430499
+14 0.0023094688221709
+15 0.00449671393981321
+16 0.00291226792864944
+17 0.00545702592087312
+18 0.0034447123665174
+19 0.0023592854735423
+20 0.00350017500875044
+21 0.00269269606193201
+22 0.00450606585788562
+23 0.00102249488752556
+24 0.00176491351923756
+25 0.0037568306010929
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/5pCtoT_freq.txt
new file mode 100644
index 0000000..6c98a47
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.0026525198938992
+2 0.00542740841248304
+3 0.00252434186801298
+4 0.00340522133938706
+5 0.00267686424474187
+6 0.00391389432485323
+7 0.00191204588910134
+8 0.00287666307083783
+9 0.00179920834832674
+10 0.00204988042364195
+11 0.00105969622041681
+12 0.000723327305605787
+13 0.00335683115139308
+14 0.00173370319001387
+15 0.00248579545454545
+16 0.0022189349112426
+17 0.00251979841612671
+18 0.00356125356125356
+19 0.00103698582786035
+20 0.00178316690442225
+21 0.00285408490902604
+22 0.00206967919972404
+23 0.00247087892693258
+24 0.00145454545454545
+25 0.00203665987780041
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..ad739ac
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Length_plot.pdf
new file mode 100644
index 0000000..86ac27c
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Runtime_log.txt
new file mode 100644
index 0000000..29859ed
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-22 17:20:58,032 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1889_id_TAGCTT' -i - -d /home/mischu/scratch/bam_pipeline/a7edd5f6-72b3-49fb-9e8c-6c75b0380625 -r 000_prefixes/Pi_mito.fasta --downsample 100000
+2013-10-22 17:21:02,918 DEBUG main: BAM read in 7.957100 seconds
+2013-10-22 17:21:03,765 INFO main: Successful run
+2013-10-22 17:21:03,766 DEBUG main: Run completed in 8.805061 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..9ce6e31
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.925998571161551,0.882787227319832
+"2",2,0.914337537292083,0.797582740068081
+"3",3,0.891720420244259,0.702429484870993
+"4",4,0.854757604301429,0.637955985583337
+"5",5,0.808392867361064,0.567273652429126
+"6",6,0.755297075135886,0.481396579451544
+"7",7,0.684815005994372,0.426519670734632
+"8",8,0.599985778586409,0.38492899138771
+"9",9,0.499036182408831,0.358662873160201
+"10",10,0.379511793306211,0.346875155848198
+"11",11,0.241847519069161,0.343931578090552
+"12",12,0.109468731381488,0.335337034692814
+"13",-12,0.18676319578116,0.290721981766262
+"14",-11,0.261112922792338,0.330411060484778
+"15",-10,0.335673759669022,0.385286864451621
+"16",-9,0.405781118075488,0.456998048876173
+"17",-8,0.483176063209752,0.531693713311073
+"18",-7,0.570832045207107,0.60056296265569
+"19",-6,0.667264714255921,0.655505706193023
+"20",-5,0.740960881831015,0.717093217679107
+"21",-4,0.790350994427183,0.785709119353961
+"22",-3,0.833445800684247,0.84049201053831
+"23",-2,0.87883024217971,0.875299924554874
+"24",-1,0.921381649204713,0.891841845467951
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..9b17f7e
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..5456a06
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0219006246202408,0.000301435797850012,0.135963124257766,0.280859595577021,1.32153807785633,-7196.45768442725
+"Std.",0.000577508511709199,0.000270695538345054,0.00829792945188973,0.0128542350694799,0.0389621378981312,1.66756953366878
+"Acceptance ratio",0.25816,0.22266,0.17846,0.2766,0.1681,0.71376
+"0%",0.0198380040231543,1.42454373036551e-07,0.105715258920058,0.232092428890298,1.17262762596825,-7208.4028232571
+"2.5%",0.0207999681487075,7.97617899961323e-06,0.120449580917865,0.256632399980433,1.24420886613335,-7200.50857340635
+"5%",0.0209619377524795,1.74239994285326e-05,0.122882687590637,0.260147525269519,1.25735756692252,-7199.60533224006
+"7.5%",0.0210768439624952,2.61663696933128e-05,0.124427849522337,0.262712496349474,1.26502812916114,-7199.0652803913
+"10%",0.0211556057510592,3.57668773304168e-05,0.125589378472983,0.264500880486889,1.27152708890266,-7198.6763488737
+"12.5%",0.0212380274553014,4.59863246483008e-05,0.126529306233487,0.266061010134952,1.27622428737988,-7198.37371146721
+"15%",0.0212965318751965,5.64469521232714e-05,0.127295326369494,0.26731377361585,1.2810373257763,-7198.11299209031
+"17.5%",0.0213545286888209,6.57435369120891e-05,0.128151990268606,0.268701268083403,1.28488666102036,-7197.89223809179
+"20%",0.021404150839001,7.75433239188574e-05,0.1288756895558,0.269832175100336,1.28850743989622,-7197.69015234756
+"22.5%",0.021454811520864,8.7688082633059e-05,0.129605162821424,0.271016837284611,1.29150781292373,-7197.50270308477
+"25%",0.0215012669435862,9.87486618499508e-05,0.130291329058379,0.272036623908784,1.29444215043912,-7197.33978483946
+"27.5%",0.021551642968403,0.000109707589569778,0.130876807678426,0.273029553027177,1.297516623753,-7197.18979769401
+"30%",0.0215919136813323,0.000121942718117046,0.131393032699831,0.27392399944051,1.30084460503644,-7197.0610631835
+"32.5%",0.021633629016959,0.000133990310444212,0.131889832175742,0.274712382003804,1.30379545341862,-7196.92400625381
+"35%",0.0216711796506022,0.000145442321288712,0.132390746261601,0.275576760294265,1.30648098034317,-7196.80253412275
+"37.5%",0.021712568700984,0.000157325009001209,0.13298177418537,0.276437823366443,1.30934737833592,-7196.67780995717
+"40%",0.021746513874276,0.000168861496333151,0.133508314788367,0.277268275266228,1.31190356996841,-7196.56162350965
+"42.5%",0.0217861693567535,0.000182576295032637,0.134105849442706,0.278025103283447,1.31416691172066,-7196.45612109293
+"45%",0.0218166574413764,0.000194572935578752,0.134593600493832,0.278844728993976,1.31661711167875,-7196.35413772633
+"47.5%",0.0218572797750236,0.000206861382243106,0.135132123647678,0.279737662784732,1.31869776280782,-7196.25528104668
+"50%",0.0218904913040355,0.000221480205429371,0.135717266463774,0.280507741728843,1.32100544098881,-7196.15719631076
+"52.5%",0.021925443392891,0.00023907447045693,0.136209481021594,0.281363339749952,1.32334079471677,-7196.05506692594
+"55%",0.021960565550387,0.000258028822695882,0.136735335440015,0.282249268172593,1.32550948326703,-7195.95989943836
+"57.5%",0.0219970696408049,0.000276920360160845,0.137291096961473,0.283063424117479,1.32835682214939,-7195.87186830903
+"60%",0.0220354732641326,0.000292750441321672,0.137845013138172,0.283954502438702,1.33095800276877,-7195.78497636926
+"62.5%",0.0220727378763268,0.000313295472211612,0.138438237987297,0.284735809685111,1.33356243285064,-7195.68839130381
+"65%",0.0221120896707441,0.000333892523868731,0.13893683157507,0.2857206049311,1.33651657970179,-7195.59886638452
+"67.5%",0.0221533609927046,0.000354724264727856,0.13958492124218,0.286563387675962,1.33945200268971,-7195.51015935451
+"70%",0.0221907405503245,0.000379243299691518,0.140159157542582,0.287544079565563,1.34206553010055,-7195.41684889046
+"72.5%",0.0222326438570029,0.000402726859897231,0.140767695603421,0.288425556734909,1.34544693218872,-7195.32740676445
+"75%",0.0222814811611222,0.00042929327239936,0.141376978959461,0.28949410400511,1.34843417123769,-7195.23770753798
+"77.5%",0.0223258474773438,0.000458977934803026,0.142103251040501,0.290575715579286,1.35205750045743,-7195.13942547137
+"80%",0.0223814742343838,0.000487492034385724,0.142880131360132,0.291696925738954,1.35544805263817,-7195.05133145234
+"82.5%",0.0224457662565876,0.000524026816142365,0.143801760912239,0.292937831664337,1.35851945663808,-7194.9539139229
+"85%",0.0225208660177311,0.000564920838548091,0.144663489314176,0.294401695543006,1.36212981905504,-7194.8546677846
+"87.5%",0.0225826130746442,0.000613802109865189,0.145554938972738,0.296053961989943,1.3665808949157,-7194.74317241523
+"90%",0.0226584729019201,0.00067572313878485,0.146674748706783,0.297904412966022,1.37157885733898,-7194.62736249628
+"92.5%",0.0227542089826694,0.00074412941900157,0.148210046903166,0.2995952777018,1.3775990004394,-7194.49562604906
+"95%",0.0228784645286842,0.000838798495692459,0.150122129887906,0.302372251599279,1.38669876571134,-7194.34259604818
+"97.5%",0.0230635522407027,0.000999811954683026,0.152706643347987,0.306678831263994,1.39831160066654,-7194.13064297287
+"100%",0.0243966160330714,0.00226910407169435,0.169124915594876,0.328329723241691,1.46031974279686,-7193.5528556739
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..f36b843
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..9b77207
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/dnacomp.txt
new file mode 100644
index 0000000..c04ed29
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 2011 965 1066 2040 6082
+* 3p + -69 2074 918 1045 2145 6182
+* 3p + -68 2215 900 1127 2042 6284
+* 3p + -67 2073 972 1223 2121 6389
+* 3p + -66 2169 949 1204 2149 6471
+* 3p + -65 2168 961 1253 2182 6564
+* 3p + -64 2160 1040 1247 2229 6676
+* 3p + -63 2262 974 1246 2282 6764
+* 3p + -62 2346 1038 1208 2270 6862
+* 3p + -61 2382 1052 1271 2260 6965
+* 3p + -60 2323 1052 1304 2374 7053
+* 3p + -59 2426 1005 1319 2430 7180
+* 3p + -58 2411 1162 1381 2327 7281
+* 3p + -57 2443 1136 1327 2454 7360
+* 3p + -56 2508 1138 1416 2389 7451
+* 3p + -55 2504 1114 1435 2477 7530
+* 3p + -54 2561 1149 1366 2544 7620
+* 3p + -53 2503 1232 1531 2431 7697
+* 3p + -52 2502 1186 1491 2591 7770
+* 3p + -51 2664 1199 1381 2595 7839
+* 3p + -50 2559 1215 1559 2566 7899
+* 3p + -49 2572 1262 1536 2585 7955
+* 3p + -48 2688 1205 1443 2662 7998
+* 3p + -47 2661 1209 1478 2701 8049
+* 3p + -46 2762 1283 1532 2526 8103
+* 3p + -45 2823 1170 1465 2690 8148
+* 3p + -44 2801 1211 1597 2584 8193
+* 3p + -43 2764 1251 1581 2635 8231
+* 3p + -42 2709 1254 1619 2676 8258
+* 3p + -41 2765 1242 1629 2644 8280
+* 3p + -40 2665 1210 1602 2820 8297
+* 3p + -39 2699 1220 1619 2776 8314
+* 3p + -38 2807 1299 1520 2725 8351
+* 3p + -37 2747 1342 1562 2721 8372
+* 3p + -36 2788 1307 1513 2779 8387
+* 3p + -35 2862 1219 1631 2689 8401
+* 3p + -34 2742 1209 1633 2829 8413
+* 3p + -33 2879 1218 1542 2779 8418
+* 3p + -32 2852 1269 1550 2758 8429
+* 3p + -31 2843 1252 1674 2672 8441
+* 3p + -30 2935 1234 1514 2757 8440
+* 3p + -29 2977 1213 1539 2725 8454
+* 3p + -28 2713 1245 1693 2815 8466
+* 3p + -27 2886 1286 1503 2803 8478
+* 3p + -26 2829 1252 1615 2795 8491
+* 3p + -25 2876 1238 1624 2769 8507
+* 3p + -24 2839 1225 1563 2893 8520
+* 3p + -23 2929 1184 1605 2802 8520
+* 3p + -22 2793 1312 1616 2787 8508
+* 3p + -21 2790 1285 1616 2832 8523
+* 3p + -20 2931 1227 1564 2803 8525
+* 3p + -19 2910 1222 1636 2757 8525
+* 3p + -18 2874 1212 1545 2894 8525
+* 3p + -17 2910 1201 1571 2843 8525
+* 3p + -16 3036 1225 1531 2733 8525
+* 3p + -15 2878 1146 1599 2837 8460
+* 3p + -14 2861 1188 1703 2772 8524
+* 3p + -13 2866 1204 1616 2839 8525
+* 3p + -12 2905 1144 1610 2865 8524
+* 3p + -11 2839 1137 1622 2927 8525
+* 3p + -10 2921 1343 1494 2766 8524
+* 3p + -9 3167 1159 1467 2732 8525
+* 3p + -8 3100 1047 1597 2781 8525
+* 3p + -7 2877 1264 1488 2896 8525
+* 3p + -6 3131 1099 1478 2817 8525
+* 3p + -5 2974 1120 1450 2981 8525
+* 3p + -4 3182 1103 1402 2837 8524
+* 3p + -3 3118 1082 1447 2878 8525
+* 3p + -2 3190 1292 1267 2776 8525
+* 3p + -1 1672 1424 1656 3773 8525
+* 3p + 1 1787 1688 721 4329 8525
+* 3p + 2 2728 1344 1505 2948 8525
+* 3p + 3 3024 1181 1468 2852 8525
+* 3p + 4 3084 1133 1478 2830 8525
+* 3p + 5 3099 1097 1410 2917 8523
+* 3p + 6 3148 1124 1395 2855 8522
+* 3p + 7 3044 1119 1429 2930 8522
+* 3p + 8 3029 1184 1401 2908 8522
+* 3p + 9 2999 1134 1392 2997 8522
+* 3p + 10 3034 1117 1355 3016 8522
+* 3p - -70 1945 1076 887 2142 6050
+* 3p - -69 2092 1104 882 2072 6150
+* 3p - -68 2225 1132 845 2066 6268
+* 3p - -67 2172 1139 980 2076 6367
+* 3p - -66 2178 1208 932 2178 6496
+* 3p - -65 2259 1197 930 2216 6602
+* 3p - -64 2273 1156 997 2279 6705
+* 3p - -63 2322 1209 1000 2289 6820
+* 3p - -62 2356 1200 1009 2345 6910
+* 3p - -61 2410 1268 1040 2287 7005
+* 3p - -60 2442 1288 1013 2357 7100
+* 3p - -59 2454 1278 1026 2456 7214
+* 3p - -58 2386 1377 1093 2452 7308
+* 3p - -57 2453 1306 1096 2575 7430
+* 3p - -56 2476 1407 1089 2559 7531
+* 3p - -55 2474 1476 1125 2565 7640
+* 3p - -54 2549 1438 1131 2594 7712
+* 3p - -53 2564 1437 1182 2603 7786
+* 3p - -52 2510 1504 1232 2611 7857
+* 3p - -51 2572 1514 1190 2643 7919
+* 3p - -50 2730 1497 1142 2610 7979
+* 3p - -49 2523 1600 1168 2759 8050
+* 3p - -48 2661 1572 1088 2787 8108
+* 3p - -47 2737 1574 1213 2645 8169
+* 3p - -46 2665 1579 1230 2753 8227
+* 3p - -45 2767 1420 1292 2806 8285
+* 3p - -44 2780 1440 1327 2776 8323
+* 3p - -43 2796 1585 1243 2729 8353
+* 3p - -42 2894 1477 1224 2787 8382
+* 3p - -41 2822 1572 1223 2798 8415
+* 3p - -40 2738 1614 1233 2860 8445
+* 3p - -39 2819 1590 1166 2896 8471
+* 3p - -38 2795 1573 1296 2834 8498
+* 3p - -37 2736 1668 1305 2806 8515
+* 3p - -36 2823 1660 1151 2898 8532
+* 3p - -35 2830 1584 1255 2878 8547
+* 3p - -34 2763 1676 1238 2888 8565
+* 3p - -33 2891 1622 1181 2879 8573
+* 3p - -32 2803 1573 1307 2891 8574
+* 3p - -31 2845 1568 1332 2840 8585
+* 3p - -30 2999 1580 1254 2752 8585
+* 3p - -29 2818 1608 1328 2852 8606
+* 3p - -28 2832 1537 1425 2822 8616
+* 3p - -27 2884 1561 1233 2946 8624
+* 3p - -26 2823 1601 1369 2852 8645
+* 3p - -25 2782 1637 1298 2943 8660
+* 3p - -24 2927 1553 1270 2918 8668
+* 3p - -23 2801 1586 1344 2939 8670
+* 3p - -22 2787 1590 1285 2999 8661
+* 3p - -21 2929 1408 1361 2971 8669
+* 3p - -20 2977 1570 1273 2851 8671
+* 3p - -19 2920 1598 1326 2826 8670
+* 3p - -18 2986 1570 1358 2757 8671
+* 3p - -17 2966 1486 1342 2876 8670
+* 3p - -16 2923 1586 1219 2942 8670
+* 3p - -15 2890 1509 1278 2933 8610
+* 3p - -14 2986 1542 1336 2807 8671
+* 3p - -13 2968 1585 1357 2760 8670
+* 3p - -12 2925 1462 1285 2999 8671
+* 3p - -11 3034 1543 1276 2818 8671
+* 3p - -10 2992 1528 1323 2825 8668
+* 3p - -9 3165 1475 1311 2719 8670
+* 3p - -8 3141 1426 1377 2727 8671
+* 3p - -7 2818 1514 1309 3030 8671
+* 3p - -6 3059 1377 1201 3034 8671
+* 3p - -5 2945 1444 1267 3015 8671
+* 3p - -4 2967 1344 1246 3114 8671
+* 3p - -3 3156 1252 1199 3064 8671
+* 3p - -2 3354 1496 980 2841 8671
+* 3p - -1 1710 1840 1213 3908 8671
+* 3p - 1 1858 1935 566 4312 8671
+* 3p - 2 2732 1615 1244 3080 8671
+* 3p - 3 2999 1479 1251 2942 8671
+* 3p - 4 3047 1413 1164 3047 8671
+* 3p - 5 3166 1260 1180 3065 8671
+* 3p - 6 3080 1325 1193 3073 8671
+* 3p - 7 3135 1378 1086 3072 8671
+* 3p - 8 3103 1296 1136 3136 8671
+* 3p - 9 3091 1423 1080 3077 8671
+* 3p - 10 3046 1417 1143 3065 8671
+* 5p + -10 3155 1079 1272 3017 8523
+* 5p + -9 3110 1117 1372 2924 8523
+* 5p + -8 3055 1030 1298 3140 8523
+* 5p + -7 3062 1084 1363 3014 8523
+* 5p + -6 2971 1134 1312 3106 8523
+* 5p + -5 2860 1124 1290 3249 8523
+* 5p + -4 2873 1163 1427 3060 8523
+* 5p + -3 2960 1166 1418 2979 8523
+* 5p + -2 3271 1084 1680 2488 8523
+* 5p + -1 4978 338 2245 962 8523
+* 5p + 1 3961 1238 1779 1547 8525
+* 5p + 2 2686 903 1602 3334 8525
+* 5p + 3 3024 1301 1295 2904 8524
+* 5p + 4 2955 1246 1373 2951 8525
+* 5p + 5 3066 1222 1386 2851 8525
+* 5p + 6 3023 1119 1356 3027 8525
+* 5p + 7 3025 1207 1502 2791 8525
+* 5p + 8 2648 1299 1396 3182 8525
+* 5p + 9 2708 1303 1370 3144 8525
+* 5p + 10 2793 1324 1535 2873 8525
+* 5p + 11 2963 1173 1435 2954 8525
+* 5p + 12 2960 1198 1495 2872 8525
+* 5p + 13 2691 1341 1547 2946 8525
+* 5p + 14 2845 1292 1461 2927 8525
+* 5p + 15 2936 1266 1389 2919 8510
+* 5p + 16 2941 1226 1480 2878 8525
+* 5p + 17 2883 1230 1504 2908 8525
+* 5p + 18 2798 1278 1558 2891 8525
+* 5p + 19 2877 1279 1535 2834 8525
+* 5p + 20 2898 1234 1518 2875 8525
+* 5p + 21 2942 1289 1520 2773 8524
+* 5p + 22 2864 1294 1560 2803 8521
+* 5p + 23 3006 1230 1545 2740 8521
+* 5p + 24 2834 1207 1560 2920 8521
+* 5p + 25 2904 1277 1565 2775 8521
+* 5p + 26 2864 1258 1558 2812 8492
+* 5p + 27 2964 1206 1509 2799 8478
+* 5p + 28 2756 1369 1496 2845 8466
+* 5p + 29 2804 1259 1545 2847 8455
+* 5p + 30 2754 1262 1595 2839 8450
+* 5p + 31 2815 1245 1671 2710 8441
+* 5p + 32 2927 1286 1590 2626 8429
+* 5p + 33 2865 1179 1624 2750 8418
+* 5p + 34 2865 1236 1584 2728 8413
+* 5p + 35 2896 1225 1571 2709 8401
+* 5p + 36 2805 1176 1665 2741 8387
+* 5p + 37 2719 1354 1652 2647 8372
+* 5p + 38 2804 1261 1537 2749 8351
+* 5p + 39 2846 1183 1532 2753 8314
+* 5p + 40 2761 1200 1649 2687 8297
+* 5p + 41 2880 1260 1456 2684 8280
+* 5p + 42 2821 1197 1467 2773 8258
+* 5p + 43 2727 1287 1543 2674 8231
+* 5p + 44 2725 1295 1504 2669 8193
+* 5p + 45 2768 1213 1446 2721 8148
+* 5p + 46 2608 1216 1543 2736 8103
+* 5p + 47 2659 1161 1507 2722 8049
+* 5p + 48 2783 1090 1537 2588 7998
+* 5p + 49 2566 1244 1589 2556 7955
+* 5p + 50 2627 1162 1424 2686 7899
+* 5p + 51 2618 1135 1613 2473 7839
+* 5p + 52 2559 1226 1489 2496 7770
+* 5p + 53 2533 1132 1373 2659 7697
+* 5p + 54 2533 1063 1490 2534 7620
+* 5p + 55 2455 1203 1581 2291 7530
+* 5p + 56 2529 1135 1391 2396 7451
+* 5p + 57 2482 1091 1369 2418 7360
+* 5p + 58 2430 1139 1375 2337 7281
+* 5p + 59 2453 1031 1271 2425 7180
+* 5p + 60 2434 962 1274 2383 7053
+* 5p + 61 2344 1101 1225 2295 6965
+* 5p + 62 2281 1062 1194 2325 6862
+* 5p + 63 2337 957 1229 2241 6764
+* 5p + 64 2243 1033 1237 2163 6676
+* 5p + 65 2166 925 1179 2294 6564
+* 5p + 66 2140 927 1198 2206 6471
+* 5p + 67 2011 1046 1191 2141 6389
+* 5p + 68 2081 911 1128 2164 6284
+* 5p + 69 2140 882 1115 2021 6158
+* 5p + 70 2081 952 1156 1893 6082
+* 5p - -10 3144 1346 1117 3064 8671
+* 5p - -9 3110 1477 1025 3059 8671
+* 5p - -8 3048 1399 1094 3130 8671
+* 5p - -7 3038 1389 1073 3171 8671
+* 5p - -6 2988 1454 1096 3133 8671
+* 5p - -5 2972 1458 1036 3205 8671
+* 5p - -4 2802 1495 1134 3240 8671
+* 5p - -3 2764 1540 1241 3126 8671
+* 5p - -2 3242 1378 1443 2608 8671
+* 5p - -1 5189 489 1908 1085 8671
+* 5p - 1 3939 1789 1424 1519 8671
+* 5p - 2 2732 1316 1316 3307 8671
+* 5p - 3 3136 1480 995 3060 8671
+* 5p - 4 2950 1414 1097 3210 8671
+* 5p - 5 3020 1414 1128 3109 8671
+* 5p - 6 2865 1442 1093 3271 8671
+* 5p - 7 2993 1414 1286 2978 8671
+* 5p - 8 2679 1499 1095 3398 8671
+* 5p - 9 2818 1493 1133 3227 8671
+* 5p - 10 2771 1616 1306 2978 8671
+* 5p - 11 2914 1666 1125 2966 8671
+* 5p - 12 2878 1584 1204 3005 8671
+* 5p - 13 2904 1647 1227 2892 8670
+* 5p - 14 2939 1610 1125 2997 8671
+* 5p - 15 2878 1554 1240 2982 8654
+* 5p - 16 2918 1483 1246 3024 8671
+* 5p - 17 2884 1553 1318 2916 8671
+* 5p - 18 3012 1524 1176 2959 8671
+* 5p - 19 2864 1620 1304 2883 8671
+* 5p - 20 2877 1568 1232 2994 8671
+* 5p - 21 2986 1525 1235 2925 8671
+* 5p - 22 2832 1613 1267 2959 8671
+* 5p - 23 2860 1613 1195 3002 8670
+* 5p - 24 3028 1557 1192 2893 8670
+* 5p - 25 2971 1675 1211 2813 8670
+* 5p - 26 2854 1614 1269 2909 8646
+* 5p - 27 2780 1545 1277 3024 8626
+* 5p - 28 2750 1624 1222 3020 8616
+* 5p - 29 2790 1601 1232 2983 8606
+* 5p - 30 2811 1656 1215 2915 8597
+* 5p - 31 2799 1678 1273 2835 8585
+* 5p - 32 2874 1592 1252 2857 8575
+* 5p - 33 2939 1597 1181 2856 8573
+* 5p - 34 2971 1590 1191 2813 8565
+* 5p - 35 2848 1630 1280 2790 8548
+* 5p - 36 2891 1615 1210 2816 8532
+* 5p - 37 2772 1563 1394 2786 8515
+* 5p - 38 2791 1548 1232 2927 8498
+* 5p - 39 2804 1580 1301 2786 8471
+* 5p - 40 2733 1516 1295 2901 8445
+* 5p - 41 2721 1619 1252 2823 8415
+* 5p - 42 2743 1545 1245 2849 8382
+* 5p - 43 2693 1591 1238 2832 8354
+* 5p - 44 2724 1550 1224 2825 8323
+* 5p - 45 2671 1516 1258 2840 8285
+* 5p - 46 2463 1566 1327 2871 8227
+* 5p - 47 2735 1478 1220 2736 8169
+* 5p - 48 2719 1461 1226 2702 8108
+* 5p - 49 2658 1512 1223 2657 8050
+* 5p - 50 2591 1617 1200 2571 7979
+* 5p - 51 2629 1451 1173 2666 7919
+* 5p - 52 2597 1550 1126 2583 7856
+* 5p - 53 2502 1580 1179 2525 7786
+* 5p - 54 2591 1394 1203 2524 7712
+* 5p - 55 2481 1464 1145 2550 7640
+* 5p - 56 2467 1396 1119 2549 7531
+* 5p - 57 2525 1300 1158 2447 7430
+* 5p - 58 2571 1268 1081 2388 7308
+* 5p - 59 2490 1268 994 2462 7214
+* 5p - 60 2376 1360 986 2378 7100
+* 5p - 61 2197 1284 1077 2447 7005
+* 5p - 62 2378 1194 1005 2333 6910
+* 5p - 63 2265 1239 1032 2283 6819
+* 5p - 64 2302 1190 1069 2144 6705
+* 5p - 65 2263 1243 1011 2085 6602
+* 5p - 66 2164 1206 968 2158 6496
+* 5p - 67 2079 1266 962 2060 6367
+* 5p - 68 2108 1134 924 2102 6268
+* 5p - 69 2095 1112 925 1992 6124
+* 5p - 70 2054 1056 923 2017 6050
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/dnacomp_genome.csv
new file mode 100644
index 0000000..94d15ba
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.388112441327,0.105690628131,0.117477981119,0.388718949422
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/lgdistribution.txt
new file mode 100644
index 0000000..b868d04
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/lgdistribution.txt
@@ -0,0 +1,327 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 25 7
++ 26 4
++ 27 5
++ 28 6
++ 29 4
++ 30 6
++ 31 11
++ 32 9
++ 33 4
++ 34 11
++ 35 13
++ 36 14
++ 37 23
++ 38 32
++ 39 19
++ 40 15
++ 41 23
++ 42 22
++ 43 42
++ 44 44
++ 45 42
++ 46 53
++ 47 50
++ 48 40
++ 49 55
++ 50 59
++ 51 69
++ 52 72
++ 53 74
++ 54 91
++ 55 79
++ 56 88
++ 57 83
++ 58 103
++ 59 121
++ 60 89
++ 61 104
++ 62 95
++ 63 90
++ 64 112
++ 65 89
++ 66 86
++ 67 108
++ 68 90
++ 69 109
++ 70 109
++ 71 94
++ 72 101
++ 73 110
++ 74 100
++ 75 88
++ 76 88
++ 77 101
++ 78 103
++ 79 109
++ 80 95
++ 81 79
++ 82 86
++ 83 88
++ 84 68
++ 85 90
++ 86 86
++ 87 76
++ 88 86
++ 89 72
++ 90 100
++ 91 104
++ 92 130
++ 93 291
++ 94 1369
++ 95 58
++ 96 36
++ 97 48
++ 98 89
++ 99 34
++ 100 49
++ 101 35
++ 102 44
++ 103 46
++ 104 41
++ 105 45
++ 106 42
++ 107 41
++ 108 38
++ 109 24
++ 110 34
++ 111 35
++ 112 34
++ 113 23
++ 114 34
++ 115 31
++ 116 35
++ 117 29
++ 118 25
++ 119 40
++ 120 40
++ 121 23
++ 122 29
++ 123 28
++ 124 29
++ 125 37
++ 126 26
++ 127 28
++ 128 29
++ 129 20
++ 130 23
++ 131 21
++ 132 20
++ 133 31
++ 134 20
++ 135 19
++ 136 23
++ 137 19
++ 138 22
++ 139 13
++ 140 21
++ 141 27
++ 142 21
++ 143 23
++ 144 23
++ 145 18
++ 146 12
++ 147 9
++ 148 10
++ 149 15
++ 150 7
++ 151 17
++ 152 12
++ 153 10
++ 154 13
++ 155 10
++ 156 16
++ 157 8
++ 158 19
++ 159 20
++ 160 11
++ 161 8
++ 162 8
++ 163 8
++ 164 8
++ 165 16
++ 166 15
++ 167 10
++ 168 6
++ 169 9
++ 170 10
++ 171 10
++ 172 6
++ 173 5
++ 174 12
++ 175 4
++ 176 10
++ 177 4
++ 178 6
++ 179 3
++ 180 6
++ 181 6
++ 182 4
++ 183 7
++ 184 3
++ 185 5
+- 24 1
+- 25 7
+- 26 11
+- 27 3
+- 28 6
+- 29 6
+- 30 10
+- 31 8
+- 32 2
+- 33 8
+- 34 15
+- 35 16
+- 36 14
+- 37 16
+- 38 23
+- 39 25
+- 40 30
+- 41 35
+- 42 26
+- 43 30
+- 44 39
+- 45 55
+- 46 55
+- 47 63
+- 48 55
+- 49 68
+- 50 58
+- 51 61
+- 52 67
+- 53 76
+- 54 72
+- 55 109
+- 56 97
+- 57 127
+- 58 93
+- 59 115
+- 60 88
+- 61 90
+- 62 104
+- 63 115
+- 64 96
+- 65 110
+- 66 127
+- 67 101
+- 68 105
+- 69 111
+- 70 113
+- 71 108
+- 72 87
+- 73 84
+- 74 97
+- 75 102
+- 76 84
+- 77 109
+- 78 97
+- 79 86
+- 80 95
+- 81 106
+- 82 92
+- 83 80
+- 84 82
+- 85 77
+- 86 95
+- 87 89
+- 88 76
+- 89 88
+- 90 92
+- 91 96
+- 92 141
+- 93 274
+- 94 1348
+- 95 55
+- 96 40
+- 97 41
+- 98 84
+- 99 43
+- 100 49
+- 101 39
+- 102 39
+- 103 38
+- 104 29
+- 105 32
+- 106 36
+- 107 31
+- 108 41
+- 109 35
+- 110 35
+- 111 30
+- 112 32
+- 113 42
+- 114 33
+- 115 41
+- 116 29
+- 117 36
+- 118 36
+- 119 37
+- 120 34
+- 121 34
+- 122 25
+- 123 31
+- 124 30
+- 125 23
+- 126 27
+- 127 25
+- 128 19
+- 129 18
+- 130 31
+- 131 23
+- 132 23
+- 133 23
+- 134 14
+- 135 27
+- 136 18
+- 137 21
+- 138 20
+- 139 18
+- 140 20
+- 141 27
+- 142 14
+- 143 21
+- 144 15
+- 145 14
+- 146 22
+- 147 12
+- 148 19
+- 149 21
+- 150 6
+- 151 19
+- 152 17
+- 153 17
+- 154 10
+- 155 11
+- 156 14
+- 157 9
+- 158 14
+- 159 11
+- 160 5
+- 161 10
+- 162 8
+- 163 15
+- 164 7
+- 165 12
+- 166 10
+- 167 6
+- 168 9
+- 169 11
+- 170 7
+- 171 11
+- 172 7
+- 173 8
+- 174 9
+- 175 4
+- 176 12
+- 177 4
+- 178 6
+- 179 6
+- 180 8
+- 181 9
+- 182 3
+- 183 5
+- 184 6
+- 185 7
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/misincorporation.txt
new file mode 100644
index 0000000..ef01cd2
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_mito.mapDamage/Pi1889_id_TAGCTT/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_mito.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 1808 1386 1734 3597 8525 32 73 22 81 42 265 1 35 24 126 24 69 0 0 0 0 0 0 0 0 11
+* 3p + 2 2976 1326 1302 2918 8522 49 35 17 37 28 113 11 63 24 260 8 30 0 0 0 0 0 1 2 0 10
+* 3p + 3 2954 1121 1470 2973 8518 43 32 20 41 14 111 4 62 26 201 4 26 0 1 0 0 2 5 0 0 10
+* 3p + 4 2989 1158 1415 2958 8520 39 30 19 21 14 59 10 56 21 185 4 19 0 3 0 1 2 1 1 0 7
+* 3p + 5 2907 1145 1443 3025 8520 14 33 13 30 10 48 5 28 26 96 3 21 0 1 4 1 0 1 2 2 7
+* 3p + 6 3079 1118 1467 2833 8497 17 19 13 23 3 38 8 23 8 53 3 5 0 1 1 0 16 6 0 6 7
+* 3p + 7 2847 1236 1487 2932 8502 15 3 7 37 7 44 2 13 8 49 1 5 3 0 0 0 10 6 5 2 7
+* 3p + 8 3078 1036 1587 2793 8494 1 4 6 18 4 26 2 8 10 31 1 5 3 3 1 4 15 10 3 3 7
+* 3p + 9 3150 1139 1471 2740 8500 7 5 1 26 6 17 3 4 3 9 1 2 3 6 3 0 21 2 0 2 7
+* 3p + 10 2893 1327 1487 2798 8505 5 8 2 28 1 11 0 4 10 13 0 4 2 5 1 0 8 9 0 2 7
+* 3p + 11 2822 1116 1626 2946 8510 10 3 3 26 2 5 2 4 5 10 4 3 0 1 4 0 8 6 0 1 7
+* 3p + 12 2905 1143 1608 2861 8517 7 2 10 8 3 3 2 5 4 5 1 3 1 1 2 0 2 3 2 0 7
+* 3p + 13 2860 1183 1614 2864 8521 6 5 7 26 1 5 1 2 0 3 0 1 1 1 0 1 3 1 0 0 6
+* 3p + 14 2860 1169 1703 2789 8521 4 2 1 20 4 3 2 2 1 5 3 2 3 2 0 0 0 2 1 0 6
+* 3p + 15 2876 1136 1605 2842 8459 7 2 4 14 1 2 1 1 1 5 0 4 0 0 0 0 0 1 0 0 6
+* 3p + 16 3023 1209 1531 2761 8524 3 5 2 16 2 2 1 0 1 8 1 0 2 0 0 0 1 0 0 0 5
+* 3p + 17 2908 1185 1574 2857 8524 6 2 2 14 2 1 0 1 0 7 0 2 0 3 1 0 0 0 1 0 4
+* 3p + 18 2871 1190 1549 2912 8522 6 3 3 18 5 0 1 0 2 4 1 2 0 2 0 0 0 0 1 2 4
+* 3p + 19 2917 1207 1632 2768 8524 1 1 9 16 2 1 0 2 1 4 0 4 0 2 0 1 0 1 0 0 4
+* 3p + 20 2917 1230 1569 2807 8523 8 4 5 10 1 3 1 3 1 1 0 3 0 1 0 0 1 0 0 1 4
+* 3p + 21 2793 1262 1618 2849 8522 3 1 6 17 1 2 0 2 3 1 2 3 0 1 0 0 0 0 1 0 4
+* 3p + 22 2798 1300 1611 2799 8508 8 0 8 8 2 3 0 0 4 3 1 3 1 1 0 0 0 0 0 0 3
+* 3p + 23 2938 1169 1594 2819 8520 2 3 8 16 3 1 1 0 4 6 1 1 0 0 0 0 0 0 0 0 3
+* 3p + 24 2849 1216 1563 2892 8520 2 5 6 12 1 3 0 1 1 0 1 1 0 0 0 0 0 0 0 0 3
+* 3p + 25 2876 1213 1627 2791 8507 5 3 3 27 0 3 0 2 2 1 0 6 0 1 0 0 0 0 0 0 3
+* 3p + 26 2812 1237 1620 2823 8492 5 2 6 15 0 1 0 0 1 1 0 2 0 1 0 0 0 0 0 0 3
+* 3p + 27 2891 1273 1501 2815 8480 3 2 4 21 1 0 0 1 2 3 2 0 0 2 0 0 2 0 0 0 3
+* 3p + 28 2711 1230 1694 2829 8464 6 2 3 11 2 1 0 5 4 4 0 2 0 0 0 0 2 0 0 0 3
+* 3p + 29 2963 1204 1528 2758 8453 6 3 3 15 2 0 1 3 6 3 1 0 0 3 0 0 1 0 0 0 3
+* 3p + 30 2930 1222 1523 2765 8440 5 0 4 13 1 2 0 5 0 1 0 0 0 0 0 0 0 0 0 0 3
+* 3p + 31 2844 1245 1672 2680 8441 5 2 6 8 3 1 0 4 0 2 0 1 0 0 0 0 0 0 0 0 3
+* 3p + 32 2856 1257 1545 2771 8429 1 0 2 17 4 4 0 0 1 2 0 1 0 0 0 0 0 0 0 0 3
+* 3p + 33 2891 1208 1547 2772 8418 4 0 6 7 3 0 1 1 1 2 0 0 0 0 0 0 0 0 0 0 3
+* 3p + 34 2738 1189 1631 2855 8413 4 1 3 20 1 0 1 0 1 0 0 1 0 0 1 0 0 0 0 0 3
+* 3p + 35 2859 1214 1627 2701 8401 3 5 7 15 3 4 2 3 3 3 0 5 1 0 0 0 0 0 0 0 2
+* 3p + 36 2779 1292 1511 2805 8387 4 2 0 12 1 0 0 3 2 4 1 1 0 1 0 0 0 0 0 0 2
+* 3p + 37 2745 1330 1568 2729 8372 9 0 3 15 0 0 0 0 0 0 0 6 0 1 0 0 0 0 0 0 2
+* 3p + 38 2804 1292 1525 2730 8351 6 1 3 12 4 0 0 0 2 3 1 1 0 0 0 0 0 0 0 0 2
+* 3p + 39 2688 1208 1632 2786 8314 8 1 4 20 0 3 0 8 1 0 1 2 0 0 0 0 0 0 0 0 2
+* 3p + 40 2663 1196 1602 2836 8297 10 4 3 10 2 1 0 5 1 0 0 1 0 0 0 0 0 0 0 0 2
+* 3p + 41 2759 1249 1629 2643 8280 1 6 2 11 2 0 0 7 2 0 0 0 1 0 0 0 0 0 0 0 2
+* 3p + 42 2718 1248 1622 2670 8258 2 4 4 6 3 1 1 5 2 1 1 2 1 0 0 0 0 0 0 0 2
+* 3p + 43 2759 1253 1577 2643 8232 0 3 3 3 1 0 0 0 1 1 0 1 2 0 0 0 0 0 0 0 2
+* 3p + 44 2787 1206 1593 2608 8194 13 1 6 9 2 0 1 1 2 1 1 1 0 2 0 0 0 0 0 0 2
+* 3p + 45 2832 1161 1470 2686 8149 10 4 4 8 3 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 2
+* 3p + 46 2740 1288 1537 2538 8103 1 4 3 7 2 2 2 11 1 2 0 1 0 4 0 0 0 0 0 0 2
+* 3p + 47 2655 1211 1481 2702 8049 2 2 4 13 1 0 0 9 2 1 0 3 0 0 0 0 0 0 0 0 2
+* 3p + 48 2696 1190 1433 2679 7998 1 1 6 11 0 0 0 0 2 0 0 0 0 1 0 0 0 0 0 0 1
+* 3p + 49 2570 1251 1531 2603 7955 4 0 3 5 1 0 0 2 2 0 0 0 1 1 0 0 0 0 0 0 1
+* 3p + 50 2571 1211 1557 2561 7900 0 3 2 9 2 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 51 2655 1204 1373 2609 7841 0 3 3 9 0 0 0 3 1 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 52 2512 1175 1488 2595 7770 1 2 6 8 1 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 53 2494 1236 1535 2432 7697 1 6 1 5 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 54 2567 1139 1367 2550 7623 3 3 4 13 0 0 0 1 2 1 2 0 0 2 0 0 0 0 0 0 0
+* 3p + 55 2502 1101 1433 2497 7533 0 3 1 10 0 3 0 0 2 1 0 0 1 0 0 0 0 0 0 0 0
+* 3p + 56 2511 1131 1422 2387 7451 3 1 3 4 2 1 1 0 2 1 0 5 1 0 0 0 0 0 0 0 0
+* 3p + 57 2454 1133 1318 2458 7363 0 2 2 8 2 0 1 0 1 1 1 5 1 0 0 0 0 0 0 0 0
+* 3p + 58 2398 1152 1385 2346 7281 6 2 5 7 0 1 0 1 1 0 0 2 1 0 0 0 0 0 0 0 0
+* 3p + 59 2436 1009 1316 2419 7180 0 4 3 8 0 1 0 0 1 1 0 2 0 1 0 0 0 0 0 0 0
+* 3p + 60 2327 1051 1310 2367 7055 1 2 0 5 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 61 2390 1042 1271 2264 6967 1 1 3 3 1 0 0 0 2 1 2 4 0 0 0 0 0 0 0 0 0
+* 3p + 62 2344 1037 1213 2268 6862 2 3 2 5 0 0 0 0 0 3 0 3 0 0 0 0 0 0 0 0 0
+* 3p + 63 2266 969 1241 2291 6767 1 2 4 9 1 0 1 0 2 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 64 2150 1033 1250 2244 6677 1 2 2 7 2 0 0 1 0 3 0 1 0 0 0 0 0 0 0 0 0
+* 3p + 65 2172 959 1251 2185 6567 0 3 1 2 0 0 0 0 2 0 1 1 0 0 0 1 0 0 0 0 0
+* 3p + 66 2175 942 1206 2151 6474 0 3 2 6 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p + 67 2080 973 1218 2119 6390 0 3 3 4 0 0 0 1 1 1 0 3 0 0 0 0 0 0 0 0 0
+* 3p + 68 2202 903 1130 2052 6287 1 2 1 2 1 1 0 0 0 1 0 2 0 0 0 0 0 0 0 0 0
+* 3p + 69 2079 910 1049 2147 6185 0 0 0 7 1 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0
+* 3p + 70 2014 965 1066 2037 6082 1 1 1 5 1 0 1 0 0 1 0 3 0 0 0 0 0 0 0 0 0
+* 3p - 1 1833 1799 1276 3763 8671 47 83 29 84 65 274 10 42 29 156 27 57 0 0 0 0 0 0 0 0 6
+* 3p - 2 3141 1534 1029 2959 8663 70 43 26 43 18 146 11 60 42 264 14 42 0 0 0 2 7 0 1 0 6
+* 3p - 3 3004 1258 1194 3207 8663 37 31 33 49 17 107 9 37 31 234 2 28 0 2 0 1 3 4 1 0 6
+* 3p - 4 2788 1363 1255 3249 8655 31 23 23 37 14 71 6 40 27 194 3 31 0 4 2 1 14 1 0 1 3
+* 3p - 5 2819 1457 1281 3084 8641 32 31 13 34 8 50 7 20 13 124 1 15 0 7 2 1 18 8 2 2 3
+* 3p - 6 3008 1369 1168 3096 8641 8 12 25 19 12 32 3 15 21 81 4 9 0 4 1 0 20 4 4 2 3
+* 3p - 7 2747 1503 1296 3084 8630 16 10 11 26 8 15 3 13 16 54 2 3 1 4 3 2 20 12 4 5 3
+* 3p - 8 3076 1415 1368 2780 8639 15 8 14 19 4 17 2 5 8 45 2 1 0 11 1 1 21 6 3 2 3
+* 3p - 9 3137 1450 1308 2742 8637 6 2 7 18 8 8 1 5 7 19 1 5 1 6 0 0 15 9 7 2 3
+* 3p - 10 2976 1515 1318 2839 8648 6 2 13 14 1 13 2 5 4 24 1 5 1 4 0 2 16 4 0 0 3
+* 3p - 11 3028 1534 1275 2829 8666 6 8 5 12 5 6 0 1 3 15 0 5 0 2 0 0 3 2 0 0 3
+* 3p - 12 2910 1458 1288 3011 8667 9 4 2 12 5 7 2 3 9 10 0 5 0 2 1 0 2 0 0 2 3
+* 3p - 13 2969 1571 1350 2774 8664 1 1 7 9 3 3 0 0 1 8 0 3 0 2 0 0 3 1 0 2 3
+* 3p - 14 2998 1518 1328 2823 8667 3 5 14 19 9 1 1 1 0 2 2 2 1 1 0 0 1 3 0 0 3
+* 3p - 15 2900 1501 1286 2924 8611 6 4 4 15 4 2 0 1 2 0 0 9 2 1 1 0 0 0 0 0 3
+* 3p - 16 2923 1568 1216 2959 8666 5 2 5 21 2 3 1 0 1 5 0 4 0 1 0 0 0 3 0 0 3
+* 3p - 17 2971 1474 1358 2865 8668 10 7 9 12 2 2 2 0 2 1 0 9 2 2 0 0 0 1 1 0 3
+* 3p - 18 2988 1556 1354 2773 8671 4 3 3 16 5 2 1 1 0 1 0 2 0 0 0 0 0 0 0 0 3
+* 3p - 19 2916 1591 1335 2827 8669 6 8 4 16 1 1 0 1 4 2 1 11 0 0 0 1 0 0 0 1 3
+* 3p - 20 2984 1558 1288 2840 8670 2 1 9 7 3 2 0 0 2 1 1 21 0 0 0 0 0 0 0 1 3
+* 3p - 21 2941 1400 1353 2972 8666 5 12 11 17 2 2 1 1 4 1 1 2 0 0 0 0 1 1 0 1 3
+* 3p - 22 2786 1593 1274 3006 8659 5 22 8 17 0 1 0 1 2 2 0 0 0 1 0 0 1 1 0 0 3
+* 3p - 23 2800 1577 1340 2953 8670 1 3 5 12 4 2 2 0 5 3 0 1 0 0 0 0 0 0 0 0 3
+* 3p - 24 2923 1547 1270 2928 8668 3 0 4 7 2 1 0 0 1 4 2 3 0 0 0 0 0 0 0 0 3
+* 3p - 25 2779 1627 1301 2953 8660 6 4 5 10 1 1 0 0 2 0 0 1 1 1 0 0 0 0 0 0 3
+* 3p - 26 2847 1593 1368 2839 8647 2 0 6 8 4 2 1 1 3 0 0 4 1 1 0 0 0 0 0 0 3
+* 3p - 27 2883 1551 1225 2966 8625 2 3 4 16 0 3 0 3 0 5 0 5 1 0 0 0 0 0 0 1 2
+* 3p - 28 2828 1515 1418 2856 8617 3 3 5 21 1 2 0 0 5 2 1 1 0 0 0 0 0 0 0 0 2
+* 3p - 29 2808 1602 1328 2869 8607 6 4 3 15 2 1 0 3 5 1 0 1 0 0 0 0 0 0 0 0 2
+* 3p - 30 2997 1567 1252 2769 8585 4 3 4 14 0 0 0 1 7 4 0 3 0 0 0 0 0 0 0 0 2
+* 3p - 31 2847 1565 1341 2832 8585 5 3 3 5 3 1 0 1 1 1 1 3 1 0 0 0 0 0 0 0 1
+* 3p - 32 2825 1560 1301 2887 8573 6 4 4 10 3 0 0 0 8 0 0 2 0 0 0 0 0 1 0 0 1
+* 3p - 33 2871 1613 1173 2915 8572 1 1 5 12 1 0 0 0 1 3 0 0 0 0 0 0 0 1 0 0 1
+* 3p - 34 2773 1656 1242 2894 8565 2 1 1 13 1 2 2 0 4 1 0 1 0 0 0 0 0 0 0 0 1
+* 3p - 35 2841 1573 1249 2884 8547 2 1 5 17 0 3 0 0 1 2 0 1 0 1 0 0 0 0 0 0 1
+* 3p - 36 2811 1645 1153 2924 8533 3 2 4 19 3 1 1 0 0 6 0 4 0 1 0 0 0 0 0 0 1
+* 3p - 37 2746 1655 1296 2818 8515 2 2 8 12 2 0 1 1 0 2 0 0 0 2 0 0 0 0 0 0 1
+* 3p - 38 2809 1558 1294 2837 8498 1 0 6 11 4 1 0 0 0 4 1 1 0 4 0 0 0 0 0 0 1
+* 3p - 39 2814 1577 1163 2917 8471 0 0 3 11 1 0 0 0 2 1 1 0 0 0 0 0 0 0 0 0 1
+* 3p - 40 2738 1610 1233 2864 8445 3 0 8 6 2 1 0 1 0 0 0 4 0 0 0 0 0 0 0 0 1
+* 3p - 41 2820 1552 1230 2813 8415 4 0 3 13 2 1 0 1 0 2 1 2 0 0 0 0 0 0 0 0 1
+* 3p - 42 2899 1472 1217 2794 8382 6 1 4 7 0 2 1 0 0 2 1 0 0 0 0 0 0 0 0 0 1
+* 3p - 43 2790 1576 1245 2742 8353 1 1 5 8 0 1 0 1 0 1 2 2 0 0 0 0 0 0 0 0 1
+* 3p - 44 2807 1425 1321 2769 8322 0 3 11 8 3 4 0 0 0 2 0 3 0 0 0 0 0 1 0 0 1
+* 3p - 45 2775 1428 1282 2801 8286 2 3 8 7 0 0 0 1 0 1 1 3 0 0 0 0 1 0 0 0 1
+* 3p - 46 2660 1564 1236 2768 8228 3 1 6 12 1 1 0 0 3 0 0 3 0 0 0 0 0 0 0 0 1
+* 3p - 47 2746 1567 1204 2652 8169 2 2 8 9 1 1 0 0 1 1 1 0 0 0 0 0 0 0 0 0 1
+* 3p - 48 2671 1563 1084 2790 8108 1 0 5 7 0 1 1 0 1 1 1 2 0 0 0 0 0 0 0 0 1
+* 3p - 49 2511 1588 1169 2782 8050 4 3 7 14 1 1 0 0 1 0 0 3 0 1 0 0 0 0 0 0 1
+* 3p - 50 2745 1495 1133 2606 7979 1 1 3 5 0 0 0 0 2 2 0 0 0 1 0 0 0 0 0 0 1
+* 3p - 51 2568 1503 1196 2652 7919 5 4 1 12 0 0 0 0 0 0 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 52 2515 1494 1231 2617 7857 0 1 3 12 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 53 2570 1433 1179 2605 7787 0 1 7 2 1 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
+* 3p - 54 2543 1423 1133 2614 7713 1 2 5 9 3 0 1 0 1 4 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 55 2479 1478 1120 2565 7642 1 2 3 7 0 1 0 0 0 2 1 1 1 1 0 0 0 0 0 0 0
+* 3p - 56 2484 1402 1082 2563 7531 1 0 4 6 0 1 0 0 0 6 0 2 0 0 0 0 0 0 0 0 0
+* 3p - 57 2449 1294 1093 2596 7432 2 1 4 7 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 0
+* 3p - 58 2381 1376 1098 2455 7310 2 2 4 5 0 1 0 0 0 2 1 0 0 0 0 0 0 0 0 0 0
+* 3p - 59 2445 1274 1034 2462 7215 1 1 1 4 1 0 0 0 0 2 0 5 0 0 0 0 0 1 0 0 0
+* 3p - 60 2452 1284 1010 2353 7099 0 1 3 4 2 2 0 0 0 1 0 2 1 0 0 0 0 1 0 0 0
+* 3p - 61 2415 1255 1042 2297 7009 0 1 2 6 1 0 0 0 0 2 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 62 2358 1206 1007 2345 6916 1 4 5 7 0 1 0 0 0 3 0 0 0 0 0 1 0 0 0 0 0
+* 3p - 63 2334 1200 993 2295 6822 0 0 6 9 0 1 0 0 1 0 1 2 0 0 0 0 0 0 0 0 0
+* 3p - 64 2280 1149 996 2280 6705 0 2 1 6 0 0 0 0 1 2 0 1 1 0 0 0 0 0 0 0 0
+* 3p - 65 2253 1191 931 2231 6606 3 1 2 5 0 0 0 1 1 0 0 0 1 0 0 0 0 0 0 0 0
+* 3p - 66 2183 1201 930 2183 6497 3 0 4 11 2 0 1 0 0 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 67 2175 1134 979 2079 6367 0 2 0 6 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 0 0
+* 3p - 68 2227 1126 835 2081 6269 0 0 4 7 0 0 0 0 3 0 0 1 0 0 0 0 0 0 0 0 0
+* 3p - 69 2096 1095 883 2078 6152 2 4 5 3 4 2 1 0 4 1 0 1 0 1 0 0 0 0 0 0 0
+* 3p - 70 1954 1077 879 2142 6052 0 1 4 4 0 0 1 0 2 0 0 1 0 0 0 0 0 0 0 0 0
+* 5p + 1 3968 1234 1777 1546 8525 4 7 7 8 3 3 0 1 1 1 1 1 0 0 0 0 0 0 0 0 6
+* 5p + 2 2699 901 1587 3338 8525 2 5 14 7 0 2 0 0 4 1 0 1 0 0 0 0 0 0 0 0 4
+* 5p + 3 3039 1294 1282 2909 8524 0 3 14 8 2 1 0 0 1 2 0 2 0 0 0 0 0 0 0 0 4
+* 5p + 4 2959 1242 1367 2956 8524 1 8 4 14 3 4 2 3 3 2 0 2 0 0 0 0 1 0 0 0 3
+* 5p + 5 3073 1210 1377 2863 8523 1 6 6 14 3 1 0 1 6 1 0 2 1 1 0 0 0 0 2 0 3
+* 5p + 6 3029 1117 1351 3026 8523 1 8 5 10 1 4 0 1 1 1 0 1 0 0 0 0 1 0 0 1 3
+* 5p + 7 3030 1201 1494 2800 8525 1 2 7 9 1 1 0 2 4 0 0 2 1 1 0 0 0 0 0 0 3
+* 5p + 8 2650 1293 1396 3186 8525 3 6 4 10 2 1 0 0 0 3 0 2 0 0 0 0 0 0 0 0 3
+* 5p + 9 2699 1293 1373 3158 8523 3 1 2 12 1 0 0 3 3 3 0 3 0 1 0 1 0 1 1 0 3
+* 5p + 10 2796 1312 1531 2885 8524 0 0 5 13 1 1 0 4 3 3 1 3 1 0 0 0 0 0 1 0 3
+* 5p + 11 2961 1168 1436 2960 8525 3 1 5 7 2 2 0 4 2 4 1 4 1 0 0 0 0 0 0 0 3
+* 5p + 12 2956 1197 1495 2876 8524 6 2 4 7 1 2 2 4 2 3 0 3 1 0 0 0 0 0 1 0 3
+* 5p + 13 2688 1333 1548 2956 8525 6 1 5 10 1 2 0 2 4 2 0 3 1 0 0 0 0 0 0 0 3
+* 5p + 14 2838 1284 1464 2939 8525 7 3 6 14 1 2 0 4 0 3 0 2 0 0 0 0 0 0 0 0 3
+* 5p + 15 2932 1272 1387 2919 8510 2 4 5 4 2 1 0 8 1 3 0 2 0 2 0 0 0 0 0 0 2
+* 5p + 16 2936 1221 1482 2886 8525 6 1 4 10 0 2 0 4 2 0 0 2 0 1 0 0 0 0 0 0 2
+* 5p + 17 2876 1234 1506 2908 8524 6 7 4 7 0 0 0 4 0 3 0 0 2 0 0 0 0 1 0 0 2
+* 5p + 18 2798 1281 1555 2891 8525 3 4 4 2 0 1 2 0 1 1 1 0 3 0 0 0 0 0 0 0 2
+* 5p + 19 2867 1280 1531 2847 8525 4 0 5 6 1 1 0 8 1 3 0 0 2 2 0 0 0 0 0 0 2
+* 5p + 20 2893 1234 1512 2886 8525 0 0 4 8 2 0 0 14 4 0 2 0 0 0 0 0 0 0 0 0 2
+* 5p + 21 2944 1285 1523 2772 8524 8 4 3 7 5 2 1 1 1 1 0 0 1 1 0 0 0 0 0 0 2
+* 5p + 22 2860 1292 1571 2798 8521 14 4 4 6 3 6 0 3 3 0 0 4 0 1 0 0 0 0 0 0 2
+* 5p + 23 3004 1222 1549 2746 8521 1 2 3 7 1 1 0 2 2 1 2 4 1 0 0 0 0 0 0 0 2
+* 5p + 24 2833 1205 1558 2925 8521 3 2 2 7 1 1 1 2 1 2 0 0 0 0 0 0 0 0 0 0 2
+* 5p + 25 2903 1275 1565 2778 8521 3 0 3 5 1 1 0 3 1 0 0 3 0 0 0 0 0 0 0 0 2
+* 5p + 26 2865 1255 1553 2820 8493 2 3 8 5 1 1 1 1 0 5 0 1 0 0 0 1 0 0 0 0 2
+* 5p + 27 2971 1188 1509 2814 8482 2 0 4 19 1 2 0 2 1 1 2 0 0 0 0 0 0 0 0 0 2
+* 5p + 28 2750 1361 1496 2859 8466 5 2 4 11 4 1 0 6 2 0 1 2 0 0 0 0 0 0 0 0 2
+* 5p + 29 2811 1256 1545 2842 8454 3 4 3 6 1 3 1 1 1 1 0 0 0 0 0 0 0 1 0 0 2
+* 5p + 30 2756 1250 1602 2842 8450 5 0 1 11 2 4 0 1 1 1 0 5 1 0 0 0 0 0 0 0 2
+* 5p + 31 2811 1232 1672 2726 8441 2 2 4 13 1 2 1 0 0 2 0 1 0 0 0 0 0 0 0 0 1
+* 5p + 32 2925 1280 1588 2633 8426 1 4 3 9 2 2 0 0 0 7 1 3 0 0 0 0 3 0 0 0 1
+* 5p + 33 2865 1162 1621 2770 8418 1 3 3 20 0 1 0 0 2 5 0 1 0 0 0 0 0 0 0 0 1
+* 5p + 34 2856 1228 1580 2748 8412 2 4 5 14 0 0 2 1 1 5 0 1 0 0 0 0 0 0 1 0 1
+* 5p + 35 2902 1220 1566 2712 8400 1 4 4 7 1 2 1 1 3 3 1 1 1 1 0 0 0 1 0 0 1
+* 5p + 36 2810 1168 1665 2744 8387 1 0 1 6 2 3 1 0 2 2 2 2 0 2 0 0 0 0 0 0 1
+* 5p + 37 2728 1347 1648 2649 8372 0 3 8 10 1 2 0 1 4 0 0 5 0 3 0 0 0 0 0 0 1
+* 5p + 38 2808 1246 1525 2770 8349 2 0 7 16 1 2 1 0 2 3 0 1 0 1 0 0 0 2 0 0 1
+* 5p + 39 2850 1157 1529 2775 8311 2 0 4 24 1 3 0 0 1 5 0 0 0 1 0 0 0 1 0 2 1
+* 5p + 40 2766 1200 1643 2686 8295 1 5 6 10 0 3 1 4 3 3 1 3 0 0 0 0 0 2 0 0 1
+* 5p + 41 2873 1247 1460 2698 8278 5 7 0 18 0 3 0 1 1 3 0 2 0 1 0 0 0 0 2 0 1
+* 5p + 42 2815 1191 1475 2777 8258 0 7 2 10 5 5 1 0 1 7 1 4 0 2 0 0 0 0 0 0 1
+* 5p + 43 2733 1276 1528 2694 8231 1 1 9 11 1 4 0 0 5 7 0 2 0 0 0 0 1 0 0 0 1
+* 5p + 44 2724 1289 1504 2677 8194 2 3 5 11 1 4 1 4 3 8 1 5 0 0 0 0 0 0 0 0 0
+* 5p + 45 2748 1207 1451 2742 8148 4 7 4 10 0 0 0 1 0 17 2 1 0 0 0 0 0 0 1 0 0
+* 5p + 46 2595 1213 1541 2753 8102 2 1 2 10 1 5 2 4 3 12 1 3 0 2 0 0 1 0 0 0 0
+* 5p + 47 2659 1162 1508 2720 8049 3 5 1 6 1 6 1 2 0 6 0 3 0 2 0 0 0 0 0 0 0
+* 5p + 48 2774 1074 1535 2615 7998 0 4 0 19 2 5 0 0 0 14 0 0 0 0 0 0 0 0 0 0 0
+* 5p + 49 2561 1224 1591 2578 7954 3 0 3 18 1 6 1 3 0 11 0 0 0 0 0 0 0 1 0 0 0
+* 5p + 50 2615 1149 1421 2715 7900 0 4 2 14 1 9 1 0 1 20 0 2 3 0 0 0 0 0 0 0 0
+* 5p + 51 2628 1128 1617 2468 7841 3 2 2 6 3 1 0 0 3 5 2 4 0 1 0 0 0 0 0 0 0
+* 5p + 52 2565 1215 1483 2506 7769 1 4 1 15 4 9 2 4 2 2 1 1 0 2 0 0 0 1 0 0 0
+* 5p + 53 2525 1129 1376 2661 7691 1 5 2 14 0 6 0 2 4 9 0 4 0 0 0 0 2 4 0 0 0
+* 5p + 54 2513 1061 1494 2549 7617 2 0 3 4 2 10 1 4 2 19 1 6 0 0 0 0 6 0 0 0 0
+* 5p + 55 2467 1196 1582 2288 7533 1 1 9 7 3 7 0 1 1 7 0 10 0 0 0 0 0 0 0 0 0
+* 5p + 56 2523 1121 1387 2420 7451 4 1 4 20 3 8 2 8 3 11 0 2 0 0 0 0 0 0 0 0 0
+* 5p + 57 2468 1087 1374 2431 7360 7 8 3 15 2 10 1 4 1 14 2 0 0 0 0 0 3 0 0 0 0
+* 5p + 58 2427 1130 1390 2328 7275 6 6 0 13 2 9 0 5 1 8 1 11 0 0 0 0 1 3 2 0 0
+* 5p + 59 2437 1039 1272 2432 7180 2 14 2 14 0 15 0 8 5 19 2 6 0 0 1 0 0 0 0 0 0
+* 5p + 60 2416 958 1291 2390 7055 9 1 3 3 3 9 0 6 2 18 5 6 1 5 0 0 0 0 0 0 0
+* 5p + 61 2328 1090 1236 2312 6966 8 5 2 12 3 14 0 5 5 19 3 5 1 2 1 0 1 0 0 0 0
+* 5p + 62 2265 1047 1199 2347 6858 9 2 2 21 1 14 0 4 8 12 2 3 2 1 0 2 2 1 1 0 0
+* 5p + 63 2313 952 1232 2266 6763 6 3 2 5 7 8 0 5 4 29 0 2 0 2 0 0 1 3 0 0 0
+* 5p + 64 2228 1023 1246 2178 6675 6 0 2 8 8 13 2 3 2 33 0 9 0 0 0 0 2 0 0 0 0
+* 5p + 65 2154 926 1187 2298 6565 7 4 2 6 5 16 0 5 0 23 0 6 0 0 0 0 0 2 0 0 0
+* 5p + 66 2141 919 1206 2204 6470 3 8 3 16 2 25 1 4 5 26 1 7 1 1 0 2 4 0 0 0 0
+* 5p + 67 1988 1040 1186 2169 6383 2 8 4 20 1 13 1 9 7 18 2 3 0 0 1 0 3 1 0 3 0
+* 5p + 68 2070 911 1122 2179 6282 4 11 6 19 2 7 4 7 4 17 2 3 0 0 2 0 4 1 0 0 0
+* 5p + 69 2105 895 1119 2040 6159 8 10 0 9 2 9 0 10 6 17 2 2 0 4 4 0 2 0 0 0 0
+* 5p + 70 2068 956 1156 1894 6074 7 4 2 13 0 13 7 5 3 13 3 4 0 4 2 0 6 1 0 1 0
+* 5p - 1 3947 1782 1421 1521 8671 4 1 10 7 3 5 0 3 1 3 1 3 0 0 0 0 0 0 0 0 3
+* 5p - 2 2753 1310 1301 3307 8671 2 7 16 13 1 10 1 0 0 4 0 0 0 0 0 0 0 0 0 0 3
+* 5p - 3 3139 1479 988 3065 8671 1 4 7 5 1 5 0 1 2 8 0 1 0 0 0 0 0 0 0 0 3
+* 5p - 4 2958 1401 1091 3221 8671 3 1 9 13 3 4 0 2 1 3 0 1 0 0 0 0 0 0 0 0 2
+* 5p - 5 3029 1405 1119 3116 8669 3 1 12 7 1 4 0 0 2 5 2 1 1 0 0 0 1 0 0 1 2
+* 5p - 6 2871 1438 1085 3275 8669 0 2 8 6 0 2 0 0 1 3 0 1 0 0 0 0 1 1 0 0 2
+* 5p - 7 3003 1414 1271 2980 8668 2 3 15 2 1 2 0 0 0 5 0 0 0 0 0 0 1 0 0 2 2
+* 5p - 8 2683 1488 1092 3408 8671 0 2 5 12 2 0 1 0 0 3 0 2 0 0 0 1 0 0 0 0 2
+* 5p - 9 2828 1486 1122 3234 8670 0 4 12 10 0 0 0 1 1 1 1 1 0 0 0 0 0 0 1 0 2
+* 5p - 10 2776 1615 1303 2976 8670 4 6 9 6 1 1 0 0 1 1 0 2 0 0 0 1 0 1 0 0 2
+* 5p - 11 2914 1663 1121 2972 8670 3 2 5 5 0 1 0 0 5 3 0 3 1 0 0 0 0 1 0 0 2
+* 5p - 12 2879 1568 1207 3016 8670 4 0 4 12 2 1 0 0 1 0 2 2 0 1 0 0 0 1 0 0 2
+* 5p - 13 2909 1646 1219 2895 8669 3 9 11 9 1 1 0 1 2 5 1 1 1 0 0 0 0 1 0 0 2
+* 5p - 14 2944 1600 1119 3007 8670 0 2 5 11 1 1 0 0 2 2 0 1 0 0 0 0 1 0 0 0 2
+* 5p - 15 2893 1544 1230 2987 8654 2 3 10 8 4 1 0 0 5 0 1 2 0 0 0 0 0 0 0 0 1
+* 5p - 16 2923 1483 1240 3025 8671 0 5 5 6 1 6 2 0 1 6 0 2 0 0 0 0 0 0 0 0 1
+* 5p - 17 2885 1544 1322 2920 8671 2 0 1 6 1 2 0 0 0 1 2 1 0 1 0 0 0 0 0 0 1
+* 5p - 18 3019 1527 1169 2956 8671 1 6 8 5 0 3 0 2 1 0 0 1 0 0 0 0 0 0 0 0 1
+* 5p - 19 2868 1613 1300 2890 8671 3 3 6 11 0 2 2 0 2 2 1 2 0 0 0 0 0 0 0 0 1
+* 5p - 20 2879 1570 1224 2998 8671 1 5 6 4 0 0 0 1 2 2 0 0 0 1 0 0 0 0 0 0 1
+* 5p - 21 2990 1518 1235 2928 8671 0 4 4 10 0 0 0 0 2 1 0 5 0 0 0 0 0 0 0 0 1
+* 5p - 22 2838 1607 1258 2968 8671 2 2 10 8 1 2 1 0 1 4 0 2 0 1 0 0 0 0 0 0 1
+* 5p - 23 2865 1611 1193 3001 8670 6 5 11 7 2 1 0 1 2 2 0 4 1 0 0 0 0 0 0 0 1
+* 5p - 24 3023 1545 1193 2909 8670 8 2 8 14 0 0 0 0 1 4 0 2 0 0 0 0 0 0 0 0 1
+* 5p - 25 2977 1671 1207 2815 8670 1 6 7 9 1 3 0 2 0 3 2 1 0 0 0 0 0 0 0 0 1
+* 5p - 26 2865 1607 1260 2916 8648 3 4 10 11 1 3 1 0 3 1 0 2 0 0 0 0 0 0 0 0 1
+* 5p - 27 2788 1537 1275 3028 8628 1 3 5 8 3 3 0 0 2 2 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 28 2756 1619 1218 3024 8617 3 2 8 8 3 1 2 2 0 1 0 3 0 0 0 0 0 0 0 0 0
+* 5p - 29 2795 1594 1228 2989 8606 3 4 9 12 0 3 1 0 5 1 1 7 0 0 1 0 1 0 0 0 0
+* 5p - 30 2813 1645 1210 2927 8595 1 0 6 10 1 0 0 1 2 2 0 2 0 1 0 0 1 1 0 0 0
+* 5p - 31 2805 1676 1270 2833 8584 1 4 6 7 0 0 1 0 2 0 1 6 1 1 0 0 0 1 0 0 0
+* 5p - 32 2884 1587 1242 2862 8575 3 6 10 9 2 2 1 1 2 3 1 0 3 1 0 0 0 0 0 0 0
+* 5p - 33 2942 1582 1185 2864 8573 3 7 3 15 5 2 0 0 3 2 0 5 0 0 0 0 0 0 0 0 0
+* 5p - 34 2974 1587 1190 2811 8562 1 4 7 8 2 0 0 2 2 2 1 6 0 0 0 0 0 3 0 0 0
+* 5p - 35 2849 1623 1287 2785 8544 1 2 3 8 1 1 0 1 0 4 1 8 0 0 0 0 2 2 0 0 0
+* 5p - 36 2886 1602 1212 2830 8530 4 3 2 15 2 3 0 4 3 0 1 1 0 0 0 0 0 1 2 0 0
+* 5p - 37 2774 1564 1386 2790 8514 0 7 6 10 1 0 1 2 1 2 0 1 0 1 1 0 1 0 0 0 0
+* 5p - 38 2790 1539 1235 2932 8496 2 4 1 9 5 2 0 2 4 5 0 5 0 0 0 0 0 2 0 0 0
+* 5p - 39 2808 1571 1299 2791 8469 4 5 10 14 1 4 0 2 3 3 1 6 0 0 0 0 0 2 0 0 0
+* 5p - 40 2736 1517 1289 2902 8444 0 9 4 7 1 0 0 0 4 4 0 5 0 0 0 0 1 0 0 0 0
+* 5p - 41 2710 1614 1256 2832 8412 4 3 3 6 1 1 2 0 1 10 0 3 0 1 0 0 3 0 0 0 0
+* 5p - 42 2746 1545 1247 2844 8382 7 6 8 7 2 1 0 0 1 0 0 6 0 0 0 0 0 0 0 0 0
+* 5p - 43 2695 1581 1240 2837 8353 2 6 0 14 3 3 1 1 3 2 0 4 0 0 0 0 1 0 0 0 0
+* 5p - 44 2715 1553 1230 2823 8321 7 14 4 12 4 6 0 5 4 9 2 4 1 0 0 0 0 2 0 0 0
+* 5p - 45 2672 1516 1256 2843 8287 4 8 4 8 3 9 1 6 6 6 0 5 1 2 0 0 0 0 0 0 0
+* 5p - 46 2463 1564 1337 2862 8226 2 7 2 12 1 8 1 1 2 11 1 11 1 1 0 0 0 2 0 0 0
+* 5p - 47 2721 1472 1224 2752 8169 8 8 4 15 1 4 1 2 2 9 0 4 0 0 0 0 0 0 0 0 0
+* 5p - 48 2722 1457 1225 2700 8104 4 5 5 10 3 8 1 3 1 3 2 2 0 0 0 0 3 0 0 1 0
+* 5p - 49 2648 1503 1222 2674 8047 5 3 4 9 4 7 0 4 7 14 1 4 0 0 0 0 3 0 0 0 0
+* 5p - 50 2578 1608 1203 2587 7976 7 1 5 12 1 4 2 0 3 10 1 6 1 2 0 0 2 1 0 0 0
+* 5p - 51 2635 1438 1171 2671 7915 4 4 7 10 5 8 2 1 4 9 4 3 0 0 0 0 3 0 0 1 0
+* 5p - 52 2586 1542 1121 2606 7855 5 2 5 9 0 2 0 0 7 15 0 1 0 2 0 0 0 1 0 0 0
+* 5p - 53 2473 1570 1182 2557 7782 10 5 3 16 2 4 0 8 2 21 0 2 0 0 0 0 2 0 1 2 0
+* 5p - 54 2559 1385 1206 2556 7706 6 5 3 15 1 3 0 2 5 25 4 2 0 0 0 0 2 2 1 2 0
+* 5p - 55 2451 1461 1141 2582 7635 4 6 2 9 0 12 0 2 7 32 0 0 0 0 0 0 5 2 0 0 0
+* 5p - 56 2452 1386 1122 2566 7526 9 13 2 21 4 13 0 4 10 18 0 5 0 2 0 0 2 1 2 0 0
+* 5p - 57 2525 1296 1165 2441 7427 9 4 1 11 1 26 2 4 7 11 2 7 0 4 0 0 2 2 1 0 0
+* 5p - 58 2543 1261 1100 2404 7308 13 6 0 13 2 12 2 3 6 27 3 10 0 3 0 1 2 0 0 0 0
+* 5p - 59 2479 1254 994 2484 7211 9 3 10 16 5 16 1 5 2 28 2 5 0 0 2 1 2 1 0 2 0
+* 5p - 60 2345 1361 992 2392 7090 10 0 8 7 3 12 0 6 1 28 0 5 1 2 2 2 8 2 0 0 0
+* 5p - 61 2181 1284 1078 2464 7007 2 6 4 9 1 14 3 5 7 24 3 10 0 2 0 0 2 0 0 0 0
+* 5p - 62 2338 1181 1015 2377 6911 12 7 3 17 6 12 2 3 2 38 3 1 0 3 0 0 3 0 0 2 0
+* 5p - 63 2250 1236 1043 2285 6814 5 10 3 18 1 21 0 9 1 30 1 7 0 1 0 0 6 1 0 0 0
+* 5p - 64 2272 1189 1070 2167 6698 4 8 3 14 2 13 0 5 6 32 0 9 0 2 0 0 5 2 0 0 0
+* 5p - 65 2242 1238 1012 2102 6594 7 5 7 12 4 12 0 10 4 24 1 6 0 1 0 0 9 3 0 0 0
+* 5p - 66 2138 1195 979 2177 6489 19 10 10 19 1 15 0 6 8 18 2 6 0 0 0 0 4 0 4 0 0
+* 5p - 67 2062 1267 964 2069 6362 8 3 2 8 4 13 2 4 4 21 0 4 0 0 0 0 5 0 0 0 0
+* 5p - 68 2079 1143 935 2107 6264 4 11 3 5 4 13 0 10 2 30 0 8 0 1 0 2 4 1 0 0 0
+* 5p - 69 2083 1090 931 2020 6124 7 4 4 17 15 9 0 5 3 27 1 5 0 0 1 0 0 2 0 0 0
+* 5p - 70 2049 1061 923 2017 6050 8 6 12 3 2 15 1 1 4 21 1 6 0 1 0 0 2 0 0 0 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.coverage b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.coverage
new file mode 100644
index 0000000..9766dfe
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.coverage
@@ -0,0 +1,34 @@
+# Timestamp: 2013-10-24T16:27:57.159306
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# Hits: Sum of SE, PE_1, and PE_2 hits
+# SE, PE_1, PE_2: Number of Single Ended, and Pair Ended (mate 1 and 2) hits overlapping
+# the current contig or intervals. Note that a hit may be counted multiple
+# times if it overlaps multiple intervals
+# Collapsed: Number of hits for PE pair collapsed into a single read
+# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases relative to references
+# Coverage: Average number of bases covering each position in the contig(s)/intervals(s).
+Name Sample Library Contig Size Hits SE PE_1 PE_2 Collapsed M I D Coverage
+Pi1889 * * * 228543505 15696369 7491334 95385 102601 8007049 1244426137 172821 244560 5.44502954481
+Pi1889 * * <Genome> 228543505 15696369 7491334 95385 102601 8007049 1244426137 172821 244560 5.44502954481
+#
+#
+Pi1889 Pi1889 * * 228543505 15696369 7491334 95385 102601 8007049 1244426137 172821 244560 5.44502954481
+Pi1889 Pi1889 * <Genome> 228543505 15696369 7491334 95385 102601 8007049 1244426137 172821 244560 5.44502954481
+#
+Pi1889 Pi1889 Pi1889_id_CTTGTA * 228543505 5215431 2523580 17517 20407 2653927 403753830 53394 78380 1.76663882879
+Pi1889 Pi1889 Pi1889_id_CTTGTA <Genome> 228543505 5215431 2523580 17517 20407 2653927 403753830 53394 78380 1.76663882879
+#
+Pi1889 Pi1889 Pi1889_id_GGCTAC * 228543505 5597401 2683429 24929 26356 2862687 447916759 62254 86200 1.95987525001
+Pi1889 Pi1889 Pi1889_id_GGCTAC <Genome> 228543505 5597401 2683429 24929 26356 2862687 447916759 62254 86200 1.95987525001
+#
+Pi1889 Pi1889 Pi1889_id_TAGCTT * 228543505 4883537 2284325 52939 55838 2490435 392755548 57173 79980 1.71851546602
+Pi1889 Pi1889 Pi1889_id_TAGCTT <Genome> 228543505 4883537 2284325 52939 55838 2490435 392755548 57173 79980 1.71851546602
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.depths b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.depths
new file mode 100644
index 0000000..3d9dda9
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.depths
@@ -0,0 +1,32 @@
+# Timestamp: 2013-10-24T16:36:05.541282
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing to
+# total of these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#
+Name Sample Library Contig Size MaxDepth MD_001 MD_002 MD_003 MD_004 MD_005 MD_006 MD_007 MD_008 MD_009 MD_010 MD_011 MD_012 MD_013 MD_014 MD_015 MD_016 MD_017 MD_018 MD_019 MD_020 MD_021 MD_022 MD_023 MD_024 MD_025 MD_026 MD_027 MD_028 MD_029 MD_030 MD_031 MD_032 MD_033 MD_034 MD_035 MD_036 MD_037 MD_038 MD_039 MD_040 MD_041 MD_042 [...]
+Pi1889 * * * 0 41 0.4452 0.4048 0.3787 0.3586 0.3400 0.3225 0.3052 0.2879 0.2705 0.2529 0.2351 0.2172 0.1993 0.1815 0.1641 0.1471 0.1308 0.1154 0.1009 0.0875 0.0753 0.0643 0.0546 0.0459 0.0384 0.0320 0.0265 0.0219 0.0180 0.0148 0.0122 0.0100 0.0083 0.0069 0.0057 0.0048 0.0041 0.0035 0.0030 0.0026 0.0023 0.0020 [...]
+Pi1889 * * <Genome> 0 41 0.4452 0.4048 0.3787 0.3586 0.3400 0.3225 0.3052 0.2879 0.2705 0.2529 0.2351 0.2172 0.1993 0.1815 0.1641 0.1471 0.1308 0.1154 0.1009 0.0875 0.0753 0.0643 0.0546 0.0459 0.0384 0.0320 0.0265 0.0219 0.0180 0.0148 0.0122 0.0100 0.0083 0.0069 0.0057 0.0048 0.0041 0.0035 0.0030 0.0026 0.0023 0.0020 [...]
+#
+#
+Pi1889 Pi1889 * * 0 41 0.4452 0.4048 0.3787 0.3586 0.3400 0.3225 0.3052 0.2879 0.2705 0.2529 0.2351 0.2172 0.1993 0.1815 0.1641 0.1471 0.1308 0.1154 0.1009 0.0875 0.0753 0.0643 0.0546 0.0459 0.0384 0.0320 0.0265 0.0219 0.0180 0.0148 0.0122 0.0100 0.0083 0.0069 0.0057 0.0048 0.0041 0.0035 0.0030 0.0026 0.0023 0.0020 [...]
+Pi1889 Pi1889 * <Genome> 0 41 0.4452 0.4048 0.3787 0.3586 0.3400 0.3225 0.3052 0.2879 0.2705 0.2529 0.2351 0.2172 0.1993 0.1815 0.1641 0.1471 0.1308 0.1154 0.1009 0.0875 0.0753 0.0643 0.0546 0.0459 0.0384 0.0320 0.0265 0.0219 0.0180 0.0148 0.0122 0.0100 0.0083 0.0069 0.0057 0.0048 0.0041 0.0035 0.0030 0.0026 0.0023 0.0020 [...]
+#
+Pi1889 Pi1889 Pi1889_id_CTTGTA * 0 17 0.3633 0.3078 0.2523 0.2073 0.1643 0.1279 0.0967 0.0715 0.0516 0.0366 0.0254 0.0174 0.0118 0.0080 0.0054 0.0037 0.0025 0.0018 0.0013 0.0010 0.0008 0.0006 0.0005 0.0004 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+Pi1889 Pi1889 Pi1889_id_CTTGTA <Genome> 0 17 0.3633 0.3078 0.2523 0.2073 0.1643 0.1279 0.0967 0.0715 0.0516 0.0366 0.0254 0.0174 0.0118 0.0080 0.0054 0.0037 0.0025 0.0018 0.0013 0.0010 0.0008 0.0006 0.0005 0.0004 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+#
+Pi1889 Pi1889 Pi1889_id_GGCTAC * 0 19 0.3753 0.3230 0.2699 0.2269 0.1841 0.1472 0.1143 0.0870 0.0647 0.0471 0.0337 0.0237 0.0165 0.0114 0.0079 0.0055 0.0038 0.0027 0.0020 0.0014 0.0011 0.0009 0.0007 0.0006 0.0005 0.0004 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+Pi1889 Pi1889 Pi1889_id_GGCTAC <Genome> 0 19 0.3753 0.3230 0.2699 0.2269 0.1841 0.1472 0.1143 0.0870 0.0647 0.0471 0.0337 0.0237 0.0165 0.0114 0.0079 0.0055 0.0038 0.0027 0.0020 0.0014 0.0011 0.0009 0.0007 0.0006 0.0005 0.0004 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+#
+Pi1889 Pi1889 Pi1889_id_TAGCTT * 0 17 0.3712 0.3099 0.2504 0.2030 0.1580 0.1209 0.0895 0.0648 0.0458 0.0317 0.0216 0.0145 0.0097 0.0065 0.0043 0.0029 0.0020 0.0015 0.0011 0.0008 0.0006 0.0005 0.0004 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
+Pi1889 Pi1889 Pi1889_id_TAGCTT <Genome> 0 17 0.3712 0.3099 0.2504 0.2030 0.1580 0.1209 0.0895 0.0648 0.0458 0.0317 0.0216 0.0145 0.0097 0.0065 0.0043 0.0029 0.0020 0.0015 0.0011 0.0008 0.0006 0.0005 0.0004 0.0004 0.0003 0.0003 0.0003 0.0002 0.0002 0.0002 0.0002 0.0002 0.0002 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 0.0001 [...]
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/3pGtoA_freq.txt
new file mode 100644
index 0000000..26c25f4
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.00792426579560234
+2 0.0102056515718755
+3 0.00805673126902799
+4 0.00573466161820387
+5 0.00597109987659727
+6 0.0051661918328585
+7 0.00533596837944664
+8 0.00458849235251275
+9 0.00480296497995613
+10 0.00421239330451169
+11 0.00425673674859343
+12 0.00466545289030496
+13 0.00353982300884956
+14 0.00313210783399828
+15 0.00358693600151029
+16 0.00399745955841148
+17 0.00457820474332032
+18 0.00359820089955022
+19 0.00390770375883885
+20 0.00381424974077914
+21 0.00383724022643517
+22 0.00376548555936288
+23 0.00335946248600224
+24 0.00330848798296319
+25 0.0039849709666401
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/5pCtoT_freq.txt
new file mode 100644
index 0000000..99c2956
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.00434832360682
+2 0.007567454725745
+3 0.00535815002820079
+4 0.00422978200354289
+5 0.00440133987691828
+6 0.00426439232409382
+7 0.00365796098174953
+8 0.00324235730064847
+9 0.00381241440574453
+10 0.00355654548067251
+11 0.00352681851579721
+12 0.00397942711266284
+13 0.00307602167862897
+14 0.00342478230772033
+15 0.00335347877358491
+16 0.00380061394532963
+17 0.00360625574977001
+18 0.00366341030195382
+19 0.00360758328731824
+20 0.00317249520436771
+21 0.00331462578628197
+22 0.00390668190026905
+23 0.00406654343807763
+24 0.00263365815117198
+25 0.00304289743209144
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..ec22e9d
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Length_plot.pdf
new file mode 100644
index 0000000..75236f9
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Runtime_log.txt
new file mode 100644
index 0000000..5ebf151
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-24 15:07:37,742 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1889_id_CTTGTA' -i - -d /home/mischu/scratch/bam_pipeline/89ca5df2-c86d-4a50-a3aa-3b5a1d1b5ba4 -r 000_prefixes/Pi_nucl.fasta --downsample 100000
+2013-10-24 15:09:25,013 DEBUG main: BAM read in 109.754742 seconds
+2013-10-24 15:09:25,826 INFO main: Successful run
+2013-10-24 15:09:25,826 DEBUG main: Run completed in 110.568290 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..98637a3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.0797264089342923,0.0719613206608906
+"2",2,0.0306223260447826,0.023420942000563
+"3",3,0.0150592748488285,0.0110778472191001
+"4",4,0.00862033966989843,0.0065266899393183
+"5",5,0.00559785278660617,0.00433989177214494
+"6",6,0.00397184925756031,0.00310751713644133
+"7",7,0.00296089920443026,0.00236882481659723
+"8",8,0.00228359244279009,0.0018831934885618
+"9",9,0.0018145278971573,0.00153241736684503
+"10",10,0.00146958874403339,0.0012770322432345
+"11",11,0.00122960564500815,0.00106322088020238
+"12",12,0.00107432786119677,0.000867387647534221
+"13",-12,0.000862719050158577,0.00107818422836329
+"14",-11,0.0010123117399304,0.00127530305372429
+"15",-10,0.00119645419200459,0.0015407352621676
+"16",-9,0.00145932699501107,0.00187738488589531
+"17",-8,0.00183237882046418,0.00232613157287246
+"18",-7,0.00234075216268973,0.00298314182822147
+"19",-6,0.00309101953883051,0.00398373126452267
+"20",-5,0.00430968435870788,0.00561971184705729
+"21",-4,0.006532876913818,0.00861221911618487
+"22",-3,0.0110012398139291,0.0151100139532769
+"23",-2,0.0222912388681986,0.0314203549375075
+"24",-1,0.063851719692041,0.0858750109440791
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..c8b0a61
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..42f44d3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0307675987622533,3.64000747201146e-06,0.125539003290109,0.94089482050281,0.23014181712885,-5592.23925194029
+"Std.",0.000228622373656015,3.65430651648515e-06,0.219919278993131,0.123552364190705,0.00304727073597524,1.6476329636231
+"Acceptance ratio",0.19276,0.1787,0.23344,0.14514,0.19276,0.66608
+"0%",0.0299335541610133,2.01553563261446e-09,2.16460301158405e-05,0.0142047917756347,0.218236239523716,-5602.38667086705
+"2.5%",0.0303265391267829,9.841767773933e-08,3.23330474660986e-05,0.535834460847276,0.224220821733263,-5596.29164995786
+"5%",0.0303965224822254,1.99273358593323e-07,0.000121808161430401,0.671409446195198,0.225191581190517,-5595.42892908928
+"7.5%",0.0304404242145742,2.75908616346075e-07,0.000206882201311237,0.752001521995955,0.22579913158644,-5594.87849254501
+"10%",0.0304760690205227,3.82458654009558e-07,0.000290203592828547,0.805710159910309,0.226213369417274,-5594.48942857669
+"12.5%",0.0305110696193036,4.98470977720514e-07,0.000571016494952753,0.844955129462369,0.226669836500641,-5594.15263479988
+"15%",0.0305392061988847,5.87019463182924e-07,0.000590660317341667,0.875928633353708,0.226972547490384,-5593.89368165128
+"17.5%",0.0305570499171715,7.13571107244045e-07,0.000668380057399285,0.901355309897942,0.227241179946909,-5593.65120826563
+"20%",0.0305748245960631,8.3105190497348e-07,0.000756121831655225,0.920517977061133,0.227548307397738,-5593.45499294176
+"22.5%",0.0305948010250572,9.61877725436628e-07,0.000963299090845911,0.933221796339751,0.227761542306625,-5593.2826286295
+"25%",0.0306130063778337,1.078396864226e-06,0.00111927813214005,0.947792276978062,0.22805072224532,-5593.10384338078
+"27.5%",0.0306284806348928,1.20455464012925e-06,0.00130260593451736,0.9592104657371,0.228282638083664,-5592.95152785741
+"30%",0.0306454353017327,1.33077043111214e-06,0.00174824380216085,0.966440306466556,0.228518377403081,-5592.80692899637
+"32.5%",0.0306611339739537,1.43458814714646e-06,0.00225247191358638,0.973557608623197,0.228735911009569,-5592.67569425
+"35%",0.0306757205485069,1.58042212605839e-06,0.00321994782977357,0.980071444679222,0.228950823949639,-5592.54879473494
+"37.5%",0.0306920525161066,1.69589455617463e-06,0.00461142199954145,0.985456011249196,0.229158658014662,-5592.43081741746
+"40%",0.0307062821289279,1.80915028134946e-06,0.00571531627240692,0.988221937665653,0.229365876621922,-5592.31679824062
+"42.5%",0.0307215984181147,1.95596340740603e-06,0.00827688563189316,0.990342214724375,0.229561657948405,-5592.20198691771
+"45%",0.0307358113996207,2.11659790800682e-06,0.0103887692765082,0.992090019205747,0.229737658915546,-5592.09550098677
+"47.5%",0.03075212904881,2.28120534314018e-06,0.0130976925313217,0.993767231559973,0.229952619061455,-5592.00216974167
+"50%",0.0307657987023219,2.48404223662086e-06,0.0158055172730743,0.995457150944822,0.230145316906325,-5591.90172442002
+"52.5%",0.0307789982474753,2.68452313643726e-06,0.0201369554140009,0.996407077622258,0.230306982829053,-5591.80380981184
+"55%",0.0307933482094035,2.89386753030994e-06,0.0251891273361784,0.997194024343722,0.230485901560505,-5591.71344070824
+"57.5%",0.0308101147163631,3.11795542685734e-06,0.0309989263169072,0.997835531897563,0.230670450178863,-5591.62172530931
+"60%",0.0308247708105714,3.33484669898906e-06,0.0387410250296887,0.998394808687207,0.230873303406774,-5591.53749088281
+"62.5%",0.0308395755801875,3.57419110188965e-06,0.0465164314021797,0.998676053712034,0.231118612353805,-5591.4487966862
+"65%",0.0308533185583761,3.81420384896806e-06,0.0581681916255348,0.998751860740727,0.231369279913685,-5591.36214318211
+"67.5%",0.0308685719065837,4.04935457420849e-06,0.071224390749324,0.998833681331178,0.231576740442473,-5591.27250721254
+"70%",0.0308838277411926,4.33685104986565e-06,0.0893016217184419,0.998954465674908,0.231785910352873,-5591.19085353198
+"72.5%",0.0309016229042858,4.64955044397707e-06,0.112237911489696,0.999222992250115,0.232012294819283,-5591.10292583938
+"75%",0.0309197775935492,5.06168516122073e-06,0.13671498761574,0.999396682194613,0.232235219393487,-5591.01988491389
+"77.5%",0.030941031952352,5.47211301113921e-06,0.168033100561413,0.999574791137844,0.232472723563385,-5590.93808361988
+"80%",0.0309604478658832,5.94890317130806e-06,0.205010928934632,0.999685687855907,0.23269863295317,-5590.85371384655
+"82.5%",0.030981219633522,6.37730098903119e-06,0.251639317568055,0.999770718653225,0.233039342206315,-5590.76071207708
+"85%",0.031005242119724,6.89000012192111e-06,0.305894762365115,0.999782248951224,0.23332997236024,-5590.66928514007
+"87.5%",0.0310311004919976,7.60936599395482e-06,0.375233900721273,0.999855332636154,0.233682187431463,-5590.57051639647
+"90%",0.0310630470785648,8.38464865545381e-06,0.45683700994485,0.999856785323536,0.234080692389398,-5590.4716816571
+"92.5%",0.0310968323935027,9.44626605444087e-06,0.561826490618011,0.999864498818163,0.234493358199214,-5590.3552720511
+"95%",0.0311459121001003,1.07883715883818e-05,0.68988541544079,0.999877091272578,0.23510365942241,-5590.21190528637
+"97.5%",0.0312138488199264,1.35070771687872e-05,0.828718552786093,0.99995031932328,0.236096508509822,-5590.01275083771
+"100%",0.031669713495139,3.35216766686489e-05,0.999483073394731,0.99995200828286,0.241408557049251,-5589.54047251031
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..8caf504
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..2a416af
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/dnacomp.txt
new file mode 100644
index 0000000..6e260bb
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 7271 8035 7517 7258 30081
+* 3p + -69 7358 8138 7764 7664 30924
+* 3p + -68 7740 8333 8046 7599 31718
+* 3p + -67 7811 8643 8275 7807 32536
+* 3p + -66 8057 8815 8415 8060 33347
+* 3p + -65 8255 9056 8664 8156 34131
+* 3p + -64 8465 9146 8954 8346 34911
+* 3p + -63 8637 9567 8923 8645 35772
+* 3p + -62 8752 9835 9236 8747 36570
+* 3p + -61 9026 10123 9343 8927 37419
+* 3p + -60 9242 10266 9557 9191 38256
+* 3p + -59 9421 10274 9994 9346 39035
+* 3p + -58 9654 10610 10109 9499 39872
+* 3p + -57 9745 11022 10180 9714 40661
+* 3p + -56 9841 10965 10633 9989 41428
+* 3p + -55 10103 11341 10645 10016 42105
+* 3p + -54 10359 11476 10756 10243 42834
+* 3p + -53 10440 11628 11235 10231 43534
+* 3p + -52 10792 11848 11024 10516 44180
+* 3p + -51 10690 12077 11343 10704 44814
+* 3p + -50 10818 12184 11519 10856 45377
+* 3p + -49 10860 12248 11844 11014 45966
+* 3p + -48 11126 12540 11614 11192 46472
+* 3p + -47 11217 12603 12258 10905 46983
+* 3p + -46 11397 12749 12114 11186 47446
+* 3p + -45 11515 12787 12396 11138 47836
+* 3p + -44 11505 12948 12392 11342 48187
+* 3p + -43 11606 13006 12573 11328 48513
+* 3p + -42 11694 13129 12501 11476 48800
+* 3p + -41 11680 13326 12646 11396 49048
+* 3p + -40 11673 13044 12853 11661 49231
+* 3p + -39 11812 13280 12714 11584 49390
+* 3p + -38 11870 13327 12900 11448 49545
+* 3p + -37 11926 13331 12900 11484 49641
+* 3p + -36 11836 13204 13081 11607 49728
+* 3p + -35 11852 13389 13066 11495 49802
+* 3p + -34 11802 13316 13130 11604 49852
+* 3p + -33 11862 13506 13009 11519 49896
+* 3p + -32 11871 13371 13048 11638 49928
+* 3p + -31 12115 13027 13280 11531 49953
+* 3p + -30 12082 13287 12941 11655 49965
+* 3p + -29 12054 13276 13180 11471 49981
+* 3p + -28 12161 13244 13193 11394 49992
+* 3p + -27 12275 13300 13011 11418 50004
+* 3p + -26 11994 13251 13417 11351 50013
+* 3p + -25 12178 13174 13274 11389 50015
+* 3p + -24 12006 13337 13217 11463 50023
+* 3p + -23 12199 13147 13416 11261 50023
+* 3p + -22 11980 13315 13349 11373 50017
+* 3p + -21 11998 13540 13191 11297 50026
+* 3p + -20 11817 13324 13636 11248 50025
+* 3p + -19 12198 13171 13531 11129 50029
+* 3p + -18 12024 13253 13430 11319 50026
+* 3p + -17 12011 13426 13315 11276 50028
+* 3p + -16 12087 13194 13522 11224 50027
+* 3p + -15 11992 13268 13303 11179 49742
+* 3p + -14 12076 13322 13513 11117 50028
+* 3p + -13 11907 13262 13635 11225 50029
+* 3p + -12 11842 13417 13300 11468 50027
+* 3p + -11 11837 13478 13593 11117 50025
+* 3p + -10 12351 13331 13600 10745 50027
+* 3p + -9 13733 12627 13221 10447 50028
+* 3p + -8 13543 11971 13833 10682 50029
+* 3p + -7 12509 13405 12810 11305 50029
+* 3p + -6 13019 12550 13166 11294 50029
+* 3p + -5 12311 13362 12594 11762 50029
+* 3p + -4 12121 12190 13731 11986 50028
+* 3p + -3 12886 11129 13486 12528 50029
+* 3p + -2 14558 14446 9823 11202 50029
+* 3p + -1 6405 12495 16549 14580 50029
+* 3p + 1 4042 19658 4201 22120 50021
+* 3p + 2 9892 14291 14171 11667 50021
+* 3p + 3 11794 12980 13301 11945 50020
+* 3p + 4 12123 12833 12907 12155 50018
+* 3p + 5 12164 12665 13210 11977 50016
+* 3p + 6 12328 12579 13138 11969 50014
+* 3p + 7 12194 13026 12647 12147 50014
+* 3p + 8 12084 12874 13094 11962 50014
+* 3p + 9 12087 12736 13122 12068 50013
+* 3p + 10 12172 12826 12776 12239 50013
+* 3p - -70 7219 8081 7588 7050 29938
+* 3p - -69 7410 8184 7707 7455 30756
+* 3p - -68 7619 8317 8014 7631 31581
+* 3p - -67 7812 8729 8210 7632 32383
+* 3p - -66 7859 8978 8283 8094 33214
+* 3p - -65 8171 9123 8692 8043 34029
+* 3p - -64 8251 9248 8892 8503 34894
+* 3p - -63 8531 9536 9056 8611 35734
+* 3p - -62 8757 9693 9350 8738 36538
+* 3p - -61 8998 10035 9335 8952 37320
+* 3p - -60 9062 10114 9686 9318 38180
+* 3p - -59 9276 10339 9940 9442 38997
+* 3p - -58 9567 10655 10115 9458 39795
+* 3p - -57 9926 10755 10173 9728 40582
+* 3p - -56 9837 11199 10531 9759 41326
+* 3p - -55 10118 11210 10802 9932 42062
+* 3p - -54 10285 11331 10875 10256 42747
+* 3p - -53 10478 11731 11006 10216 43431
+* 3p - -52 10654 11784 11247 10400 44085
+* 3p - -51 10703 11986 11423 10587 44699
+* 3p - -50 11011 12217 11538 10562 45328
+* 3p - -49 11017 12038 11890 10906 45851
+* 3p - -48 11224 12381 11796 10990 46391
+* 3p - -47 11229 12602 12135 10917 46883
+* 3p - -46 11308 12642 12226 11181 47357
+* 3p - -45 11339 12874 12292 11276 47781
+* 3p - -44 11549 12925 12490 11190 48154
+* 3p - -43 11533 12975 12576 11378 48462
+* 3p - -42 11512 13126 12490 11613 48741
+* 3p - -41 11820 13081 12685 11409 48995
+* 3p - -40 11780 13107 12635 11679 49201
+* 3p - -39 11591 13264 12804 11706 49365
+* 3p - -38 11698 13605 12733 11476 49512
+* 3p - -37 11884 13202 13031 11493 49610
+* 3p - -36 11848 13502 12866 11484 49700
+* 3p - -35 11861 13328 13213 11367 49769
+* 3p - -34 11854 13357 13142 11462 49815
+* 3p - -33 11886 13396 13072 11506 49860
+* 3p - -32 12094 13335 12940 11510 49879
+* 3p - -31 12050 13322 13197 11330 49899
+* 3p - -30 11931 13369 13160 11451 49911
+* 3p - -29 12124 13191 13117 11490 49922
+* 3p - -28 12102 13185 13316 11323 49926
+* 3p - -27 12066 13357 13004 11517 49944
+* 3p - -26 12184 13190 13301 11283 49958
+* 3p - -25 12161 13178 13186 11425 49950
+* 3p - -24 11905 13423 13235 11398 49961
+* 3p - -23 11977 13183 13498 11308 49966
+* 3p - -22 12159 13146 13346 11300 49951
+* 3p - -21 11973 13310 13282 11401 49966
+* 3p - -20 11876 13402 13474 11218 49970
+* 3p - -19 12027 13181 13455 11303 49966
+* 3p - -18 11988 13326 13353 11302 49969
+* 3p - -17 12041 13300 13436 11193 49970
+* 3p - -16 12141 13135 13355 11337 49968
+* 3p - -15 12237 13109 13321 11042 49709
+* 3p - -14 12142 13259 13459 11110 49970
+* 3p - -13 12073 13018 13581 11297 49969
+* 3p - -12 11954 13294 13141 11579 49968
+* 3p - -11 11879 13377 13507 11206 49969
+* 3p - -10 12432 13266 13589 10683 49970
+* 3p - -9 13633 12677 13337 10322 49969
+* 3p - -8 13498 12023 13762 10687 49970
+* 3p - -7 12499 13374 12626 11471 49970
+* 3p - -6 12761 12395 13259 11555 49970
+* 3p - -5 12426 13296 12597 11652 49971
+* 3p - -4 12280 12010 13592 12087 49969
+* 3p - -3 12602 11417 13420 12531 49970
+* 3p - -2 14358 14708 9703 11202 49971
+* 3p - -1 6266 12424 16496 14785 49971
+* 3p - 1 4050 19685 4197 22035 49967
+* 3p - 2 9969 14390 14048 11560 49967
+* 3p - 3 11658 12936 13409 11963 49966
+* 3p - 4 12308 12818 12881 11956 49963
+* 3p - 5 12130 12828 13167 11837 49962
+* 3p - 6 12304 12411 13376 11870 49961
+* 3p - 7 12065 13148 12556 12191 49960
+* 3p - 8 12192 12775 13042 11950 49959
+* 3p - 9 12259 12656 13012 12032 49959
+* 3p - 10 12155 13087 12464 12251 49957
+* 5p + -10 12309 12822 12901 11986 50018
+* 5p + -9 12109 13214 12329 12366 50018
+* 5p + -8 11735 13153 12891 12241 50020
+* 5p + -7 12405 12671 13021 11925 50022
+* 5p + -6 12050 13106 12469 12397 50022
+* 5p + -5 11762 13383 12799 12079 50023
+* 5p + -4 12018 12870 12987 12149 50024
+* 5p + -3 12337 13489 12750 11450 50026
+* 5p + -2 11771 14039 14770 9446 50026
+* 5p + -1 23640 2805 21286 2296 50027
+* 5p + 1 15774 17367 11043 5845 50029
+* 5p + 2 10573 9591 14965 14900 50029
+* 5p + 3 12918 14216 10635 12260 50029
+* 5p + 4 12484 13977 11573 11983 50017
+* 5p + 5 11760 12986 12919 12364 50029
+* 5p + 6 11738 13466 12082 12743 50029
+* 5p + 7 12231 12866 12943 11989 50029
+* 5p + 8 10906 14201 11579 13343 50029
+* 5p + 9 10587 13584 12426 13432 50029
+* 5p + 10 11211 13979 12973 11866 50029
+* 5p + 11 11485 13738 13358 11448 50029
+* 5p + 12 11793 13315 13262 11659 50029
+* 5p + 13 11814 13677 12840 11698 50029
+* 5p + 14 11338 13894 13050 11747 50029
+* 5p + 15 11506 13704 13072 11729 50011
+* 5p + 16 11686 13706 12755 11882 50029
+* 5p + 17 11485 13828 13051 11665 50029
+* 5p + 18 11752 13635 12975 11667 50029
+* 5p + 19 11891 13572 12866 11700 50029
+* 5p + 20 11548 13772 13101 11608 50029
+* 5p + 21 11911 13455 13096 11566 50028
+* 5p + 22 11706 13694 12788 11840 50028
+* 5p + 23 11522 13721 12981 11802 50026
+* 5p + 24 12006 13407 12988 11625 50026
+* 5p + 25 11807 13655 12767 11797 50026
+* 5p + 26 11661 13396 13201 11755 50013
+* 5p + 27 12031 13409 13009 11555 50004
+* 5p + 28 11820 13449 12933 11791 49993
+* 5p + 29 11614 13620 13107 11642 49983
+* 5p + 30 11907 13382 12981 11701 49971
+* 5p + 31 11836 13436 12940 11741 49953
+* 5p + 32 11669 13260 13345 11654 49928
+* 5p + 33 11796 13382 13118 11601 49897
+* 5p + 34 12119 13423 12771 11540 49853
+* 5p + 35 11785 13407 13199 11410 49801
+* 5p + 36 11911 13376 13072 11371 49730
+* 5p + 37 11989 13296 12833 11524 49642
+* 5p + 38 11887 13322 12895 11442 49546
+* 5p + 39 11993 12793 13193 11413 49392
+* 5p + 40 11933 13210 12740 11349 49232
+* 5p + 41 11770 13160 12844 11274 49048
+* 5p + 42 11887 12763 12788 11362 48800
+* 5p + 43 11667 13099 12559 11189 48514
+* 5p + 44 11756 12731 12576 11124 48187
+* 5p + 45 11765 12471 12618 10982 47836
+* 5p + 46 11602 12610 12150 11083 47445
+* 5p + 47 11430 12342 12390 10822 46984
+* 5p + 48 11478 12030 12180 10784 46472
+* 5p + 49 11309 12085 11778 10793 45965
+* 5p + 50 10956 11984 11926 10511 45377
+* 5p + 51 11092 11492 11777 10453 44814
+* 5p + 52 10914 11563 11459 10243 44179
+* 5p + 53 10689 11372 11370 10102 43533
+* 5p + 54 10466 11113 11272 9983 42834
+* 5p + 55 10264 10989 10995 9858 42106
+* 5p + 56 10113 10808 10903 9605 41429
+* 5p + 57 10241 10401 10633 9386 40661
+* 5p + 58 9731 10401 10308 9432 39872
+* 5p + 59 9542 10070 10372 9050 39034
+* 5p + 60 9494 9818 9982 8962 38256
+* 5p + 61 9198 9678 9765 8778 37419
+* 5p + 62 8963 9517 9576 8513 36569
+* 5p + 63 8820 9169 9312 8471 35772
+* 5p + 64 8638 9029 9031 8213 34911
+* 5p + 65 8290 8756 9087 8001 34134
+* 5p + 66 8469 8495 8705 7679 33348
+* 5p + 67 8021 8397 8408 7710 32536
+* 5p + 68 7901 8126 8393 7298 31718
+* 5p + 69 7744 7821 8127 7183 30875
+* 5p + 70 7435 7803 7750 7093 30081
+* 5p - -10 12370 12660 12806 12118 49954
+* 5p - -9 11859 13170 12540 12385 49954
+* 5p - -8 11859 13109 12788 12201 49957
+* 5p - -7 12201 12550 13179 12027 49957
+* 5p - -6 12089 13239 12385 12248 49961
+* 5p - -5 12013 13159 12835 11957 49964
+* 5p - -4 11861 12989 13117 11998 49965
+* 5p - -3 12168 13704 12604 11489 49965
+* 5p - -2 11745 13838 14723 9659 49965
+* 5p - -1 23841 2913 20957 2255 49966
+* 5p - 1 15591 17556 11074 5750 49971
+* 5p - 2 10535 9602 14821 15010 49968
+* 5p - 3 12863 14159 10611 12335 49968
+* 5p - 4 12351 13819 11716 12075 49961
+* 5p - 5 11936 12856 12864 12313 49969
+* 5p - 6 11744 13484 12059 12684 49971
+* 5p - 7 12100 12777 13219 11875 49971
+* 5p - 8 11117 14114 11510 13229 49970
+* 5p - 9 10691 13675 12278 13327 49971
+* 5p - 10 11097 14055 12849 11970 49971
+* 5p - 11 11597 13688 13356 11330 49971
+* 5p - 12 11811 13516 13117 11527 49971
+* 5p - 13 11693 13879 12677 11722 49971
+* 5p - 14 11723 13833 12877 11538 49971
+* 5p - 15 11633 13680 12971 11663 49947
+* 5p - 16 11530 13863 12850 11728 49971
+* 5p - 17 11505 13592 13259 11615 49971
+* 5p - 18 11527 13588 13059 11797 49971
+* 5p - 19 11647 13840 12915 11569 49971
+* 5p - 20 11684 13558 13264 11464 49970
+* 5p - 21 11864 13307 13241 11557 49969
+* 5p - 22 11948 13645 12856 11519 49968
+* 5p - 23 11853 13593 12860 11660 49966
+* 5p - 24 11763 13494 12974 11732 49963
+* 5p - 25 11640 13542 12868 11913 49963
+* 5p - 26 11601 13522 13193 11643 49959
+* 5p - 27 11986 13241 12869 11850 49946
+* 5p - 28 11781 13565 12727 11855 49928
+* 5p - 29 11675 13455 13030 11761 49921
+* 5p - 30 11785 13439 13093 11598 49915
+* 5p - 31 11877 13396 13165 11461 49899
+* 5p - 32 11780 13361 13089 11650 49880
+* 5p - 33 11918 13286 13220 11436 49860
+* 5p - 34 11908 13381 13035 11493 49817
+* 5p - 35 11719 13256 13336 11456 49767
+* 5p - 36 12086 13349 12861 11405 49701
+* 5p - 37 11830 13368 12965 11446 49609
+* 5p - 38 11776 13158 13052 11524 49510
+* 5p - 39 11919 13019 12897 11530 49365
+* 5p - 40 12038 12902 12988 11273 49201
+* 5p - 41 11863 12926 12872 11335 48996
+* 5p - 42 11943 12755 12928 11115 48741
+* 5p - 43 11985 12860 12546 11072 48463
+* 5p - 44 11776 12720 12814 10845 48155
+* 5p - 45 11721 12568 12457 11035 47781
+* 5p - 46 11424 12373 12410 11149 47356
+* 5p - 47 11376 12323 12322 10861 46882
+* 5p - 48 11197 12161 12294 10739 46391
+* 5p - 49 11123 12098 11946 10684 45851
+* 5p - 50 11051 11722 11819 10738 45330
+* 5p - 51 11144 11632 11650 10273 44699
+* 5p - 52 10721 11693 11492 10180 44086
+* 5p - 53 10560 11408 11266 10197 43431
+* 5p - 54 10497 11050 11182 10019 42748
+* 5p - 55 10325 10929 10998 9809 42061
+* 5p - 56 10114 10819 10882 9514 41329
+* 5p - 57 9905 10536 10719 9423 40583
+* 5p - 58 9843 10374 10340 9238 39795
+* 5p - 59 9653 10018 10307 9019 38997
+* 5p - 60 9480 9853 9944 8903 38180
+* 5p - 61 9163 9735 9700 8722 37320
+* 5p - 62 8952 9439 9515 8632 36538
+* 5p - 63 8888 9173 9340 8333 35734
+* 5p - 64 8670 8908 9119 8197 34894
+* 5p - 65 8223 8862 8862 8085 34032
+* 5p - 66 8257 8442 8761 7754 33214
+* 5p - 67 8031 8241 8514 7597 32383
+* 5p - 68 7877 8013 8290 7401 31581
+* 5p - 69 7685 7826 7982 7218 30711
+* 5p - 70 7390 7682 7795 7071 29938
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/dnacomp_genome.csv
new file mode 100644
index 0000000..2fc7659
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.245290724081,0.254598401178,0.255055853499,0.245055021242
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/lgdistribution.txt
new file mode 100644
index 0000000..8c557c9
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/lgdistribution.txt
@@ -0,0 +1,325 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 25 7
++ 26 2
++ 27 4
++ 28 9
++ 29 9
++ 30 12
++ 31 17
++ 32 23
++ 33 40
++ 34 44
++ 35 65
++ 36 85
++ 37 90
++ 38 149
++ 39 157
++ 40 179
++ 41 241
++ 42 280
++ 43 321
++ 44 344
++ 45 392
++ 46 455
++ 47 504
++ 48 505
++ 49 583
++ 50 562
++ 51 632
++ 52 639
++ 53 707
++ 54 720
++ 55 673
++ 56 767
++ 57 786
++ 58 834
++ 59 774
++ 60 833
++ 61 843
++ 62 802
++ 63 850
++ 64 782
++ 65 782
++ 66 808
++ 67 813
++ 68 781
++ 69 858
++ 70 851
++ 71 804
++ 72 793
++ 73 755
++ 74 780
++ 75 727
++ 76 729
++ 77 726
++ 78 682
++ 79 717
++ 80 680
++ 81 734
++ 82 690
++ 83 710
++ 84 646
++ 85 635
++ 86 660
++ 87 641
++ 88 613
++ 89 567
++ 90 626
++ 91 591
++ 92 757
++ 93 1045
++ 94 5525
++ 95 331
++ 96 292
++ 97 254
++ 98 334
++ 99 215
++ 100 231
++ 101 220
++ 102 215
++ 103 219
++ 104 205
++ 105 181
++ 106 184
++ 107 198
++ 108 176
++ 109 173
++ 110 149
++ 111 152
++ 112 151
++ 113 137
++ 114 155
++ 115 150
++ 116 159
++ 117 135
++ 118 122
++ 119 116
++ 120 108
++ 121 110
++ 122 116
++ 123 117
++ 124 97
++ 125 92
++ 126 110
++ 127 99
++ 128 86
++ 129 78
++ 130 64
++ 131 74
++ 132 72
++ 133 54
++ 134 51
++ 135 60
++ 136 54
++ 137 70
++ 138 41
++ 139 38
++ 140 49
++ 141 37
++ 142 36
++ 143 40
++ 144 34
++ 145 38
++ 146 36
++ 147 30
++ 148 22
++ 149 34
++ 150 36
++ 151 31
++ 152 26
++ 153 24
++ 154 24
++ 155 23
++ 156 13
++ 157 21
++ 158 19
++ 159 13
++ 160 10
++ 161 12
++ 162 10
++ 163 15
++ 164 10
++ 165 12
++ 166 8
++ 167 11
++ 168 7
++ 169 7
++ 170 12
++ 171 7
++ 172 4
++ 173 6
++ 174 6
++ 175 7
++ 176 6
++ 177 10
++ 178 6
++ 179 5
++ 180 3
++ 181 2
++ 182 2
++ 183 5
++ 184 2
++ 185 4
+- 25 2
+- 26 4
+- 27 8
+- 28 3
+- 29 2
+- 30 13
+- 31 16
+- 32 13
+- 33 36
+- 34 45
+- 35 64
+- 36 85
+- 37 93
+- 38 138
+- 39 161
+- 40 197
+- 41 252
+- 42 264
+- 43 305
+- 44 371
+- 45 421
+- 46 474
+- 47 488
+- 48 531
+- 49 525
+- 50 624
+- 51 612
+- 52 650
+- 53 675
+- 54 690
+- 55 728
+- 56 735
+- 57 792
+- 58 796
+- 59 816
+- 60 858
+- 61 781
+- 62 810
+- 63 832
+- 64 856
+- 65 814
+- 66 831
+- 67 796
+- 68 807
+- 69 845
+- 70 772
+- 71 789
+- 72 771
+- 73 743
+- 74 757
+- 75 777
+- 76 751
+- 77 723
+- 78 688
+- 79 738
+- 80 699
+- 81 713
+- 82 707
+- 83 683
+- 84 719
+- 85 646
+- 86 574
+- 87 639
+- 88 650
+- 89 614
+- 90 594
+- 91 685
+- 92 760
+- 93 917
+- 94 5535
+- 95 318
+- 96 261
+- 97 228
+- 98 335
+- 99 249
+- 100 243
+- 101 217
+- 102 219
+- 103 195
+- 104 204
+- 105 198
+- 106 199
+- 107 176
+- 108 176
+- 109 173
+- 110 178
+- 111 138
+- 112 171
+- 113 154
+- 114 133
+- 115 147
+- 116 143
+- 117 128
+- 118 117
+- 119 120
+- 120 102
+- 121 102
+- 122 111
+- 123 99
+- 124 91
+- 125 80
+- 126 80
+- 127 89
+- 128 88
+- 129 68
+- 130 70
+- 131 81
+- 132 57
+- 133 78
+- 134 72
+- 135 59
+- 136 60
+- 137 59
+- 138 63
+- 139 41
+- 140 63
+- 141 36
+- 142 42
+- 143 43
+- 144 31
+- 145 38
+- 146 28
+- 147 30
+- 148 29
+- 149 21
+- 150 20
+- 151 24
+- 152 16
+- 153 19
+- 154 12
+- 155 21
+- 156 19
+- 157 11
+- 158 8
+- 159 12
+- 160 14
+- 161 11
+- 162 13
+- 163 16
+- 164 9
+- 165 14
+- 166 5
+- 167 18
+- 168 12
+- 169 17
+- 170 12
+- 171 7
+- 172 3
+- 173 3
+- 174 6
+- 175 4
+- 176 6
+- 177 6
+- 178 7
+- 179 6
+- 180 3
+- 182 2
+- 183 1
+- 184 4
+- 185 3
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/misincorporation.txt
new file mode 100644
index 0000000..b89d704
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_CTTGTA/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 6213 12410 16554 14845 50022 130 63 78 221 36 30 55 91 70 114 34 45 0 0 0 0 0 0 0 0 13
+* 3p + 2 14423 14363 9816 11420 50022 109 53 88 201 30 25 43 80 42 87 26 32 0 0 0 0 0 0 0 0 12
+* 3p + 3 12803 10969 13501 12750 50023 110 62 67 233 31 28 35 32 57 65 24 42 0 0 0 0 0 0 0 0 12
+* 3p + 4 12154 12059 13650 12159 50022 67 61 119 221 20 31 33 29 31 40 12 25 0 0 0 0 0 0 0 1 12
+* 3p + 5 12316 13196 12543 11966 50021 72 38 91 223 21 31 33 29 40 36 21 23 0 0 1 0 0 3 1 1 12
+* 3p + 6 13033 12414 13115 11451 50013 66 44 102 192 19 16 18 26 23 27 12 20 1 0 1 0 5 2 2 4 12
+* 3p + 7 12549 13208 12734 11527 50018 70 43 122 242 21 15 19 21 31 25 12 17 3 2 5 2 3 0 5 1 12
+* 3p + 8 13599 11821 13749 10854 50023 55 42 125 192 18 14 20 14 28 29 13 21 1 1 0 1 0 2 1 1 11
+* 3p + 9 13766 12468 13149 10635 50018 54 40 105 209 25 16 18 26 25 30 9 16 0 4 3 3 3 4 2 0 11
+* 3p + 10 12386 13144 13548 10941 50019 59 32 101 210 17 19 13 10 25 28 13 13 0 3 3 3 2 1 2 2 11
+* 3p + 11 11882 13314 13541 11281 50018 46 52 89 204 24 11 15 18 22 15 16 16 3 7 3 5 3 1 2 1 11
+* 3p + 12 11815 13279 13270 11660 50024 69 41 85 202 14 7 18 25 27 29 15 23 2 5 10 0 1 1 0 1 11
+* 3p + 13 11936 13119 13585 11384 50024 53 45 95 190 20 14 18 23 25 20 12 19 4 4 2 1 1 1 1 2 9
+* 3p + 14 12140 13177 13440 11264 50021 45 38 119 181 22 15 16 20 27 21 20 25 3 2 3 2 1 4 1 1 9
+* 3p + 15 12036 13127 13244 11332 49739 58 34 109 179 15 21 10 18 25 33 15 16 2 2 3 4 0 0 1 2 9
+* 3p + 16 12112 13025 13461 11424 50022 54 39 84 200 15 22 14 18 38 24 10 7 2 2 1 7 0 2 2 1 8
+* 3p + 17 12027 13260 13279 11458 50024 74 39 105 211 20 12 14 14 21 30 7 26 3 4 4 3 4 0 0 0 8
+* 3p + 18 12093 13084 13368 11469 50014 43 47 109 194 19 12 15 17 18 18 20 21 2 4 2 3 4 2 2 4 8
+* 3p + 19 12207 13033 13476 11302 50018 59 45 94 197 16 14 13 22 25 18 14 19 3 3 7 1 3 2 3 3 8
+* 3p + 20 11874 13179 13585 11380 50018 51 44 105 189 19 16 12 12 23 21 8 19 2 0 2 5 2 2 2 1 8
+* 3p + 21 12023 13377 13136 11483 50019 60 28 92 182 17 17 13 12 25 26 8 10 3 4 4 6 4 0 1 2 8
+* 3p + 22 12025 13186 13277 11523 50011 45 46 108 175 13 10 10 21 23 22 6 14 3 5 4 6 2 1 1 2 7
+* 3p + 23 12238 13021 13350 11408 50017 44 40 98 181 21 13 16 15 25 16 8 21 2 5 3 4 5 1 0 0 7
+* 3p + 24 12064 13194 13142 11616 50016 44 49 87 178 23 17 9 13 26 19 12 12 3 6 5 2 2 2 2 1 7
+* 3p + 25 12230 13028 13239 11513 50010 57 48 105 182 23 15 12 11 21 19 11 14 5 3 7 3 2 2 1 0 7
+* 3p + 26 12083 13139 13339 11448 50009 49 52 115 156 23 25 8 11 22 12 12 18 1 3 4 3 0 1 2 1 7
+* 3p + 27 12305 13163 12966 11564 49998 49 47 94 178 21 12 9 11 18 21 8 21 4 3 5 3 1 2 3 1 7
+* 3p + 28 12223 13100 13129 11534 49986 52 43 95 181 13 24 16 18 26 25 7 17 4 6 2 3 2 3 0 2 7
+* 3p + 29 12117 13131 13106 11619 49973 36 47 107 188 23 21 19 13 30 27 12 27 4 3 2 5 3 0 1 4 7
+* 3p + 30 12138 13155 12866 11799 49958 40 36 99 167 16 17 12 12 19 23 10 17 4 5 1 3 4 1 3 0 6
+* 3p + 31 12156 12874 13214 11702 49946 58 50 104 200 18 16 14 16 17 21 5 15 3 4 0 4 1 2 2 2 6
+* 3p + 32 11905 13226 13022 11766 49919 49 52 90 188 19 16 6 13 13 23 8 16 2 2 5 3 0 4 2 3 6
+* 3p + 33 11902 13401 12938 11642 49883 39 50 103 165 14 18 14 26 19 19 15 14 2 5 2 2 1 7 1 4 6
+* 3p + 34 11872 13186 13045 11742 49845 37 40 95 175 16 16 14 11 22 14 5 10 0 3 1 2 5 0 2 0 6
+* 3p + 35 11899 13225 13015 11657 49796 32 43 82 180 29 15 8 10 24 30 14 16 2 4 2 2 3 0 3 1 5
+* 3p + 36 11885 13099 13021 11715 49720 46 50 104 154 17 19 6 13 16 22 10 18 1 4 2 2 5 2 0 2 4
+* 3p + 37 11965 13168 12860 11641 49634 45 37 98 193 18 14 10 17 18 22 10 20 2 3 5 1 2 4 1 0 4
+* 3p + 38 11924 13149 12834 11632 49539 51 26 97 180 16 11 10 15 26 20 7 14 4 5 2 1 1 2 2 0 4
+* 3p + 39 11828 13175 12652 11727 49382 38 39 78 174 21 18 13 14 20 24 3 12 1 2 6 2 6 0 0 1 4
+* 3p + 40 11745 12938 12762 11774 49219 38 65 105 172 14 14 10 16 23 14 7 17 3 4 1 5 3 2 3 3 4
+* 3p + 41 11736 13161 12600 11541 49038 47 42 107 196 16 14 7 17 11 12 12 10 2 5 2 1 5 4 1 2 4
+* 3p + 42 11741 12983 12441 11631 48796 41 50 79 181 18 20 9 14 25 19 11 11 2 3 3 2 1 3 1 1 4
+* 3p + 43 11642 12880 12546 11436 48504 32 38 91 169 12 13 7 18 13 16 7 24 1 0 2 3 3 3 2 2 4
+* 3p + 44 11531 12838 12360 11455 48184 48 37 88 156 15 17 7 13 11 27 10 20 1 2 2 3 1 2 0 1 4
+* 3p + 45 11581 12680 12335 11236 47832 34 51 95 147 19 18 7 15 19 20 8 21 1 0 2 1 0 3 1 4 3
+* 3p + 46 11471 12628 12051 11292 47442 41 44 93 128 24 11 11 12 26 19 9 19 4 4 3 5 0 1 5 1 2
+* 3p + 47 11262 12530 12193 10997 46982 32 50 90 136 14 9 9 11 19 14 10 23 3 7 5 2 1 1 2 1 1
+* 3p + 48 11185 12469 11558 11256 46468 33 48 88 116 11 16 6 10 25 12 7 10 2 1 2 1 1 6 1 3 1
+* 3p + 49 10919 12158 11798 11094 45969 42 51 88 124 17 16 14 10 14 15 8 17 2 3 4 0 0 2 0 3 1
+* 3p + 50 10861 12117 11466 10940 45384 26 49 89 145 11 19 8 15 14 17 13 17 4 1 2 2 3 0 0 1 0
+* 3p + 51 10745 11971 11287 10814 44817 39 40 85 152 10 18 10 11 20 12 5 18 1 1 3 2 3 0 0 0 0
+* 3p + 52 10854 11738 10965 10625 44182 34 40 88 146 14 16 10 10 19 18 12 15 3 3 3 1 1 0 1 1 0
+* 3p + 53 10504 11526 11187 10320 43537 30 21 83 124 12 16 3 16 11 16 8 26 1 3 1 4 0 1 3 0 0
+* 3p + 54 10402 11392 10682 10353 42829 27 46 99 118 7 5 8 4 12 29 11 9 4 3 2 2 4 1 3 1 0
+* 3p + 55 10151 11233 10602 10113 42099 26 38 76 121 7 13 9 8 13 10 7 14 3 5 0 2 3 2 6 0 0
+* 3p + 56 9908 10920 10587 10015 41430 26 51 83 115 14 16 4 12 17 7 7 12 1 0 4 1 1 2 0 0 0
+* 3p + 57 9794 10941 10145 9776 40656 32 46 69 103 19 18 8 13 21 15 14 12 2 5 0 3 0 2 4 2 0
+* 3p + 58 9693 10521 10061 9596 39871 24 39 64 122 17 20 12 10 14 13 6 18 2 4 1 2 2 0 2 2 0
+* 3p + 59 9480 10194 9950 9407 39031 36 36 72 110 16 11 4 10 15 15 10 11 2 1 0 1 1 2 4 1 0
+* 3p + 60 9286 10174 9494 9306 38260 26 29 70 118 15 12 13 12 15 17 5 14 4 1 1 0 1 1 1 2 0
+* 3p + 61 9061 10078 9321 8960 37420 29 48 73 100 16 11 7 12 14 13 5 13 2 3 0 3 1 4 2 2 0
+* 3p + 62 8810 9762 9193 8812 36577 33 33 77 103 15 12 5 10 8 10 2 14 0 2 0 0 1 6 0 1 0
+* 3p + 63 8659 9522 8882 8713 35776 37 42 61 99 9 14 7 8 15 9 6 14 2 3 0 2 1 0 1 0 0
+* 3p + 64 8511 9071 8948 8383 34913 36 49 66 93 14 10 6 8 12 12 6 20 0 3 2 2 2 3 1 3 0
+* 3p + 65 8315 8995 8616 8200 34126 19 30 66 85 18 16 5 11 10 11 5 10 2 2 1 2 5 4 1 1 0
+* 3p + 66 8072 8760 8388 8128 33348 34 36 53 95 6 13 8 4 17 17 5 16 5 1 2 3 4 1 3 2 0
+* 3p + 67 7836 8598 8255 7855 32544 26 42 57 84 9 4 5 12 10 16 3 17 8 3 1 3 2 2 0 2 0
+* 3p + 68 7759 8308 8010 7648 31725 18 31 43 75 16 4 5 11 12 11 9 12 3 2 2 0 1 4 0 3 0
+* 3p + 69 7382 8074 7757 7719 30932 17 27 43 79 7 7 4 5 7 14 8 14 3 6 1 2 2 2 1 0 0
+* 3p + 70 7289 8006 7505 7294 30094 30 31 49 74 13 7 5 6 11 10 2 14 3 1 3 2 2 0 0 0 0
+* 3p - 1 6082 12381 16509 14995 49967 132 65 87 188 35 40 51 110 77 104 44 54 0 0 0 0 0 0 0 0 20
+* 3p - 2 14266 14626 9683 11393 49968 90 59 79 190 26 24 34 62 48 68 20 32 0 0 0 0 0 0 0 0 19
+* 3p - 3 12533 11251 13433 12747 49964 107 51 73 238 20 33 27 38 53 47 23 37 0 0 0 0 1 0 1 1 19
+* 3p - 4 12266 11841 13553 12297 49957 89 45 105 226 23 20 23 29 33 41 14 21 1 1 0 0 4 4 0 1 17
+* 3p - 5 12414 13149 12578 11818 49959 78 47 85 205 30 16 24 28 23 32 10 29 1 0 1 0 3 2 1 4 17
+* 3p - 6 12781 12237 13210 11729 49957 70 43 103 200 26 16 17 22 37 29 9 31 1 6 0 0 4 3 3 2 16
+* 3p - 7 12531 13184 12566 11671 49952 65 48 98 234 23 26 25 16 29 30 12 20 2 8 2 1 6 2 6 3 16
+* 3p - 8 13519 11875 13711 10855 49960 71 43 113 209 16 13 19 19 21 23 8 24 5 1 3 2 6 1 2 1 16
+* 3p - 9 13659 12529 13293 10481 49962 73 38 106 183 19 21 20 11 22 33 6 28 3 5 3 1 3 1 3 0 16
+* 3p - 10 12476 13098 13515 10872 49961 55 34 111 208 21 17 15 25 27 23 13 15 5 2 3 4 3 3 0 3 16
+* 3p - 11 11899 13244 13475 11341 49959 69 44 93 175 19 18 20 15 23 22 14 21 6 2 4 6 1 1 7 1 15
+* 3p - 12 12003 13118 13094 11745 49960 54 45 96 214 27 16 13 21 29 16 11 24 4 4 6 6 2 2 2 2 14
+* 3p - 13 12106 12878 13535 11446 49965 43 41 81 186 22 12 16 25 23 20 13 19 3 2 8 4 0 2 1 1 14
+* 3p - 14 12201 13070 13379 11312 49962 39 35 103 210 24 15 14 12 30 21 14 20 2 8 4 4 1 1 5 1 14
+* 3p - 15 12307 12928 13241 11222 49698 37 42 104 219 23 19 13 16 20 23 12 21 5 0 4 5 5 2 3 1 14
+* 3p - 16 12168 12977 13306 11505 49956 53 43 97 190 21 15 9 18 30 29 6 17 1 2 5 3 6 4 1 1 14
+* 3p - 17 12088 13114 13369 11390 49961 48 33 110 228 16 17 8 21 23 25 13 24 2 3 3 3 4 0 1 4 14
+* 3p - 18 12009 13167 13312 11470 49958 53 33 97 198 19 11 18 15 14 21 10 19 1 5 1 2 5 1 2 3 13
+* 3p - 19 12090 13020 13394 11456 49960 46 37 99 179 27 16 11 16 27 23 16 23 3 2 0 2 2 2 1 1 12
+* 3p - 20 11906 13248 13419 11388 49961 52 43 105 190 20 16 9 24 25 22 12 13 9 3 4 2 3 2 4 0 12
+* 3p - 21 12082 13123 13185 11567 49957 41 44 125 209 25 24 13 16 22 18 14 17 4 7 4 6 2 3 1 3 11
+* 3p - 22 12205 13031 13280 11428 49944 55 54 103 171 25 13 16 14 25 22 11 15 4 5 7 4 3 1 0 3 11
+* 3p - 23 12041 13041 13440 11437 49959 46 44 101 173 24 21 8 17 19 19 10 12 4 3 4 3 0 2 0 5 11
+* 3p - 24 11955 13285 13154 11552 49946 43 44 103 178 15 15 12 15 28 26 9 16 3 0 0 2 3 4 4 4 10
+* 3p - 25 12206 13029 13110 11600 49945 48 43 113 194 19 13 11 21 21 30 8 19 1 3 1 1 1 0 1 3 10
+* 3p - 26 12211 13070 13252 11417 49950 46 50 91 189 17 14 14 20 23 19 8 20 1 2 1 3 2 1 3 2 10
+* 3p - 27 12091 13211 12950 11685 49937 48 38 88 171 17 14 4 14 25 17 6 15 1 3 1 1 2 1 2 2 10
+* 3p - 28 12170 13051 13244 11456 49921 49 36 105 178 12 11 10 11 21 12 7 22 2 2 0 1 1 1 0 3 10
+* 3p - 29 12212 13012 13047 11648 49919 42 41 111 197 26 17 13 16 26 19 16 21 2 1 2 2 0 2 1 0 10
+* 3p - 30 11980 13205 13127 11592 49904 55 34 94 182 25 11 10 14 18 14 7 20 3 0 1 2 0 3 3 1 9
+* 3p - 31 12117 13212 13104 11462 49895 44 48 116 167 17 17 11 14 30 21 4 23 3 5 0 4 3 0 1 0 9
+* 3p - 32 12165 13186 12880 11643 49874 41 33 109 170 22 18 8 15 25 20 10 23 2 1 2 2 1 2 2 0 9
+* 3p - 33 11932 13280 13003 11642 49857 39 42 97 157 14 18 12 19 17 20 6 13 1 3 1 4 1 0 0 2 9
+* 3p - 34 11906 13243 13070 11588 49807 43 50 101 167 24 14 10 14 21 24 10 15 4 3 2 2 3 0 1 4 9
+* 3p - 35 11916 13221 13162 11465 49764 43 38 101 137 15 10 7 18 18 18 4 17 1 5 3 4 1 1 2 1 8
+* 3p - 36 11845 13392 12832 11621 49690 47 47 78 156 12 10 7 10 22 22 4 23 2 2 5 2 3 2 1 4 8
+* 3p - 37 11943 13081 12964 11618 49606 51 45 109 168 20 20 8 19 17 22 10 14 4 4 1 0 2 2 1 0 8
+* 3p - 38 11751 13465 12690 11594 49500 48 48 92 171 16 12 6 7 15 18 10 19 1 3 4 0 1 6 3 2 8
+* 3p - 39 11649 13141 12761 11807 49358 45 52 93 157 20 17 10 11 13 24 10 12 0 0 2 4 3 2 1 2 8
+* 3p - 40 11824 12978 12567 11819 49188 41 35 98 153 20 14 9 13 20 22 13 18 1 4 1 4 5 4 4 0 8
+* 3p - 41 11894 12967 12631 11495 48987 34 54 94 157 21 18 10 13 17 15 10 14 5 3 2 3 3 1 3 1 8
+* 3p - 42 11586 13002 12428 11720 48736 38 50 107 163 18 17 11 10 14 15 8 24 2 4 1 1 0 4 3 1 8
+* 3p - 43 11603 12876 12474 11502 48455 38 60 118 167 16 15 10 12 25 26 3 18 4 1 2 4 3 3 3 1 8
+* 3p - 44 11592 12835 12449 11272 48148 34 48 85 151 16 13 10 19 20 12 9 20 3 2 1 1 1 3 1 2 8
+* 3p - 45 11399 12784 12226 11370 47779 30 50 91 143 19 16 9 19 21 21 13 24 2 1 3 1 1 0 1 1 8
+* 3p - 46 11359 12520 12175 11303 47357 41 44 86 154 18 19 14 12 14 16 6 18 4 4 4 1 2 0 2 0 8
+* 3p - 47 11305 12516 12086 10976 46883 35 60 85 140 15 20 7 16 19 20 9 22 2 1 6 1 0 0 1 1 8
+* 3p - 48 11253 12297 11767 11069 46386 43 63 94 133 18 16 5 10 17 23 8 24 0 4 0 2 1 3 1 1 8
+* 3p - 49 11055 11967 11820 11011 45853 37 40 89 135 16 11 9 11 16 24 5 16 3 1 2 0 1 0 0 1 8
+* 3p - 50 11083 12110 11468 10668 45329 33 38 90 118 14 10 8 6 15 15 6 15 0 1 1 3 0 0 0 1 7
+* 3p - 51 10770 11862 11385 10680 44697 39 51 97 163 15 11 5 11 12 18 4 24 1 1 1 3 0 1 3 0 7
+* 3p - 52 10672 11699 11218 10491 44080 39 41 72 134 10 12 4 6 22 23 11 13 1 2 1 1 1 4 0 1 7
+* 3p - 53 10513 11613 10977 10327 43430 36 37 87 131 10 16 5 10 15 28 8 21 3 6 2 2 0 3 0 2 6
+* 3p - 54 10345 11234 10799 10368 42746 24 39 90 124 18 11 6 14 21 16 5 16 1 2 1 1 2 4 1 0 6
+* 3p - 55 10156 11128 10779 9996 42059 36 37 58 120 12 11 3 9 13 13 6 22 2 1 5 5 2 2 0 1 6
+* 3p - 56 9861 11127 10494 9842 41324 35 37 69 130 10 12 14 8 11 13 9 13 3 2 5 1 2 0 3 1 5
+* 3p - 57 9951 10653 10152 9835 40591 29 40 70 135 18 12 8 14 10 21 16 20 1 3 2 4 0 2 1 0 5
+* 3p - 58 9602 10577 10083 9534 39796 42 45 77 121 7 16 8 12 15 21 7 7 6 0 1 1 1 4 2 1 5
+* 3p - 59 9296 10279 9919 9509 39003 37 35 56 102 14 10 8 16 11 13 5 22 2 1 0 1 2 1 2 0 5
+* 3p - 60 9110 10043 9641 9392 38186 25 43 76 121 9 11 12 9 13 13 7 15 3 0 3 1 1 2 0 1 3
+* 3p - 61 9072 9965 9269 9014 37320 19 38 73 107 12 13 5 11 16 11 4 19 1 1 1 1 1 5 0 2 3
+* 3p - 62 8783 9606 9323 8830 36542 33 29 56 112 15 7 9 8 10 10 7 10 0 2 0 0 1 1 4 0 3
+* 3p - 63 8559 9450 9046 8678 35733 23 33 60 94 12 14 2 5 11 12 7 11 0 3 1 2 1 4 2 2 3
+* 3p - 64 8248 9201 8872 8572 34893 21 44 43 109 9 9 8 14 14 17 8 20 2 2 1 2 2 4 1 2 3
+* 3p - 65 8206 9084 8651 8089 34030 25 36 63 75 15 6 11 13 14 8 10 12 3 2 4 1 5 1 0 2 3
+* 3p - 66 7916 8915 8252 8136 33219 22 35 58 82 10 4 8 7 11 12 6 16 2 3 1 2 1 3 1 1 3
+* 3p - 67 7820 8664 8212 7691 32387 23 34 49 108 8 13 8 5 4 16 6 14 1 1 0 3 1 0 2 1 3
+* 3p - 68 7651 8257 7980 7704 31592 24 28 43 94 14 13 8 10 4 15 6 9 1 1 0 0 1 0 2 1 3
+* 3p - 69 7446 8117 7686 7515 30764 30 32 67 84 11 8 6 5 7 11 12 6 1 0 1 0 2 1 1 0 2
+* 3p - 70 7258 8045 7556 7082 29941 22 29 48 57 11 10 8 10 16 9 6 8 1 1 1 1 0 2 0 1 2
+* 5p + 1 15855 17391 10959 5822 50027 47 72 129 56 23 17 18 25 17 15 11 22 0 0 0 0 0 0 0 0 12
+* 5p + 2 10631 9598 14891 14907 50027 33 77 95 86 12 23 18 15 17 23 4 22 1 4 0 1 0 0 0 0 11
+* 5p + 3 12976 14194 10570 12283 50023 36 76 91 97 25 14 18 20 22 15 13 18 0 1 4 1 0 2 1 1 10
+* 5p + 4 12488 13898 11538 12086 50010 30 45 54 146 13 12 12 30 24 16 6 20 1 1 4 1 0 3 1 2 10
+* 5p + 5 11786 12898 12883 12451 50018 43 57 76 150 10 18 18 12 15 20 11 23 2 3 1 3 4 2 2 2 10
+* 5p + 6 11813 13358 12007 12840 50018 26 51 101 148 17 17 4 14 22 16 11 21 4 4 4 4 2 1 4 4 9
+* 5p + 7 12324 12771 12845 12083 50023 32 40 128 128 15 21 4 15 19 24 7 13 3 3 4 4 2 2 2 0 9
+* 5p + 8 10992 14073 11525 13431 50021 37 47 105 149 22 25 5 8 15 18 10 19 2 6 3 2 1 3 2 2 9
+* 5p + 9 10673 13444 12340 13565 50022 29 55 106 191 20 16 10 13 25 13 8 24 5 2 5 1 1 3 1 2 9
+* 5p + 10 11280 13881 12900 11962 50023 41 50 108 148 24 13 6 14 21 17 3 24 3 4 10 3 0 3 0 3 9
+* 5p + 11 11560 13644 13261 11558 50023 49 51 121 149 27 20 13 22 28 13 3 13 3 6 1 5 1 1 2 2 7
+* 5p + 12 11865 13228 13178 11755 50026 31 60 100 149 14 10 13 14 21 15 8 20 4 4 1 4 0 1 1 1 7
+* 5p + 13 11882 13543 12762 11832 50019 43 32 120 160 19 19 14 17 18 24 7 15 6 2 1 2 4 2 4 0 7
+* 5p + 14 11401 13755 12979 11885 50020 30 48 102 178 17 9 5 10 16 21 12 20 3 3 1 3 3 4 1 1 6
+* 5p + 15 11592 13609 12969 11831 50001 30 45 118 145 14 15 17 9 16 26 8 18 1 3 4 3 3 1 2 4 6
+* 5p + 16 11748 13596 12678 11997 50019 32 50 94 159 19 16 7 17 27 14 6 17 0 5 4 1 1 4 4 1 6
+* 5p + 17 11559 13701 12971 11792 50023 31 58 115 173 14 9 8 9 24 23 9 13 2 4 4 5 2 1 2 1 5
+* 5p + 18 11838 13531 12867 11787 50023 24 38 116 133 14 11 3 11 25 21 6 20 6 3 4 2 2 2 1 1 5
+* 5p + 19 11953 13458 12793 11819 50023 32 54 109 165 14 11 8 18 21 17 7 19 2 2 1 1 2 2 1 1 5
+* 5p + 20 11602 13655 13038 11724 50019 42 41 105 160 18 16 8 19 16 18 8 19 1 1 3 2 5 1 2 2 5
+* 5p + 21 11975 13355 13020 11669 50019 40 47 117 154 20 21 10 28 22 23 11 22 8 1 3 4 2 1 4 2 5
+* 5p + 22 11782 13617 12711 11914 50024 36 56 114 138 12 16 15 13 16 22 10 25 1 3 3 2 1 3 0 0 5
+* 5p + 23 11594 13583 12911 11934 50022 39 50 105 185 18 16 8 11 17 20 6 20 3 2 1 1 2 1 1 0 4
+* 5p + 24 12084 13255 12927 11757 50023 36 32 103 164 18 25 9 8 21 20 9 17 6 1 2 1 2 1 0 0 4
+* 5p + 25 11887 13514 12711 11908 50020 33 40 97 162 24 23 11 11 17 13 14 16 1 3 1 2 2 2 0 2 4
+* 5p + 26 11706 13291 13119 11886 50002 33 56 90 176 13 12 21 16 15 23 14 14 3 1 3 1 5 2 0 3 3
+* 5p + 27 12113 13252 12936 11692 49993 31 44 108 188 25 16 7 15 13 12 9 14 3 6 3 0 3 4 1 3 3
+* 5p + 28 11890 13341 12860 11895 49986 33 54 103 159 20 14 17 10 27 17 18 22 2 3 3 3 1 3 0 4 3
+* 5p + 29 11662 13484 13031 11798 49975 46 43 97 177 13 20 11 13 27 26 9 16 1 4 2 0 1 1 4 2 3
+* 5p + 30 11993 13250 12893 11829 49965 39 51 119 166 16 15 12 18 29 22 10 22 5 0 2 2 3 2 2 0 3
+* 5p + 31 11891 13324 12878 11852 49945 40 53 104 151 23 10 4 11 19 25 14 12 3 2 4 1 1 4 2 1 3
+* 5p + 32 11721 13121 13312 11768 49922 44 44 84 169 20 21 10 10 11 16 10 14 2 3 5 3 1 2 2 1 3
+* 5p + 33 11864 13260 13049 11719 49892 43 47 101 166 21 17 8 15 18 14 15 15 3 2 3 0 1 2 0 2 3
+* 5p + 34 12169 13287 12708 11680 49844 45 42 108 169 17 17 8 15 24 21 11 18 6 4 3 2 6 0 1 2 3
+* 5p + 35 11838 13263 13148 11545 49794 55 45 106 181 13 18 10 9 21 16 5 22 3 2 0 3 2 1 4 1 2
+* 5p + 36 11949 13258 13004 11511 49722 48 42 100 159 13 12 12 11 25 19 10 20 2 4 3 0 6 0 3 0 2
+* 5p + 37 12057 13170 12757 11652 49636 37 42 112 159 13 17 10 13 15 16 4 12 1 5 0 3 1 2 1 3 2
+* 5p + 38 11989 13192 12810 11549 49540 44 37 121 164 20 18 8 16 20 10 12 19 5 2 3 3 2 1 1 2 1
+* 5p + 39 12064 12687 13116 11521 49388 42 42 104 150 21 16 7 11 21 18 6 18 2 4 1 0 1 0 1 2 1
+* 5p + 40 11977 13072 12697 11482 49228 49 37 92 173 15 19 10 6 18 16 8 15 3 0 2 3 2 2 0 0 1
+* 5p + 41 11820 13034 12793 11395 49042 38 46 81 169 16 16 10 17 25 21 9 18 3 2 2 2 1 3 1 4 0
+* 5p + 42 11912 12633 12736 11519 48800 45 34 80 166 18 21 12 15 19 22 10 15 4 2 3 3 1 1 0 0 0
+* 5p + 43 11724 12957 12513 11315 48509 45 40 87 164 25 13 10 18 23 20 9 17 1 3 3 3 3 1 2 0 0
+* 5p + 44 11782 12598 12533 11266 48179 46 41 84 162 22 13 12 12 28 17 14 15 2 2 5 4 3 1 2 3 0
+* 5p + 45 11816 12340 12550 11126 47832 48 39 99 182 14 15 13 12 21 17 9 17 1 3 2 3 4 2 0 2 0
+* 5p + 46 11606 12476 12115 11245 47442 49 30 80 158 26 10 11 16 19 21 9 13 3 2 2 1 1 0 3 2 0
+* 5p + 47 11498 12193 12326 10964 46981 52 44 102 190 16 12 9 15 27 18 7 19 6 6 5 1 2 2 1 2 0
+* 5p + 48 11490 11924 12137 10919 46470 47 42 81 150 15 11 15 15 20 31 9 28 1 6 3 3 2 3 2 2 0
+* 5p + 49 11364 11924 11760 10919 45967 52 42 88 165 32 16 6 18 23 21 14 15 0 8 2 1 0 1 2 2 0
+* 5p + 50 10987 11896 11869 10623 45375 54 46 87 141 14 12 15 19 21 21 11 13 5 6 2 3 4 4 1 3 0
+* 5p + 51 11082 11419 11753 10561 44815 58 44 81 146 12 12 8 15 19 19 8 21 1 2 2 2 1 2 1 0 0
+* 5p + 52 10931 11444 11425 10374 44174 47 36 83 160 12 24 12 18 15 28 6 19 4 3 3 1 6 3 0 0 0
+* 5p + 53 10710 11279 11350 10195 43534 47 45 69 141 23 18 11 11 15 25 9 14 2 1 2 1 3 2 0 1 0
+* 5p + 54 10476 10990 11245 10122 42833 51 28 62 159 12 11 15 12 9 21 11 13 1 4 4 2 1 4 0 0 0
+* 5p + 55 10300 10852 10983 9971 42106 36 33 75 155 11 13 12 16 14 19 10 20 1 3 3 2 1 0 2 2 0
+* 5p + 56 10115 10702 10888 9724 41429 46 35 64 151 22 9 8 14 17 21 12 24 3 2 3 4 2 1 1 1 0
+* 5p + 57 10270 10276 10601 9513 40660 43 34 67 138 15 8 9 16 24 19 6 13 2 4 0 1 3 1 0 0 0
+* 5p + 58 9729 10330 10269 9543 39871 44 40 75 134 18 14 14 17 18 21 16 18 3 3 0 0 0 4 2 0 0
+* 5p + 59 9549 9970 10364 9150 39033 60 38 62 132 25 19 18 10 10 23 9 21 2 3 2 2 3 0 2 0 0
+* 5p + 60 9508 9708 9973 9069 38258 45 38 57 123 18 15 14 14 20 19 12 11 2 2 2 2 3 2 1 1 0
+* 5p + 61 9251 9571 9717 8879 37418 27 30 66 140 17 13 7 17 19 8 11 14 5 1 3 5 5 2 3 1 0
+* 5p + 62 8964 9448 9550 8614 36576 45 34 68 123 8 23 12 13 25 28 12 19 0 2 0 3 0 6 1 2 0
+* 5p + 63 8818 9085 9316 8555 35774 56 43 53 125 11 11 10 14 10 20 14 14 1 0 1 0 2 2 0 1 0
+* 5p + 64 8645 8936 9021 8311 34913 44 31 55 121 23 8 12 17 28 16 8 25 1 2 2 3 3 1 5 1 0
+* 5p + 65 8321 8660 9062 8093 34136 47 26 71 104 13 12 12 4 18 22 10 16 2 4 1 0 3 0 1 0 0
+* 5p + 66 8496 8389 8701 7768 33354 34 25 60 108 19 14 3 6 13 14 12 8 2 2 0 2 3 0 1 1 0
+* 5p + 67 8058 8303 8370 7814 32545 42 23 57 106 16 12 11 9 15 24 8 15 1 3 4 0 0 2 0 3 0
+* 5p + 68 7902 8053 8393 7383 31731 36 22 50 107 10 11 9 12 13 17 5 21 2 2 1 2 0 1 1 0 0
+* 5p + 69 7765 7746 8122 7254 30887 27 27 38 99 15 17 9 15 12 10 13 13 3 3 0 3 0 1 0 0 0
+* 5p + 70 7450 7722 7725 7196 30093 34 25 50 104 15 8 11 8 19 12 7 7 3 2 2 3 0 1 0 2 0
+* 5p - 1 15636 17565 11016 5750 49967 66 80 127 68 27 12 17 28 18 26 21 19 0 0 0 0 0 0 0 0 12
+* 5p - 2 10645 9563 14722 15035 49965 29 68 118 104 20 26 18 13 28 14 14 25 3 0 0 0 0 0 0 0 11
+* 5p - 3 12889 14174 10577 12325 49965 47 76 79 68 20 19 13 21 29 23 8 34 1 1 2 1 0 0 0 0 11
+* 5p - 4 12384 13763 11677 12137 49961 37 72 78 130 17 13 10 17 18 19 8 21 2 1 1 1 0 0 0 0 10
+* 5p - 5 11950 12776 12824 12414 49964 39 56 78 146 9 10 8 18 21 27 6 20 2 1 3 3 1 3 0 1 10
+* 5p - 6 11799 13375 12003 12791 49968 35 63 91 164 22 14 5 15 17 22 8 20 1 0 0 2 1 0 1 1 9
+* 5p - 7 12200 12653 13132 11981 49966 33 53 122 168 18 23 11 12 15 20 7 22 1 3 1 0 0 2 3 0 9
+* 5p - 8 11209 13993 11429 13336 49967 25 44 113 156 17 19 7 16 15 16 10 20 4 3 2 0 1 0 1 1 9
+* 5p - 9 10764 13573 12212 13416 49965 33 48 104 149 17 21 7 13 15 26 4 22 3 3 4 0 1 4 0 1 9
+* 5p - 10 11166 13955 12784 12059 49964 35 49 102 136 18 16 4 12 17 16 11 16 1 2 1 0 2 1 3 1 9
+* 5p - 11 11660 13576 13278 11449 49963 45 45 114 160 15 15 15 11 18 20 6 19 4 1 3 2 1 5 2 0 9
+* 5p - 12 11857 13409 13056 11642 49964 37 46 101 151 12 12 9 15 19 30 7 23 4 4 1 2 0 3 2 2 9
+* 5p - 13 11762 13765 12599 11836 49962 27 52 103 170 19 15 10 19 23 22 12 25 2 1 3 2 5 0 0 4 7
+* 5p - 14 11790 13692 12800 11682 49964 35 46 112 166 15 13 3 8 17 23 12 12 0 3 2 3 3 0 3 1 7
+* 5p - 15 11725 13527 12888 11800 49940 30 46 113 176 15 16 5 11 26 17 8 20 2 2 0 3 1 1 3 2 7
+* 5p - 16 11598 13768 12766 11827 49959 36 54 116 163 12 21 10 14 18 15 7 17 1 2 1 0 5 4 1 2 7
+* 5p - 17 11590 13474 13177 11721 49962 27 40 110 152 12 11 4 12 16 17 5 23 2 2 3 1 1 6 2 0 7
+* 5p - 18 11607 13493 12992 11868 49960 31 61 103 141 28 15 8 15 13 14 6 19 2 1 1 1 2 3 3 3 7
+* 5p - 19 11698 13707 12846 11709 49960 42 44 103 172 9 14 9 6 18 21 11 15 1 7 2 4 2 3 4 2 7
+* 5p - 20 11760 13453 13207 11545 49965 45 45 121 143 10 18 7 10 16 14 12 25 3 3 4 1 0 2 1 2 6
+* 5p - 21 11958 13194 13147 11668 49967 32 41 113 146 20 13 5 12 19 17 10 12 8 8 2 2 1 0 1 0 6
+* 5p - 22 12035 13516 12761 11653 49965 40 50 128 175 12 13 12 10 17 14 12 12 2 1 2 3 1 0 1 1 6
+* 5p - 23 11944 13467 12774 11776 49961 28 60 112 175 20 16 5 13 24 21 10 18 1 2 1 1 1 3 0 1 6
+* 5p - 24 11834 13324 12932 11866 49956 39 38 99 179 22 17 10 11 24 11 8 28 3 2 4 2 3 2 1 1 6
+* 5p - 25 11695 13434 12782 12036 49947 34 42 96 157 17 18 14 14 33 16 12 21 3 2 4 2 5 5 4 2 6
+* 5p - 26 11688 13426 13090 11744 49948 42 31 127 130 19 16 16 10 19 15 10 17 0 1 0 2 6 2 1 2 6
+* 5p - 27 12062 13073 12819 11989 49943 38 42 102 176 27 18 7 13 12 22 16 9 2 0 0 4 1 1 1 0 6
+* 5p - 28 11834 13461 12639 11989 49923 35 40 92 157 20 12 14 15 26 23 8 14 4 3 2 4 2 0 0 3 6
+* 5p - 29 11731 13334 12957 11892 49914 50 41 119 156 21 14 11 19 27 21 11 16 1 3 5 6 3 3 1 0 6
+* 5p - 30 11859 13317 13003 11727 49906 35 39 111 159 20 15 25 19 19 32 13 18 0 3 5 2 2 2 1 4 6
+* 5p - 31 11937 13247 13085 11624 49893 35 48 118 191 17 13 4 20 19 24 6 21 2 3 2 4 2 2 2 0 6
+* 5p - 32 11842 13274 13013 11737 49866 31 53 108 158 17 21 9 15 19 17 9 17 2 3 3 0 2 5 3 4 6
+* 5p - 33 11998 13126 13133 11594 49851 41 45 106 185 18 19 5 14 20 18 8 11 0 3 1 4 2 1 4 2 6
+* 5p - 34 11981 13246 12953 11626 49806 37 39 116 163 22 11 7 9 23 20 9 16 1 1 3 2 1 5 3 2 6
+* 5p - 35 11751 13106 13291 11608 49756 40 41 89 172 16 14 9 14 23 19 12 24 4 2 2 0 3 2 4 2 6
+* 5p - 36 12112 13218 12839 11526 49695 54 40 80 155 15 13 5 7 25 23 7 20 1 2 3 0 0 1 2 3 6
+* 5p - 37 11854 13245 12922 11578 49599 56 46 99 170 10 19 7 18 21 27 13 18 3 1 1 1 1 5 3 2 3
+* 5p - 38 11835 13029 12990 11645 49499 35 54 89 174 14 11 5 12 15 18 8 23 5 3 1 4 3 2 4 2 3
+* 5p - 39 11985 12881 12836 11655 49357 40 36 96 181 15 17 11 8 14 23 8 14 5 0 2 3 3 0 2 4 3
+* 5p - 40 12106 12765 12914 11406 49191 38 45 110 171 22 12 13 15 17 19 11 18 4 3 2 1 5 4 1 0 3
+* 5p - 41 11890 12790 12815 11494 48989 39 40 85 186 19 15 11 18 23 21 9 17 5 2 2 2 1 2 3 1 3
+* 5p - 42 12006 12613 12868 11253 48740 45 42 110 157 29 11 6 17 24 24 4 25 2 1 2 2 1 1 1 1 2
+* 5p - 43 12042 12750 12504 11162 48458 46 43 97 141 13 19 12 10 17 20 17 15 3 6 1 2 0 4 3 1 2
+* 5p - 44 11804 12555 12784 11009 48152 56 33 81 197 23 18 12 22 23 10 10 20 1 1 1 0 1 0 2 2 2
+* 5p - 45 11762 12431 12399 11184 47776 50 32 82 172 23 17 11 13 22 26 7 10 1 2 2 0 0 5 1 1 2
+* 5p - 46 11430 12262 12384 11279 47355 54 41 78 162 15 19 13 18 19 26 6 25 3 2 4 4 1 1 4 0 2
+* 5p - 47 11427 12174 12287 10991 46879 58 54 96 184 17 20 13 11 24 27 12 27 2 0 5 2 1 2 2 0 1
+* 5p - 48 11255 12038 12240 10853 46386 44 42 84 168 21 14 12 12 21 13 3 21 2 4 1 1 1 2 1 2 1
+* 5p - 49 11168 11961 11905 10814 45848 45 39 91 170 15 18 10 12 18 13 12 15 2 0 4 4 2 2 0 3 1
+* 5p - 50 11059 11619 11799 10851 45328 60 38 65 141 16 14 6 13 30 21 10 22 2 3 3 4 1 0 1 2 1
+* 5p - 51 11183 11490 11580 10443 44696 45 40 92 185 14 16 13 10 19 20 9 15 2 1 0 2 1 1 2 2 1
+* 5p - 52 10750 11574 11482 10279 44085 40 37 69 150 25 16 7 19 15 18 9 30 3 5 5 3 2 0 2 0 1
+* 5p - 53 10583 11281 11251 10317 43432 56 42 84 166 19 12 9 6 14 23 11 15 4 0 4 2 2 0 1 1 0
+* 5p - 54 10530 10898 11162 10159 42749 43 27 65 154 20 19 14 15 17 23 8 13 2 3 2 3 2 0 2 1 0
+* 5p - 55 10341 10848 10960 9911 42060 40 38 70 140 14 12 15 14 15 16 13 15 5 0 3 3 1 2 0 0 0
+* 5p - 56 10140 10717 10847 9623 41327 46 44 75 141 12 15 9 20 18 18 11 16 4 2 1 2 4 0 1 1 0
+* 5p - 57 9940 10437 10699 9512 40588 41 32 68 129 14 13 10 17 18 14 9 15 5 2 3 1 3 1 0 3 0
+* 5p - 58 9841 10281 10305 9369 39796 44 35 57 136 18 14 10 19 16 16 8 21 1 4 2 0 2 3 2 1 0
+* 5p - 59 9665 9934 10299 9108 39006 45 30 64 115 21 9 9 12 17 23 9 14 2 3 1 2 1 0 1 0 0
+* 5p - 60 9519 9738 9942 8988 38187 44 47 70 134 26 19 7 13 12 19 17 7 5 2 1 3 1 0 1 1 0
+* 5p - 61 9196 9644 9651 8831 37322 37 28 67 134 20 11 18 14 20 13 9 20 2 4 1 0 2 1 2 1 0
+* 5p - 62 8960 9332 9506 8745 36543 43 36 52 127 12 9 13 12 19 19 17 15 5 0 1 2 1 1 1 2 0
+* 5p - 63 8927 9100 9313 8393 35733 41 35 71 104 13 11 10 14 17 11 7 23 2 2 2 5 3 3 1 2 0
+* 5p - 64 8673 8826 9096 8302 34897 49 42 61 136 14 7 15 10 21 15 8 20 0 5 2 4 2 2 0 1 0
+* 5p - 65 8250 8772 8837 8175 34034 39 28 51 123 14 21 13 8 20 13 6 13 1 0 0 1 2 1 2 2 0
+* 5p - 66 8242 8371 8789 7816 33218 45 31 36 96 17 10 6 7 13 13 13 12 0 3 4 1 2 0 2 3 0
+* 5p - 67 8048 8153 8482 7702 32385 44 36 57 112 18 12 7 15 15 15 6 15 0 0 1 4 3 2 1 1 0
+* 5p - 68 7882 7917 8284 7507 31590 40 22 54 105 20 4 12 12 21 20 11 14 0 3 1 1 3 0 2 2 0
+* 5p - 69 7706 7783 7961 7270 30720 36 28 61 101 9 16 14 15 14 16 7 14 3 3 1 2 0 2 2 0 0
+* 5p - 70 7391 7587 7780 7181 29939 43 18 39 106 10 13 9 7 15 17 6 11 0 3 2 1 2 3 0 1 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/3pGtoA_freq.txt
new file mode 100644
index 0000000..2a17012
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.00835079408460114
+2 0.0120969808806192
+3 0.00796225302270717
+4 0.00600289028050543
+5 0.00492377988734392
+6 0.00539878240226672
+7 0.00527641203981293
+8 0.00510407239819005
+9 0.00564736203759854
+10 0.00434330685277303
+11 0.00362045557399306
+12 0.00398941271241705
+13 0.00401233421257941
+14 0.00477522850214512
+15 0.0042465281762883
+16 0.00385123654898999
+17 0.00387813700264757
+18 0.00344527316094347
+19 0.00394914999247781
+20 0.00352535253525353
+21 0.00441289332310054
+22 0.00349637204406181
+23 0.0040782418246356
+24 0.00420972062763108
+25 0.00347668354621722
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/5pCtoT_freq.txt
new file mode 100644
index 0000000..4f70bac
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.00379039958334539
+2 0.00768163874959991
+3 0.00473866610085579
+4 0.00372262773722628
+5 0.00453976205385097
+6 0.00365867601661649
+7 0.00370166180987635
+8 0.00431577054486603
+9 0.00348641049671978
+10 0.00341594592630278
+11 0.00253400409912428
+12 0.00350335267083553
+13 0.00330488469220991
+14 0.00366595791480314
+15 0.00293166954822221
+16 0.00333748991417883
+17 0.00372721234039412
+18 0.00300077797947616
+19 0.00342655023764784
+20 0.00310123310935539
+21 0.00380401704199635
+22 0.00287882753205967
+23 0.00311734580271654
+24 0.00292617046818728
+25 0.00314124378295501
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..7215927
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Length_plot.pdf
new file mode 100644
index 0000000..56cac77
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Runtime_log.txt
new file mode 100644
index 0000000..c7f1917
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-24 14:58:00,531 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1889_id_GGCTAC' -i - -d /home/mischu/scratch/bam_pipeline/2a68191f-b62d-4903-bfa1-2acdad04fc38 -r 000_prefixes/Pi_nucl.fasta --downsample 100000
+2013-10-24 14:59:18,072 DEBUG main: BAM read in 80.000876 seconds
+2013-10-24 14:59:18,917 INFO main: Successful run
+2013-10-24 14:59:18,918 DEBUG main: Run completed in 80.846741 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..04cb7ac
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.10108639889437,0.0964808308330821
+"2",2,0.0420179948121332,0.0358682904052418
+"3",3,0.0224741455841767,0.018945489286392
+"4",4,0.0136185953050491,0.011844155648586
+"5",5,0.00916127165171173,0.00803096719423654
+"6",6,0.00664877406746499,0.00581339693892618
+"7",7,0.00507457394976156,0.00448247180609317
+"8",8,0.00406576468215326,0.00360279376930059
+"9",9,0.00336960129733685,0.00302033667256635
+"10",10,0.00284121955919037,0.00265241682010395
+"11",11,0.00245963778638766,0.00238068025996153
+"12",12,0.00221192171948199,0.00213623351986629
+"13",-12,0.00192599750167264,0.00240242625184696
+"14",-11,0.00214256732240242,0.00267535934439904
+"15",-10,0.00237634308742586,0.00308405742877322
+"16",-9,0.00268791288341626,0.00365205068783988
+"17",-8,0.00325472266252092,0.00436080559971738
+"18",-7,0.00416763692636265,0.00534407874791067
+"19",-6,0.00555204052869244,0.00687269260965727
+"20",-5,0.00766976192381012,0.00946902657040652
+"21",-4,0.0109575640246285,0.0143443308274069
+"22",-3,0.016960448059723,0.0239787835997718
+"23",-2,0.0310793368390304,0.0454669380641758
+"24",-1,0.0816234842938072,0.112567034187736
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..37a1f5e
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..8cad52f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0255637285544568,5.06160721101857e-06,0.123783490957086,0.924381095789362,0.270072237263071,-4504.6592679494
+"Std.",0.00020773818027736,5.06923796568598e-06,0.219987940508823,0.14171111138312,0.003674022778329,1.65805945479885
+"Acceptance ratio",0.17654,0.1272,0.22898,0.17904,0.22946,0.67114
+"0%",0.024665072945494,8.05552783857713e-10,9.03755652013286e-06,0.00814603818947418,0.255149110336801,-4516.2026359026
+"2.5%",0.0251624086644525,1.4953851393464e-07,9.2270053967526e-05,0.487342899589896,0.2628105788838,-4508.77775348364
+"5%",0.0252209617016033,2.71120536177025e-07,0.000270342134186839,0.601978853619506,0.26410106152641,-4507.84680409711
+"7.5%",0.0252619103139905,4.15251608396266e-07,0.000284591897971833,0.677275085512289,0.264825076095329,-4507.28021150402
+"10%",0.025295957136261,5.46024691031562e-07,0.000416257377655972,0.745060466626702,0.265458286207262,-4506.88466144796
+"12.5%",0.0253241317004649,6.89781519204e-07,0.000650360764823055,0.792459018830076,0.265939866852691,-4506.56730351932
+"15%",0.0253479860639935,8.67211154579624e-07,0.000950674202993887,0.830829036699038,0.26630477139417,-4506.30923999026
+"17.5%",0.0253685117781895,1.04201631734832e-06,0.00125248392768823,0.861828260092543,0.26666397325067,-4506.07234608002
+"20%",0.0253863990644583,1.19721567566252e-06,0.00147605304920502,0.887103638696215,0.267041934254092,-4505.87181882387
+"22.5%",0.0254051674357453,1.37016971274831e-06,0.00192413411694901,0.90915426038242,0.267345630468217,-4505.68549024208
+"25%",0.0254205037605352,1.52658401149835e-06,0.00256619503817065,0.925856112756371,0.267620223896294,-4505.51283023082
+"27.5%",0.0254376786028866,1.67926121693856e-06,0.00283140755608328,0.939895570337305,0.267935826249091,-4505.36167617519
+"30%",0.0254537931368866,1.83781303131733e-06,0.00360320072462356,0.949688248146985,0.268203356422721,-4505.21269430712
+"32.5%",0.0254691428973077,2.00079800809324e-06,0.00459969687949287,0.958599979459329,0.268413022461792,-4505.07601247191
+"35%",0.0254856499246643,2.18510812684841e-06,0.0058760199511598,0.965960260004011,0.268636082196345,-4504.95102359296
+"37.5%",0.0254987140953115,2.38047824763999e-06,0.00727189412162522,0.972036817976132,0.268876205283445,-4504.82611743105
+"40%",0.0255108187812353,2.55949132073897e-06,0.00877047812010047,0.977100210108153,0.269100563020757,-4504.71713769471
+"42.5%",0.0255236393437617,2.74431552691424e-06,0.0106434403254753,0.980701313472598,0.269333761875072,-4504.61368532485
+"45%",0.0255362557493368,3.03174946367668e-06,0.0126162008583666,0.983660819069898,0.269543727455198,-4504.51615089427
+"47.5%",0.0255493855868092,3.22976860829391e-06,0.0152717218940989,0.985971369759664,0.269743539017027,-4504.42508785173
+"50%",0.0255638659536853,3.47872213081413e-06,0.0184792608762247,0.988328283707403,0.269980635967478,-4504.32450584886
+"52.5%",0.0255753966770517,3.72816378127789e-06,0.0213385070754737,0.989962800733263,0.270193268420946,-4504.23406846511
+"55%",0.025587208307857,4.01697153892875e-06,0.0255910838133896,0.991947078609875,0.27046531022785,-4504.14121772064
+"57.5%",0.025601269230345,4.3029623096571e-06,0.0290577366281189,0.993607666795162,0.270688159386879,-4504.04851398106
+"60%",0.0256157470653088,4.59868297252143e-06,0.0350449652235089,0.995068074180815,0.270943057155364,-4503.96754635385
+"62.5%",0.0256289795137046,4.92634369847614e-06,0.0429431832400187,0.995631509364139,0.271194000077319,-4503.88079419472
+"65%",0.0256440033794414,5.26536818934233e-06,0.054484787768161,0.996266136756598,0.27146808137544,-4503.79808200037
+"67.5%",0.0256606328184136,5.66660895863506e-06,0.0662595634070728,0.996838249204934,0.271713430189119,-4503.70607717894
+"70%",0.0256754162236362,6.15594461468446e-06,0.0826303691687441,0.997500670898713,0.272006761444803,-4503.62154139243
+"72.5%",0.0256903023961532,6.54827810848628e-06,0.103772327510094,0.997965960366473,0.272261970883793,-4503.54123153086
+"75%",0.0257048873939394,7.05243539380971e-06,0.126851494570225,0.998519814564963,0.272579715389786,-4503.44989886693
+"77.5%",0.0257216625461927,7.61235302736584e-06,0.15340192938571,0.99880991236194,0.272867601586877,-4503.35880983007
+"80%",0.0257403440872491,8.16903819634408e-06,0.189350782585554,0.999089218110281,0.273156849863882,-4503.2687304658
+"82.5%",0.0257604253259579,8.79465926138546e-06,0.237635833608678,0.999320908918255,0.273502626927723,-4503.17881133666
+"85%",0.0257831686781429,9.456792094594e-06,0.292240110928434,0.999483964044587,0.273868866219534,-4503.08135663454
+"87.5%",0.0258037984771969,1.0429313082909e-05,0.362188648018939,0.999627505662009,0.274295685285882,-4502.98062540346
+"90%",0.0258281769273818,1.15882742215076e-05,0.452762413688893,0.999688361500281,0.274798946731039,-4502.87565980961
+"92.5%",0.0258633846524385,1.29824768365931e-05,0.564514560522541,0.999824043933224,0.275423475542222,-4502.75653538778
+"95%",0.0259061087802089,1.50620604466877e-05,0.688753292219776,0.99989742020698,0.276181359107467,-4502.61209051411
+"97.5%",0.0259730868640328,1.88464125918366e-05,0.833995010240257,0.999957894773813,0.277388011526398,-4502.4387341569
+"100%",0.0263024974746344,5.39207283957985e-05,0.999969652698085,0.999983019755541,0.284507650739632,-4501.92353409552
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..7807fc3
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..d75aba1
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/dnacomp.txt
new file mode 100644
index 0000000..099112b
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 7735 8264 8051 7653 31703
+* 3p + -69 7875 8637 8013 7948 32473
+* 3p + -68 8020 8744 8318 8156 33238
+* 3p + -67 8193 8951 8587 8275 34006
+* 3p + -66 8442 9161 8638 8534 34775
+* 3p + -65 8729 9334 8852 8620 35535
+* 3p + -64 8680 9508 9276 8879 36343
+* 3p + -63 8934 9865 9314 8987 37100
+* 3p + -62 9204 10059 9491 9153 37907
+* 3p + -61 9326 10191 9773 9390 38680
+* 3p + -60 9591 10427 9889 9531 39438
+* 3p + -59 9647 10656 10200 9670 40173
+* 3p + -58 9877 10877 10322 9804 40880
+* 3p + -57 10119 10993 10312 10167 41591
+* 3p + -56 10332 11096 10714 10123 42265
+* 3p + -55 10382 11345 10969 10261 42957
+* 3p + -54 10622 11481 10977 10524 43604
+* 3p + -53 10563 11663 11189 10783 44198
+* 3p + -52 10791 11938 11423 10623 44775
+* 3p + -51 11046 12044 11453 10847 45390
+* 3p + -50 11032 12205 11610 11099 45946
+* 3p + -49 11253 12241 11903 11070 46467
+* 3p + -48 11199 12491 11879 11361 46930
+* 3p + -47 11292 12531 12238 11284 47345
+* 3p + -46 11524 12621 12306 11304 47755
+* 3p + -45 11596 12682 12194 11655 48127
+* 3p + -44 11648 12918 12377 11508 48451
+* 3p + -43 11505 12900 12580 11741 48726
+* 3p + -42 11803 13202 12480 11494 48979
+* 3p + -41 11774 13128 12684 11635 49221
+* 3p + -40 11800 12891 12838 11865 49394
+* 3p + -39 11894 13184 12743 11722 49543
+* 3p + -38 11969 13136 12877 11675 49657
+* 3p + -37 11832 13296 12968 11676 49772
+* 3p + -36 12061 13233 12863 11705 49862
+* 3p + -35 12049 13246 13086 11528 49909
+* 3p + -34 11889 13199 13104 11759 49951
+* 3p + -33 11823 13215 13024 11914 49976
+* 3p + -32 11946 13461 12948 11643 49998
+* 3p + -31 12021 13252 13113 11632 50018
+* 3p + -30 12141 13359 12955 11573 50028
+* 3p + -29 12172 13106 13130 11640 50048
+* 3p + -28 12125 13050 13285 11599 50059
+* 3p + -27 12144 13257 12978 11691 50070
+* 3p + -26 12097 13234 13232 11516 50079
+* 3p + -25 12196 12975 13382 11533 50086
+* 3p + -24 12145 13192 13080 11680 50097
+* 3p + -23 12247 13208 13235 11408 50098
+* 3p + -22 11941 13240 13320 11588 50089
+* 3p + -21 12039 13354 13140 11566 50099
+* 3p + -20 12027 13303 13341 11431 50102
+* 3p + -19 12188 13207 13382 11323 50100
+* 3p + -18 12199 13151 13266 11483 50099
+* 3p + -17 11916 13505 13396 11284 50101
+* 3p + -16 12480 12926 13307 11387 50100
+* 3p + -15 12093 13270 13083 11327 49773
+* 3p + -14 12265 13186 13368 11282 50101
+* 3p + -13 12022 13158 13526 11394 50100
+* 3p + -12 12116 13384 12938 11664 50102
+* 3p + -11 12061 13375 13416 11249 50101
+* 3p + -10 12387 13185 13537 10992 50101
+* 3p + -9 13539 12613 13280 10671 50103
+* 3p + -8 13580 11807 13931 10785 50103
+* 3p + -7 12547 13367 12563 11626 50103
+* 3p + -6 13062 12361 13140 11540 50103
+* 3p + -5 12471 13095 12683 11853 50102
+* 3p + -4 12321 12061 13527 12193 50102
+* 3p + -3 12718 11184 13514 12683 50099
+* 3p + -2 14457 14544 9665 11435 50101
+* 3p + -1 6400 12616 16406 14681 50103
+* 3p + 1 4347 19410 4448 21896 50101
+* 3p + 2 10051 14285 14134 11630 50100
+* 3p + 3 11782 12689 13386 12241 50098
+* 3p + 4 12173 12923 13039 11961 50096
+* 3p + 5 12109 12999 13073 11915 50096
+* 3p + 6 12330 12583 13130 12052 50095
+* 3p + 7 12181 13127 12800 11986 50094
+* 3p + 8 12307 12903 13002 11880 50092
+* 3p + 9 12161 12700 13095 12136 50092
+* 3p + 10 12029 13007 12807 12248 50091
+* 3p - -70 7613 8439 7975 7682 31709
+* 3p - -69 7723 8590 8116 8044 32473
+* 3p - -68 7954 8725 8468 8157 33304
+* 3p - -67 8120 8877 8616 8445 34058
+* 3p - -66 8228 9255 8761 8584 34828
+* 3p - -65 8461 9326 9180 8654 35621
+* 3p - -64 8901 9504 9149 8821 36375
+* 3p - -63 8885 9902 9338 9047 37172
+* 3p - -62 9220 9979 9655 9027 37881
+* 3p - -61 9413 10211 9693 9253 38570
+* 3p - -60 9485 10501 9712 9593 39291
+* 3p - -59 9765 10671 10060 9494 39990
+* 3p - -58 9908 10663 10228 9890 40689
+* 3p - -57 9925 11014 10302 10132 41373
+* 3p - -56 10043 11234 10626 10158 42061
+* 3p - -55 10304 11338 10840 10246 42728
+* 3p - -54 10406 11510 11060 10397 43373
+* 3p - -53 10491 11831 11154 10562 44038
+* 3p - -52 10697 11741 11336 10827 44601
+* 3p - -51 10711 12057 11469 10926 45163
+* 3p - -50 10990 12224 11653 10908 45775
+* 3p - -49 11213 12118 11892 11053 46276
+* 3p - -48 11388 12327 11880 11191 46786
+* 3p - -47 11400 12541 12088 11161 47190
+* 3p - -46 11497 12670 12031 11416 47614
+* 3p - -45 11633 12671 12199 11451 47954
+* 3p - -44 11539 12922 12567 11242 48270
+* 3p - -43 11735 13053 12461 11318 48567
+* 3p - -42 11669 12875 12427 11839 48810
+* 3p - -41 11737 13026 12752 11503 49018
+* 3p - -40 11986 13001 12672 11542 49201
+* 3p - -39 11980 13020 12553 11792 49345
+* 3p - -38 12054 13021 12848 11542 49465
+* 3p - -37 12064 13027 12790 11667 49548
+* 3p - -36 12150 13165 12669 11640 49624
+* 3p - -35 11814 13122 13078 11669 49683
+* 3p - -34 11940 13249 13068 11469 49726
+* 3p - -33 11814 13264 12823 11861 49762
+* 3p - -32 12062 13105 13111 11512 49790
+* 3p - -31 12006 13112 13170 11518 49806
+* 3p - -30 12097 13293 12771 11654 49815
+* 3p - -29 12111 13115 13196 11414 49836
+* 3p - -28 12121 12977 13028 11725 49851
+* 3p - -27 12203 13114 12981 11572 49870
+* 3p - -26 12382 13026 13113 11361 49882
+* 3p - -25 12307 13004 13102 11467 49880
+* 3p - -24 12200 13150 13054 11486 49890
+* 3p - -23 12182 12982 13279 11451 49894
+* 3p - -22 12109 12907 13328 11536 49880
+* 3p - -21 11890 13387 12935 11678 49890
+* 3p - -20 11957 13171 13384 11383 49895
+* 3p - -19 12158 13032 13266 11437 49893
+* 3p - -18 12060 13179 13207 11451 49897
+* 3p - -17 11984 13156 13508 11245 49893
+* 3p - -16 12212 13191 13226 11267 49896
+* 3p - -15 12060 13196 13095 11227 49578
+* 3p - -14 11994 13168 13460 11271 49893
+* 3p - -13 12148 13014 13440 11292 49894
+* 3p - -12 11739 13316 13179 11662 49896
+* 3p - -11 12030 13523 13140 11204 49897
+* 3p - -10 12569 12953 13438 10933 49893
+* 3p - -9 13674 12645 13133 10445 49897
+* 3p - -8 13528 11925 13721 10723 49897
+* 3p - -7 12580 13129 12496 11692 49897
+* 3p - -6 12846 12488 13013 11549 49896
+* 3p - -5 12355 12895 12715 11932 49897
+* 3p - -4 12459 11962 13431 12045 49897
+* 3p - -3 12621 11245 13523 12506 49895
+* 3p - -2 14435 14402 9777 11283 49897
+* 3p - -1 6435 12418 16491 14553 49897
+* 3p - 1 4275 19367 4502 21751 49895
+* 3p - 2 10107 14078 14187 11523 49895
+* 3p - 3 11626 12780 13363 12125 49894
+* 3p - 4 12142 13028 12608 12113 49891
+* 3p - 5 11960 12973 13099 11858 49890
+* 3p - 6 12442 12454 12967 12025 49888
+* 3p - 7 11934 13199 12735 12018 49886
+* 3p - 8 12194 12772 12869 12051 49886
+* 3p - 9 12156 12643 12844 12241 49884
+* 3p - 10 11969 12777 12761 12375 49882
+* 5p + -10 12315 12684 12918 12178 50095
+* 5p + -9 12179 13254 12440 12223 50096
+* 5p + -8 11775 13067 12847 12407 50096
+* 5p + -7 12232 12611 13151 12103 50097
+* 5p + -6 12079 13287 12372 12359 50097
+* 5p + -5 11861 13325 12734 12178 50098
+* 5p + -4 11889 12971 12978 12260 50098
+* 5p + -3 12221 13456 12651 11770 50098
+* 5p + -2 11800 13959 14714 9626 50099
+* 5p + -1 23568 2917 21294 2320 50099
+* 5p + 1 15880 17311 11128 5784 50103
+* 5p + 2 10799 9325 15018 14961 50103
+* 5p + 3 12790 14222 10568 12519 50099
+* 5p + 4 12484 13852 11609 12149 50094
+* 5p + 5 12163 12800 12811 12326 50100
+* 5p + 6 12006 13126 12078 12891 50101
+* 5p + 7 12415 12634 12802 12252 50103
+* 5p + 8 11164 13946 11597 13396 50103
+* 5p + 9 10769 13550 12463 13321 50103
+* 5p + 10 11335 13836 12637 12295 50103
+* 5p + 11 11641 13502 13117 11843 50103
+* 5p + 12 11910 13436 13032 11725 50103
+* 5p + 13 11750 13895 12667 11790 50102
+* 5p + 14 11576 13683 12695 12149 50103
+* 5p + 15 11759 13402 13028 11888 50077
+* 5p + 16 11670 13656 12716 12061 50103
+* 5p + 17 11742 13639 12932 11790 50103
+* 5p + 18 11711 13668 12923 11801 50103
+* 5p + 19 11739 13693 12867 11804 50103
+* 5p + 20 11772 13670 12947 11714 50103
+* 5p + 21 12019 13296 13078 11710 50103
+* 5p + 22 12050 13619 12656 11778 50103
+* 5p + 23 11726 13489 13045 11841 50101
+* 5p + 24 11851 13516 13087 11646 50100
+* 5p + 25 11898 13487 12649 12063 50097
+* 5p + 26 11840 13537 12846 11857 50080
+* 5p + 27 11856 13326 12923 11965 50070
+* 5p + 28 12027 13423 12753 11856 50059
+* 5p + 29 11803 13516 12826 11904 50049
+* 5p + 30 12002 13224 13114 11694 50034
+* 5p + 31 11968 13235 12962 11854 50019
+* 5p + 32 11862 13273 13075 11789 49999
+* 5p + 33 11950 13263 13081 11681 49975
+* 5p + 34 11935 13447 12834 11736 49952
+* 5p + 35 11960 13504 12942 11503 49909
+* 5p + 36 11877 13259 13109 11618 49863
+* 5p + 37 12063 13283 12803 11623 49772
+* 5p + 38 12073 13094 12897 11593 49657
+* 5p + 39 12035 13040 12862 11605 49542
+* 5p + 40 11917 12968 12934 11575 49394
+* 5p + 41 11908 12996 12889 11429 49222
+* 5p + 42 11974 12822 12754 11427 48977
+* 5p + 43 11872 12773 12566 11515 48726
+* 5p + 44 11668 12878 12715 11190 48451
+* 5p + 45 11740 12429 12724 11234 48127
+* 5p + 46 11428 12580 12181 11565 47754
+* 5p + 47 11652 12357 12302 11033 47344
+* 5p + 48 11336 12296 12194 11106 46932
+* 5p + 49 11366 12141 12130 10830 46467
+* 5p + 50 11148 12148 11900 10750 45946
+* 5p + 51 11178 11783 11879 10550 45390
+* 5p + 52 11036 11666 11559 10515 44776
+* 5p + 53 10653 11583 11541 10421 44198
+* 5p + 54 10715 11172 11437 10281 43605
+* 5p + 55 10494 11095 11145 10223 42957
+* 5p + 56 10386 10955 10982 9942 42265
+* 5p + 57 10220 10553 10942 9876 41591
+* 5p + 58 10228 10501 10475 9678 40882
+* 5p + 59 9961 10366 10436 9411 40174
+* 5p + 60 9836 10016 10254 9332 39438
+* 5p + 61 9426 10011 10030 9214 38681
+* 5p + 62 9197 9750 9795 9165 37907
+* 5p + 63 9177 9556 9658 8708 37099
+* 5p + 64 8974 9231 9442 8696 36343
+* 5p + 65 8756 9163 9124 8492 35535
+* 5p + 66 8679 8836 9026 8235 34776
+* 5p + 67 8344 8709 8882 8071 34006
+* 5p + 68 8294 8489 8608 7847 33238
+* 5p + 69 7994 8201 8552 7687 32434
+* 5p + 70 7905 8033 8178 7587 31703
+* 5p - -10 12311 12582 12894 12101 49888
+* 5p - -9 11999 13025 12521 12343 49888
+* 5p - -8 11809 12998 12799 12282 49888
+* 5p - -7 12097 12631 12975 12186 49889
+* 5p - -6 12015 12891 12437 12546 49889
+* 5p - -5 11948 13203 12842 11896 49889
+* 5p - -4 12069 12944 12779 12097 49889
+* 5p - -3 12237 13423 12799 11431 49890
+* 5p - -2 11802 13826 14641 9623 49892
+* 5p - -1 23600 2929 21085 2279 49893
+* 5p - 1 15823 17233 11131 5710 49897
+* 5p - 2 10985 9410 14757 14744 49896
+* 5p - 3 12871 14102 10418 12504 49895
+* 5p - 4 12468 13642 11659 12121 49890
+* 5p - 5 11804 12812 12945 12336 49897
+* 5p - 6 11848 13239 12141 12669 49897
+* 5p - 7 12077 12869 13010 11941 49897
+* 5p - 8 11098 13953 11370 13476 49897
+* 5p - 9 10730 13303 12274 13590 49897
+* 5p - 10 11301 13814 12754 12028 49897
+* 5p - 11 11549 13519 13170 11659 49897
+* 5p - 12 11935 13268 12993 11701 49897
+* 5p - 13 11668 13817 12568 11844 49897
+* 5p - 14 11391 13748 12907 11851 49897
+* 5p - 15 11766 13343 12809 11949 49867
+* 5p - 16 11636 13779 12685 11797 49897
+* 5p - 17 11622 13579 12991 11705 49897
+* 5p - 18 11717 13481 12752 11947 49897
+* 5p - 19 11646 13617 12922 11712 49897
+* 5p - 20 11532 13588 13044 11733 49897
+* 5p - 21 12026 13106 12951 11814 49897
+* 5p - 22 11663 13337 13029 11867 49896
+* 5p - 23 11646 13618 12971 11660 49895
+* 5p - 24 11944 13337 12812 11802 49895
+* 5p - 25 11809 13413 12688 11984 49894
+* 5p - 26 11675 13469 12876 11864 49884
+* 5p - 27 11798 13255 12881 11937 49871
+* 5p - 28 11941 13223 12844 11843 49851
+* 5p - 29 11861 13421 12898 11658 49838
+* 5p - 30 11919 13032 12986 11889 49826
+* 5p - 31 11856 13550 12709 11692 49807
+* 5p - 32 11917 13096 12990 11787 49790
+* 5p - 33 11938 13195 12994 11635 49762
+* 5p - 34 12075 13498 12620 11534 49727
+* 5p - 35 11807 13332 13009 11535 49683
+* 5p - 36 12014 13170 12972 11468 49624
+* 5p - 37 12034 13300 12588 11623 49545
+* 5p - 38 11920 13156 12889 11498 49463
+* 5p - 39 11937 12729 13032 11647 49345
+* 5p - 40 11896 13108 12668 11529 49201
+* 5p - 41 11958 12829 12736 11496 49019
+* 5p - 42 11892 12830 12610 11478 48810
+* 5p - 43 11827 12699 12584 11458 48568
+* 5p - 44 11822 12586 12682 11180 48270
+* 5p - 45 11808 12338 12494 11314 47954
+* 5p - 46 11696 12566 12220 11132 47614
+* 5p - 47 11480 12317 12415 10979 47191
+* 5p - 48 11589 11909 12111 11177 46786
+* 5p - 49 11323 12052 11850 11051 46276
+* 5p - 50 11092 11895 12228 10559 45774
+* 5p - 51 11250 11424 11872 10621 45167
+* 5p - 52 11076 11664 11570 10291 44601
+* 5p - 53 10652 11452 11625 10309 44038
+* 5p - 54 10732 11209 11223 10210 43374
+* 5p - 55 10448 11093 11132 10055 42728
+* 5p - 56 10230 10830 11026 9975 42061
+* 5p - 57 10341 10476 10742 9815 41374
+* 5p - 58 9917 10646 10446 9680 40689
+* 5p - 59 9875 10185 10415 9515 39990
+* 5p - 60 9664 10097 10227 9303 39291
+* 5p - 61 9450 9904 10114 9103 38571
+* 5p - 62 9358 9676 9904 8944 37882
+* 5p - 63 9296 9342 9734 8801 37173
+* 5p - 64 8860 9663 9251 8600 36374
+* 5p - 65 8804 9104 9255 8458 35621
+* 5p - 66 8798 8892 8922 8216 34828
+* 5p - 67 8555 8738 8680 8085 34058
+* 5p - 68 8232 8514 8657 7901 33304
+* 5p - 69 8120 8164 8454 7690 32428
+* 5p - 70 7830 8262 8101 7517 31710
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/dnacomp_genome.csv
new file mode 100644
index 0000000..2fc7659
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.245290724081,0.254598401178,0.255055853499,0.245055021242
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/lgdistribution.txt
new file mode 100644
index 0000000..53e2e21
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/lgdistribution.txt
@@ -0,0 +1,328 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 24 1
++ 25 4
++ 26 1
++ 27 2
++ 28 1
++ 29 7
++ 30 7
++ 31 13
++ 32 17
++ 33 18
++ 34 33
++ 35 42
++ 36 85
++ 37 113
++ 38 103
++ 39 140
++ 40 170
++ 41 240
++ 42 247
++ 43 271
++ 44 322
++ 45 366
++ 46 407
++ 47 409
++ 48 463
++ 49 517
++ 50 552
++ 51 607
++ 52 578
++ 53 588
++ 54 646
++ 55 697
++ 56 665
++ 57 708
++ 58 707
++ 59 729
++ 60 757
++ 61 770
++ 62 807
++ 63 746
++ 64 810
++ 65 754
++ 66 784
++ 67 762
++ 68 738
++ 69 784
++ 70 776
++ 71 762
++ 72 787
++ 73 711
++ 74 709
++ 75 755
++ 76 771
++ 77 700
++ 78 712
++ 79 696
++ 80 690
++ 81 657
++ 82 678
++ 83 661
++ 84 644
++ 85 663
++ 86 590
++ 87 604
++ 88 646
++ 89 578
++ 90 599
++ 91 679
++ 92 813
++ 93 1016
++ 94 6291
++ 95 329
++ 96 269
++ 97 260
++ 98 362
++ 99 249
++ 100 238
++ 101 231
++ 102 206
++ 103 207
++ 104 227
++ 105 207
++ 106 180
++ 107 173
++ 108 201
++ 109 172
++ 110 183
++ 111 183
++ 112 155
++ 113 165
++ 114 160
++ 115 163
++ 116 131
++ 117 152
++ 118 149
++ 119 146
++ 120 131
++ 121 132
++ 122 141
++ 123 121
++ 124 103
++ 125 107
++ 126 118
++ 127 103
++ 128 107
++ 129 99
++ 130 113
++ 131 91
++ 132 78
++ 133 101
++ 134 85
++ 135 67
++ 136 90
++ 137 71
++ 138 77
++ 139 70
++ 140 53
++ 141 53
++ 142 57
++ 143 45
++ 144 66
++ 145 46
++ 146 59
++ 147 61
++ 148 37
++ 149 40
++ 150 34
++ 151 45
++ 152 44
++ 153 42
++ 154 31
++ 155 30
++ 156 30
++ 157 32
++ 158 24
++ 159 26
++ 160 30
++ 161 28
++ 162 15
++ 163 18
++ 164 21
++ 165 13
++ 166 16
++ 167 10
++ 168 14
++ 169 9
++ 170 9
++ 171 9
++ 172 13
++ 173 7
++ 174 15
++ 175 13
++ 176 7
++ 177 9
++ 178 4
++ 179 8
++ 180 8
++ 181 3
++ 182 12
++ 183 7
++ 184 6
++ 185 6
++ 186 1
+- 25 1
+- 26 4
+- 27 5
+- 28 6
+- 29 6
+- 30 10
+- 31 10
+- 32 20
+- 33 33
+- 34 38
+- 35 57
+- 36 69
+- 37 82
+- 38 119
+- 39 137
+- 40 174
+- 41 201
+- 42 235
+- 43 294
+- 44 308
+- 45 340
+- 46 420
+- 47 399
+- 48 501
+- 49 505
+- 50 604
+- 51 557
+- 52 558
+- 53 664
+- 54 645
+- 55 662
+- 56 684
+- 57 682
+- 58 695
+- 59 701
+- 60 722
+- 61 679
+- 62 714
+- 63 792
+- 64 744
+- 65 797
+- 66 770
+- 67 747
+- 68 808
+- 69 787
+- 70 748
+- 71 745
+- 72 713
+- 73 721
+- 74 681
+- 75 738
+- 76 710
+- 77 665
+- 78 671
+- 79 693
+- 80 680
+- 81 703
+- 82 733
+- 83 685
+- 84 687
+- 85 638
+- 86 617
+- 87 645
+- 88 671
+- 89 564
+- 90 637
+- 91 699
+- 92 804
+- 93 1076
+- 94 6325
+- 95 345
+- 96 237
+- 97 263
+- 98 326
+- 99 235
+- 100 232
+- 101 236
+- 102 224
+- 103 225
+- 104 193
+- 105 210
+- 106 196
+- 107 196
+- 108 195
+- 109 187
+- 110 172
+- 111 186
+- 112 163
+- 113 173
+- 114 162
+- 115 172
+- 116 162
+- 117 145
+- 118 112
+- 119 134
+- 120 136
+- 121 119
+- 122 145
+- 123 133
+- 124 129
+- 125 98
+- 126 104
+- 127 90
+- 128 92
+- 129 98
+- 130 92
+- 131 92
+- 132 101
+- 133 86
+- 134 79
+- 135 69
+- 136 73
+- 137 85
+- 138 79
+- 139 60
+- 140 53
+- 141 64
+- 142 67
+- 143 60
+- 144 68
+- 145 53
+- 146 45
+- 147 39
+- 148 46
+- 149 42
+- 150 36
+- 151 42
+- 152 38
+- 153 34
+- 154 38
+- 155 42
+- 156 37
+- 157 32
+- 158 22
+- 159 20
+- 160 20
+- 161 27
+- 162 19
+- 163 23
+- 164 17
+- 165 24
+- 166 12
+- 167 15
+- 168 20
+- 169 10
+- 170 9
+- 171 9
+- 172 4
+- 173 7
+- 174 5
+- 175 8
+- 176 7
+- 177 7
+- 178 8
+- 179 6
+- 180 7
+- 181 3
+- 182 3
+- 183 7
+- 184 3
+- 185 4
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/misincorporation.txt
new file mode 100644
index 0000000..d1f1150
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_GGCTAC/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 6181 12579 16426 14915 50101 137 74 74 206 34 36 50 112 67 113 33 41 0 0 0 0 0 0 0 0 27
+* 3p + 2 14305 14460 9715 11619 50099 125 36 58 168 33 36 34 76 43 78 29 32 0 0 0 0 0 0 0 0 25
+* 3p + 3 12636 11090 13554 12817 50097 95 53 55 185 17 26 35 41 30 44 20 46 0 0 0 0 0 0 0 0 25
+* 3p + 4 12293 11921 13537 12349 50100 70 42 61 192 26 23 24 34 26 34 22 30 1 0 0 0 0 1 0 0 25
+* 3p + 5 12456 12997 12675 11970 50098 61 53 61 148 25 19 20 27 40 30 25 29 0 0 0 1 1 0 0 2 25
+* 3p + 6 13052 12230 13118 11695 50095 71 39 67 175 20 23 16 23 33 24 11 14 2 1 0 0 4 1 1 1 25
+* 3p + 7 12549 13239 12541 11764 50093 64 53 74 189 25 11 28 17 15 25 11 22 2 0 1 4 2 3 1 3 25
+* 3p + 8 13565 11679 13911 10939 50094 78 34 76 172 16 19 23 13 27 25 8 23 1 7 2 1 6 0 1 2 24
+* 3p + 9 13508 12462 13284 10837 50091 74 29 54 177 20 15 18 17 26 28 16 21 2 5 2 3 4 3 0 5 23
+* 3p + 10 12384 13060 13518 11132 50094 55 39 70 185 15 18 19 21 14 25 6 25 0 3 3 4 2 3 0 2 20
+* 3p + 11 12067 13253 13397 11374 50091 48 46 64 168 24 14 10 23 22 20 8 16 1 2 2 1 5 1 2 2 20
+* 3p + 12 12111 13257 12917 11810 50095 62 33 61 160 21 18 17 16 24 22 16 13 1 2 6 1 4 0 2 1 20
+* 3p + 13 12035 13073 13504 11481 50093 53 46 75 129 11 18 20 14 17 20 16 13 5 4 2 6 1 2 2 2 19
+* 3p + 14 12263 13091 13345 11396 50095 63 43 79 154 14 6 28 19 19 20 19 20 3 1 4 5 1 1 2 2 18
+* 3p + 15 12081 13124 13048 11511 49764 55 38 69 178 14 9 15 10 29 29 12 18 1 4 1 3 5 2 1 2 17
+* 3p + 16 12464 12808 13284 11539 50095 50 43 55 175 16 12 23 21 25 21 8 15 3 8 2 6 1 2 1 1 17
+* 3p + 17 11957 13388 13331 11421 50097 37 44 83 160 23 17 19 13 22 36 13 13 0 7 4 4 1 1 1 0 16
+* 3p + 18 12214 13042 13219 11616 50091 46 52 73 163 19 18 21 11 26 26 11 20 1 3 1 7 2 1 1 4 16
+* 3p + 19 12211 13083 13360 11437 50091 47 44 70 153 18 17 14 18 20 23 14 25 1 8 7 3 2 1 3 3 15
+* 3p + 20 12090 13189 13286 11530 50095 48 45 85 140 26 21 10 12 33 19 15 23 8 4 5 4 1 1 2 3 14
+* 3p + 21 12038 13243 13141 11673 50095 64 43 84 144 22 10 9 16 21 24 14 23 1 3 4 3 2 2 0 0 11
+* 3p + 22 11944 13133 13295 11712 50084 45 46 64 162 13 12 18 13 24 21 13 22 1 1 4 2 1 1 2 2 11
+* 3p + 23 12261 13097 13213 11519 50090 53 38 75 142 17 12 12 17 24 26 10 23 2 2 3 5 0 5 0 2 8
+* 3p + 24 12135 13082 13075 11800 50092 55 39 62 160 17 15 13 15 11 22 10 19 2 2 8 2 1 1 2 1 8
+* 3p + 25 12220 12875 13348 11638 50081 44 33 70 148 24 13 16 19 16 16 9 23 4 4 3 3 1 1 3 1 8
+* 3p + 26 12113 13142 13232 11584 50071 47 53 58 138 13 20 10 18 14 16 11 25 1 3 4 6 2 2 4 1 8
+* 3p + 27 12177 13142 12949 11792 50060 36 36 60 132 15 12 11 13 15 9 11 18 1 4 6 4 2 2 5 2 8
+* 3p + 28 12090 12959 13300 11706 50055 65 43 66 152 9 15 8 15 19 16 9 23 2 2 4 3 2 0 2 1 8
+* 3p + 29 12195 12990 13105 11752 50042 49 44 61 147 18 20 18 11 37 18 19 21 2 4 1 6 0 3 4 0 8
+* 3p + 30 12178 13248 12909 11686 50021 49 37 68 137 17 9 9 19 23 16 10 21 5 1 3 1 2 1 3 2 8
+* 3p + 31 12017 13167 13105 11723 50012 45 50 59 138 16 15 6 23 23 15 10 26 3 6 3 1 4 1 1 1 8
+* 3p + 32 11957 13362 12937 11732 49988 40 45 70 146 13 17 9 14 14 16 10 19 2 0 3 2 4 2 4 0 7
+* 3p + 33 11864 13099 12997 12009 49969 45 33 72 127 8 16 12 6 24 14 10 25 3 5 2 1 2 2 1 2 7
+* 3p + 34 11925 13095 13056 11868 49944 39 51 77 162 14 14 5 10 23 16 8 16 2 2 4 2 3 2 2 0 7
+* 3p + 35 12074 13168 13078 11584 49904 46 42 58 124 20 15 9 14 16 15 9 27 4 5 1 2 1 0 3 1 7
+* 3p + 36 12092 13122 12844 11799 49857 41 41 69 139 14 19 9 14 20 10 8 21 1 4 1 2 0 4 0 1 7
+* 3p + 37 11858 13211 12928 11769 49766 41 45 67 131 14 12 12 14 20 19 7 22 1 3 1 2 1 2 2 2 6
+* 3p + 38 11967 13050 12871 11762 49650 40 36 62 124 16 16 15 17 15 18 12 14 0 6 2 0 3 1 2 1 5
+* 3p + 39 11921 13099 12708 11808 49536 38 44 65 133 10 12 8 11 19 13 5 20 2 2 3 0 2 3 4 0 5
+* 3p + 40 11809 12815 12839 11929 49392 51 44 58 112 24 12 9 13 19 20 14 26 2 3 1 3 1 2 0 1 5
+* 3p + 41 11793 13046 12657 11726 49222 42 41 64 113 21 22 8 11 25 20 10 19 4 3 3 0 1 0 0 0 5
+* 3p + 42 11847 13077 12460 11593 48977 35 45 69 149 16 17 7 13 19 22 9 26 1 1 2 7 1 1 1 1 5
+* 3p + 43 11522 12838 12550 11813 48723 39 51 70 114 16 8 2 14 19 19 10 25 3 0 8 3 3 1 1 1 4
+* 3p + 44 11650 12823 12378 11594 48445 39 57 57 124 13 11 8 12 20 18 11 19 5 6 4 2 3 2 2 0 4
+* 3p + 45 11607 12589 12204 11720 48120 49 42 55 135 19 18 5 14 16 18 15 24 2 1 5 4 3 1 2 3 4
+* 3p + 46 11570 12505 12271 11406 47752 47 43 60 138 23 11 7 5 15 17 9 20 9 0 4 0 2 2 0 1 4
+* 3p + 47 11295 12455 12230 11361 47341 37 43 60 126 18 5 9 8 8 22 10 18 2 3 4 2 1 2 1 3 4
+* 3p + 48 11203 12435 11861 11426 46925 39 44 57 116 12 12 14 24 9 22 14 26 3 1 1 3 1 1 4 3 4
+* 3p + 49 11286 12137 11875 11162 46460 34 36 63 123 12 11 6 13 10 15 13 16 0 4 3 0 1 4 1 2 4
+* 3p + 50 11047 12129 11611 11154 45941 26 45 58 106 12 10 6 9 8 24 11 15 1 2 1 1 5 2 0 3 3
+* 3p + 51 11055 11969 11441 10921 45386 32 45 61 111 15 13 8 12 11 23 10 17 4 2 2 1 2 2 1 3 3
+* 3p + 52 10797 11887 11400 10692 44776 30 49 46 130 5 10 8 6 18 13 11 29 1 3 1 3 2 0 1 2 3
+* 3p + 53 10589 11581 11179 10848 44197 31 46 51 108 12 8 8 8 20 16 6 21 5 1 4 2 1 1 0 1 2
+* 3p + 54 10663 11417 10934 10592 43606 32 40 63 99 8 14 7 11 16 17 7 20 2 1 0 3 3 0 0 1 1
+* 3p + 55 10404 11279 10962 10315 42960 35 40 60 91 12 10 4 15 15 10 9 12 4 2 1 1 1 2 1 0 1
+* 3p + 56 10332 11026 10717 10190 42265 39 40 54 120 16 12 9 14 11 17 12 17 1 5 0 3 1 0 1 2 1
+* 3p + 57 10124 10955 10305 10209 41593 34 49 42 83 16 12 5 13 17 13 10 24 1 3 0 2 2 3 1 1 1
+* 3p + 58 9889 10806 10327 9859 40881 31 34 35 94 20 11 5 18 18 13 9 14 2 1 2 1 0 2 1 1 1
+* 3p + 59 9660 10610 10168 9735 40173 33 24 45 76 8 14 6 9 11 11 5 16 1 3 1 1 0 1 0 1 1
+* 3p + 60 9609 10373 9899 9561 39442 31 37 50 88 9 12 9 13 13 9 8 17 1 1 0 5 2 2 1 1 1
+* 3p + 61 9340 10145 9769 9431 38685 31 32 47 84 10 13 9 9 11 14 8 28 5 3 2 2 0 3 0 1 1
+* 3p + 62 9225 10012 9478 9198 37913 26 36 48 84 11 11 11 8 11 12 7 15 3 4 0 1 0 0 0 2 1
+* 3p + 63 8932 9836 9303 9027 37098 25 38 36 83 10 11 9 9 17 21 5 20 1 3 2 1 4 1 0 2 1
+* 3p + 64 8694 9491 9250 8919 36354 30 35 51 70 7 13 3 11 11 15 6 15 1 4 3 1 2 3 0 0 1
+* 3p + 65 8743 9276 8866 8657 35542 31 40 39 67 9 3 3 10 12 9 2 23 0 0 0 1 1 3 0 1 1
+* 3p + 66 8447 9116 8629 8586 34778 25 24 28 75 9 13 7 9 13 12 6 13 2 0 2 1 1 4 2 2 0
+* 3p + 67 8204 8905 8576 8321 34006 29 32 47 77 11 3 4 9 7 11 6 10 1 3 0 0 2 0 1 2 0
+* 3p + 68 8019 8707 8341 8172 33239 23 33 30 70 5 12 8 5 7 12 7 17 0 0 1 1 1 1 1 0 0
+* 3p + 69 7914 8603 7999 7967 32483 25 32 28 54 21 9 5 3 16 9 9 19 0 2 0 2 0 0 0 0 0
+* 3p + 70 7700 8219 8073 7717 31709 37 25 22 66 7 12 3 7 7 14 4 13 0 5 3 1 3 1 2 1 0
+* 3p - 1 6231 12373 16505 14787 49896 138 65 90 188 32 30 49 102 71 116 40 46 0 0 0 0 0 0 0 0 27
+* 3p - 2 14296 14356 9794 11449 49895 111 64 76 164 36 25 32 83 37 82 24 27 0 0 0 0 0 0 1 0 26
+* 3p - 3 12504 11159 13574 12653 49890 121 55 61 182 23 23 37 41 28 60 13 44 0 0 0 0 1 1 1 1 26
+* 3p - 4 12404 11868 13450 12169 49891 92 44 59 164 22 24 27 36 26 31 17 27 0 0 2 1 1 1 0 3 23
+* 3p - 5 12351 12762 12712 12065 49890 64 59 65 184 30 20 20 26 29 27 22 25 2 0 1 0 3 2 2 0 22
+* 3p - 6 12831 12395 12999 11661 49886 70 46 67 136 27 13 20 18 29 30 13 23 1 4 1 2 2 4 1 3 21
+* 3p - 7 12546 12970 12476 11897 49889 68 43 56 217 13 16 24 17 30 29 14 13 1 7 5 0 5 2 0 1 20
+* 3p - 8 13520 11819 13714 10834 49887 63 43 73 166 16 19 16 29 21 14 10 27 1 3 5 1 4 2 2 2 19
+* 3p - 9 13681 12507 13100 10605 49893 75 31 92 188 17 18 15 23 26 19 8 19 3 2 5 2 1 1 1 1 18
+* 3p - 10 12561 12831 13420 11074 49886 62 44 69 174 21 12 16 24 22 26 10 26 4 3 3 3 3 2 0 2 17
+* 3p - 11 12024 13411 13119 11335 49889 48 32 65 152 15 14 20 20 29 23 15 27 6 6 2 3 5 1 0 2 17
+* 3p - 12 11746 13208 13152 11781 49887 42 33 54 143 18 19 19 23 22 19 11 18 5 6 2 1 2 2 3 2 16
+* 3p - 13 12152 12886 13413 11435 49886 55 29 65 163 15 11 20 14 27 21 13 26 2 2 3 5 4 1 3 0 16
+* 3p - 14 12011 13021 13460 11392 49884 65 40 69 170 24 18 13 14 22 16 17 22 2 4 2 1 0 4 2 3 15
+* 3p - 15 12052 13096 13091 11335 49574 56 45 65 147 14 18 14 14 20 25 13 22 1 0 3 2 3 0 0 2 14
+* 3p - 16 12230 13084 13201 11374 49889 52 42 67 138 23 11 7 12 22 19 8 15 7 2 4 0 0 2 2 2 14
+* 3p - 17 11997 12991 13486 11414 49888 67 34 76 178 22 9 10 13 23 23 15 16 3 3 2 6 0 3 1 1 14
+* 3p - 18 12067 13027 13194 11605 49893 45 30 57 173 24 15 7 15 24 26 19 15 3 1 1 2 0 3 0 1 14
+* 3p - 19 12157 12883 13228 11618 49886 58 40 69 186 21 12 20 14 35 25 8 18 2 2 4 1 1 0 3 3 14
+* 3p - 20 11965 13043 13378 11502 49888 46 37 64 170 15 12 16 18 19 17 13 34 2 3 5 0 2 0 1 4 14
+* 3p - 21 11896 13250 12919 11819 49884 51 34 61 164 18 13 17 10 27 20 13 26 2 6 2 5 3 1 1 1 13
+* 3p - 22 12106 12796 13304 11667 49873 48 45 60 165 15 12 9 16 21 21 6 16 3 5 3 4 1 1 4 2 13
+* 3p - 23 12201 12861 13269 11554 49885 55 36 65 140 15 8 12 11 20 15 11 27 2 5 3 3 4 2 2 0 13
+* 3p - 24 12171 13026 13055 11633 49885 55 38 56 160 10 12 12 14 21 28 9 24 1 6 0 3 2 1 1 1 12
+* 3p - 25 12299 12887 13114 11575 49875 48 45 55 152 21 9 4 16 22 25 16 26 1 7 3 5 2 2 0 2 12
+* 3p - 26 12382 12914 13118 11464 49878 47 45 56 160 14 16 7 10 13 26 10 22 1 7 2 3 1 1 1 0 12
+* 3p - 27 12197 13006 12954 11711 49868 39 40 58 140 12 8 6 14 22 21 7 11 0 3 5 1 0 2 1 1 12
+* 3p - 28 12133 12876 13004 11835 49848 48 43 71 144 9 18 10 13 21 17 6 19 1 5 3 2 0 0 3 1 12
+* 3p - 29 12153 13016 13142 11516 49827 48 56 69 148 17 21 14 15 33 20 10 25 0 3 2 1 3 2 2 2 12
+* 3p - 30 12098 13216 12756 11739 49809 45 55 72 140 12 4 7 11 18 18 8 17 5 3 2 7 2 0 1 4 11
+* 3p - 31 12020 12993 13137 11648 49798 45 39 53 150 24 13 11 15 17 21 8 19 5 1 2 3 3 2 0 2 11
+* 3p - 32 12078 13031 13106 11570 49785 43 55 73 127 9 7 12 13 18 16 6 19 1 0 4 6 0 1 3 1 11
+* 3p - 33 11856 13144 12790 11965 49755 43 46 71 145 24 11 5 6 13 16 10 20 3 4 1 0 1 0 2 4 10
+* 3p - 34 11964 13157 13030 11568 49719 32 37 67 133 22 18 11 20 16 22 8 16 4 1 1 0 1 3 2 1 10
+* 3p - 35 11844 13036 13043 11750 49673 36 54 75 124 18 12 8 14 16 14 9 18 1 4 1 4 4 2 2 2 10
+* 3p - 36 12173 13059 12646 11736 49614 38 48 57 152 10 15 12 10 15 16 8 20 3 2 0 2 2 2 4 2 9
+* 3p - 37 12105 12927 12763 11750 49545 43 37 73 122 26 19 8 11 20 24 11 18 2 1 5 0 0 2 1 2 9
+* 3p - 38 12106 12934 12790 11634 49464 35 43 83 121 22 10 9 6 19 20 8 22 6 1 5 1 0 0 1 1 8
+* 3p - 39 12003 12945 12525 11869 49342 49 37 79 114 8 9 11 14 20 20 11 21 5 2 5 2 2 0 1 0 8
+* 3p - 40 11977 12908 12644 11668 49197 40 51 58 157 4 11 6 16 19 19 8 15 1 3 5 0 1 2 1 1 8
+* 3p - 41 11758 12929 12734 11593 49014 43 42 67 130 10 14 10 9 19 16 10 17 1 2 1 0 1 1 1 1 8
+* 3p - 42 11666 12803 12414 11921 48804 41 43 55 121 13 16 5 12 24 23 14 21 3 2 2 3 3 2 1 2 8
+* 3p - 43 11790 12922 12430 11418 48560 27 40 54 146 19 13 9 14 16 12 8 16 1 2 2 1 2 3 2 2 8
+* 3p - 44 11565 12840 12543 11315 48263 31 35 55 126 13 21 6 14 18 12 6 24 1 6 0 2 5 1 1 1 8
+* 3p - 45 11656 12569 12198 11529 47952 46 45 62 133 17 22 11 13 16 14 16 27 3 2 1 3 2 2 0 1 7
+* 3p - 46 11539 12604 11995 11471 47609 48 48 72 119 21 19 9 13 17 17 7 14 1 0 3 1 2 1 2 0 7
+* 3p - 47 11396 12471 12074 11239 47180 39 33 65 111 16 11 14 18 13 12 7 22 0 3 3 1 6 1 1 2 7
+* 3p - 48 11414 12265 11864 11237 46780 28 39 48 98 12 13 3 12 14 10 7 21 2 2 1 0 2 3 1 1 7
+* 3p - 49 11206 12035 11890 11140 46271 38 39 49 111 17 13 8 11 15 15 9 17 2 2 0 4 5 0 1 3 5
+* 3p - 50 11030 12157 11611 10976 45774 36 41 69 96 21 14 6 16 21 18 1 14 4 1 1 1 0 1 0 3 5
+* 3p - 51 10719 11982 11468 10992 45161 46 46 57 111 15 13 4 10 18 15 8 15 1 4 2 3 2 1 3 0 5
+* 3p - 52 10736 11665 11310 10890 44601 34 45 57 121 14 14 10 10 11 20 7 22 3 0 4 2 3 0 2 1 4
+* 3p - 53 10485 11768 11124 10657 44034 42 38 57 111 9 7 7 10 13 17 3 14 6 1 6 2 3 4 1 2 4
+* 3p - 54 10428 11465 11057 10416 43366 37 53 55 103 17 15 11 15 9 11 9 19 5 6 4 3 3 1 5 2 4
+* 3p - 55 10326 11284 10795 10316 42721 32 41 53 107 9 11 9 17 15 15 0 17 2 4 2 1 2 4 1 1 4
+* 3p - 56 10059 11180 10603 10222 42064 33 33 48 105 9 10 12 5 17 6 7 16 0 2 3 7 0 0 1 1 3
+* 3p - 57 9948 10929 10295 10203 41375 35 31 47 102 16 11 5 15 12 6 8 14 2 5 1 1 2 0 0 0 2
+* 3p - 58 9917 10593 10227 9952 40689 30 31 53 106 5 10 2 9 10 15 4 21 0 2 0 5 0 1 2 2 1
+* 3p - 59 9784 10600 10039 9565 39988 33 44 44 100 16 9 5 12 16 13 6 14 1 1 1 2 3 2 3 1 1
+* 3p - 60 9481 10474 9721 9611 39287 35 42 46 77 6 10 7 7 12 8 4 10 2 1 0 0 1 2 2 3 1
+* 3p - 61 9435 10146 9688 9303 38572 41 39 46 84 9 10 7 8 15 13 7 16 2 1 2 2 1 2 2 1 1
+* 3p - 62 9226 9931 9652 9080 37889 32 34 43 79 19 9 8 9 9 16 8 14 1 0 0 4 0 2 1 0 1
+* 3p - 63 8902 9855 9321 9099 37177 34 35 47 77 13 14 5 14 13 14 5 16 2 2 1 0 1 2 1 0 0
+* 3p - 64 8884 9467 9155 8870 36376 27 21 38 79 9 11 5 9 9 16 7 17 1 0 2 0 2 3 1 2 0
+* 3p - 65 8495 9298 9163 8678 35634 26 40 40 66 12 7 8 11 17 13 9 13 0 3 1 0 1 1 1 1 0
+* 3p - 66 8240 9219 8749 8621 34829 19 30 39 70 9 12 3 7 10 9 5 17 1 1 2 3 1 1 0 2 0
+* 3p - 67 8122 8827 8617 8491 34057 20 26 30 74 5 8 6 15 11 12 6 17 3 2 2 4 2 1 1 0 0
+* 3p - 68 7980 8667 8476 8189 33312 23 30 34 65 12 11 4 7 10 9 7 18 1 3 2 5 0 0 0 0 0
+* 3p - 69 7715 8563 8109 8090 32477 28 33 33 66 5 11 10 11 16 9 11 18 2 2 1 2 1 0 0 1 0
+* 3p - 70 7616 8429 7974 7692 31711 36 30 41 59 8 7 14 9 10 9 10 13 2 3 1 1 1 2 2 1 0
+* 5p + 1 15927 17322 11077 5773 50099 59 67 114 50 22 18 12 24 25 23 20 22 0 0 0 0 0 0 0 0 21
+* 5p + 2 10842 9337 14965 14956 50100 45 76 91 77 13 16 14 18 28 14 8 30 1 0 2 1 0 1 0 0 17
+* 5p + 3 12805 14198 10545 12547 50095 35 65 55 82 21 14 15 14 17 26 13 16 2 1 1 3 0 0 0 2 14
+* 5p + 4 12506 13802 11587 12191 50086 38 57 58 105 22 15 15 18 13 15 12 16 1 2 3 1 0 4 0 3 13
+* 5p + 5 12145 12763 12836 12348 50092 47 65 43 102 10 14 6 19 6 19 10 26 3 4 0 3 1 2 1 3 13
+* 5p + 6 12030 13060 12060 12943 50093 30 48 57 104 16 16 3 11 18 24 6 25 1 0 1 2 3 3 0 2 13
+* 5p + 7 12442 12581 12775 12295 50093 43 48 78 97 14 21 3 14 16 22 7 21 5 3 5 2 4 2 3 1 13
+* 5p + 8 11160 13891 11606 13439 50096 46 56 50 105 15 13 6 20 18 15 10 27 5 3 2 2 0 1 3 3 13
+* 5p + 9 10809 13470 12444 13374 50097 28 47 67 122 12 14 4 11 7 10 7 22 1 1 0 2 4 0 2 0 13
+* 5p + 10 11377 13751 12603 12364 50095 36 37 84 120 14 12 4 19 14 13 8 27 3 3 3 1 3 2 2 1 13
+* 5p + 11 11691 13410 13066 11929 50096 32 34 82 126 17 22 16 11 23 18 6 27 5 5 2 3 1 4 2 0 13
+* 5p + 12 11966 13349 12998 11786 50099 44 45 82 129 22 17 13 11 18 7 10 27 3 5 2 3 1 1 2 0 13
+* 5p + 13 11807 13789 12631 11869 50096 28 47 68 130 18 15 4 11 19 15 8 22 1 2 0 3 2 2 2 0 13
+* 5p + 14 11634 13572 12656 12235 50097 36 43 76 142 13 16 8 9 10 7 5 22 4 1 2 1 3 1 2 0 12
+* 5p + 15 11771 13327 13004 11971 50073 39 35 63 115 14 8 10 8 16 20 9 19 3 3 1 0 1 0 1 2 12
+* 5p + 16 11725 13572 12662 12140 50099 32 47 79 121 13 18 10 9 19 20 12 18 1 5 3 0 1 0 2 1 12
+* 5p + 17 11765 13579 12910 11844 50098 46 53 73 114 9 13 11 9 10 18 7 16 1 3 6 2 2 1 2 0 12
+* 5p + 18 11739 13577 12887 11894 50097 24 40 70 124 19 12 9 8 16 30 5 28 0 4 4 2 0 4 2 0 12
+* 5p + 19 11777 13607 12821 11890 50095 33 47 79 133 18 9 10 14 16 13 7 19 1 3 0 0 4 1 0 3 12
+* 5p + 20 11810 13594 12902 11789 50095 44 48 80 113 18 14 9 12 17 20 6 17 0 3 2 1 2 3 1 2 12
+* 5p + 21 12049 13253 13031 11763 50096 32 55 76 110 19 11 16 15 16 18 5 16 3 1 4 3 1 3 2 1 12
+* 5p + 22 12120 13494 12605 11878 50097 27 34 77 123 28 10 4 8 14 25 9 21 2 4 4 2 1 0 4 1 12
+* 5p + 23 11746 13411 13021 11915 50093 38 42 67 124 15 12 12 13 11 13 5 22 5 4 4 2 4 1 1 2 12
+* 5p + 24 11864 13408 13074 11745 50091 34 31 58 130 14 15 10 10 25 20 18 19 3 5 2 2 2 2 4 1 12
+* 5p + 25 11930 13405 12615 12142 50092 34 44 67 128 12 12 6 16 18 15 10 23 2 1 2 3 3 0 1 1 12
+* 5p + 26 11871 13477 12802 11928 50078 40 55 76 131 10 19 13 16 18 12 12 18 4 2 3 1 1 2 1 0 12
+* 5p + 27 11867 13198 12926 12075 50066 42 37 68 144 20 11 7 18 17 27 14 24 5 5 3 2 2 1 1 1 11
+* 5p + 28 12028 13330 12720 11972 50050 32 34 62 125 18 11 12 21 12 30 14 20 3 3 5 5 5 1 2 2 10
+* 5p + 29 11841 13432 12806 11965 50044 34 41 60 136 21 15 11 17 18 10 11 30 6 2 1 2 1 1 2 2 10
+* 5p + 30 12027 13154 13085 11763 50029 42 49 69 120 10 18 13 12 25 16 15 24 1 1 0 2 3 1 1 1 9
+* 5p + 31 11995 13161 12931 11923 50010 38 56 78 117 12 14 7 10 21 23 8 17 2 2 0 1 3 3 4 0 9
+* 5p + 32 11884 13185 13053 11870 49992 40 42 65 120 18 14 9 16 22 21 19 24 3 4 1 1 1 2 1 3 9
+* 5p + 33 11991 13170 13076 11733 49970 42 48 66 123 18 17 13 14 16 11 14 35 2 4 3 1 1 3 1 0 9
+* 5p + 34 11955 13353 12807 11830 49945 46 32 56 117 20 21 18 10 22 8 10 25 2 3 2 2 3 1 1 2 8
+* 5p + 35 11977 13403 12930 11587 49897 45 34 63 125 16 9 12 7 19 18 10 16 2 2 5 6 4 4 1 3 8
+* 5p + 36 11919 13200 13071 11669 49859 46 45 78 111 14 14 8 11 14 16 19 26 2 5 1 0 1 2 0 2 7
+* 5p + 37 12100 13164 12792 11708 49764 44 40 76 135 26 15 10 16 19 10 10 24 0 6 1 1 1 3 1 4 7
+* 5p + 38 12077 13035 12880 11657 49649 54 53 67 133 10 10 16 13 17 14 13 20 3 2 2 0 0 4 1 3 7
+* 5p + 39 12052 12943 12857 11684 49536 44 42 67 117 13 10 8 13 19 19 8 31 1 1 3 1 3 2 2 1 7
+* 5p + 40 11936 12907 12875 11672 49390 37 50 65 127 14 13 10 17 20 18 8 14 2 3 0 3 2 1 3 0 7
+* 5p + 41 11904 12911 12907 11496 49218 56 44 51 109 19 12 10 8 13 13 10 13 0 1 4 3 0 3 3 0 6
+* 5p + 42 11959 12726 12746 11542 48973 46 33 51 126 10 8 9 22 21 17 10 19 2 5 2 2 1 3 2 0 6
+* 5p + 43 11888 12645 12561 11628 48722 44 42 57 157 17 14 7 11 21 12 15 19 2 2 2 3 1 3 2 1 6
+* 5p + 44 11705 12777 12674 11286 48442 36 49 74 149 8 14 10 13 22 19 8 24 2 2 2 3 3 3 2 2 6
+* 5p + 45 11751 12321 12707 11345 48124 50 36 60 160 18 12 12 12 19 13 12 18 1 1 4 3 2 1 1 1 6
+* 5p + 46 11422 12485 12199 11640 47746 55 46 45 133 20 25 13 15 19 25 9 22 1 2 6 2 4 3 2 1 5
+* 5p + 47 11655 12286 12273 11121 47335 36 33 60 121 19 19 16 20 19 28 8 28 0 4 3 2 2 5 3 2 5
+* 5p + 48 11335 12223 12180 11193 46931 51 38 60 123 13 17 10 15 24 21 12 22 3 5 5 4 2 2 0 1 5
+* 5p + 49 11361 12032 12125 10946 46464 53 51 61 137 12 11 10 13 17 18 6 23 2 5 1 6 2 1 1 0 5
+* 5p + 50 11162 12063 11916 10807 45948 43 43 51 119 18 15 9 12 14 19 16 31 1 3 7 5 1 0 1 1 5
+* 5p + 51 11171 11695 11890 10635 45391 47 38 61 117 11 13 13 8 24 22 15 24 1 2 3 6 1 0 1 1 5
+* 5p + 52 11055 11595 11536 10595 44781 33 41 47 118 20 10 16 11 21 19 11 20 4 4 2 1 1 0 0 0 4
+* 5p + 53 10649 11484 11536 10524 44193 48 49 44 123 15 12 11 16 27 30 13 24 3 4 1 3 3 2 2 0 4
+* 5p + 54 10671 11088 11448 10400 43607 60 36 37 127 9 4 16 11 16 19 9 14 2 3 1 3 3 1 0 0 4
+* 5p + 55 10494 11047 11141 10280 42962 45 33 50 102 15 16 10 12 10 18 11 19 3 2 2 3 0 0 1 1 3
+* 5p + 56 10376 10861 11003 10026 42266 43 31 39 125 13 17 12 20 19 21 17 22 2 2 2 2 1 0 1 1 2
+* 5p + 57 10213 10489 10945 9950 41597 37 38 46 106 16 17 10 21 15 25 14 20 0 2 1 1 1 1 0 1 2
+* 5p + 58 10210 10432 10490 9755 40887 48 40 58 117 8 16 17 16 20 23 16 25 5 3 0 1 0 0 1 0 2
+* 5p + 59 9968 10273 10425 9505 40171 51 26 44 128 16 20 15 16 18 20 9 16 4 2 2 2 2 2 0 2 2
+* 5p + 60 9827 9928 10265 9426 39446 50 29 44 110 13 10 11 13 21 13 9 19 1 0 2 1 0 1 1 1 1
+* 5p + 61 9404 9933 10035 9310 38682 44 32 30 114 16 19 8 18 18 23 11 17 3 0 1 2 5 2 0 1 1
+* 5p + 62 9190 9687 9792 9245 37914 50 30 43 111 12 12 14 16 19 10 11 16 3 2 3 5 2 0 0 0 1
+* 5p + 63 9171 9452 9672 8807 37102 44 28 34 115 12 9 8 10 17 19 10 17 1 3 6 3 2 1 0 0 0
+* 5p + 64 8974 9151 9458 8770 36353 37 24 37 99 20 15 4 14 20 8 12 16 2 1 2 3 4 1 0 2 0
+* 5p + 65 8738 9138 9136 8527 35539 45 41 35 96 8 10 19 11 8 15 8 19 1 2 4 0 2 2 1 4 0
+* 5p + 66 8672 8736 9036 8340 34784 49 25 42 115 17 10 7 7 12 16 10 13 5 5 1 0 0 1 1 3 0
+* 5p + 67 8347 8622 8885 8153 34007 43 28 38 105 12 11 12 9 19 10 12 17 2 3 2 5 1 0 1 2 0
+* 5p + 68 8289 8426 8608 7914 33237 33 36 31 89 12 12 9 11 12 11 13 12 1 0 2 1 1 1 2 1 0
+* 5p + 69 7994 8155 8552 7735 32436 36 29 29 98 12 10 12 10 11 13 6 17 3 1 2 4 3 2 1 2 0
+* 5p + 70 7907 7964 8184 7656 31711 36 28 27 90 16 10 15 10 11 9 7 12 0 1 0 2 1 1 2 1 0
+* 5p - 1 15866 17239 11094 5694 49893 47 64 95 49 30 16 18 27 14 24 23 22 0 0 0 0 0 0 0 0 17
+* 5p - 2 11025 9409 14711 14748 49893 43 68 77 80 14 21 22 16 17 15 10 17 2 0 0 1 0 0 0 1 12
+* 5p - 3 12865 14080 10430 12516 49891 48 69 58 88 15 13 11 21 22 20 21 35 0 3 2 1 1 1 1 0 9
+* 5p - 4 12454 13598 11653 12182 49887 46 45 52 109 13 17 24 22 17 24 14 27 0 2 1 2 0 1 0 1 9
+* 5p - 5 11811 12789 12937 12355 49892 48 51 57 80 18 14 13 18 19 12 6 30 0 2 2 1 1 1 1 1 8
+* 5p - 6 11880 13179 12101 12731 49891 30 48 76 105 9 16 9 11 18 24 11 20 0 3 3 2 2 2 1 1 8
+* 5p - 7 12110 12813 12972 11993 49888 46 46 75 109 13 14 9 16 15 8 9 14 1 0 7 0 2 0 5 2 8
+* 5p - 8 11156 13914 11326 13495 49891 37 64 82 99 21 15 11 10 20 14 7 26 3 0 2 3 2 2 2 0 8
+* 5p - 9 10768 13205 12230 13687 49890 33 46 82 121 19 5 3 13 26 24 8 25 2 1 0 1 2 3 2 0 8
+* 5p - 10 11318 13767 12720 12086 49891 40 57 66 115 7 13 9 9 23 12 5 16 2 0 2 2 2 3 1 0 8
+* 5p - 11 11587 13425 13127 11750 49889 46 34 79 130 8 18 13 13 24 11 7 23 3 6 1 1 2 1 4 1 8
+* 5p - 12 11982 13197 12960 11752 49891 33 48 80 117 12 16 8 17 16 9 11 29 1 1 2 1 1 2 2 1 7
+* 5p - 13 11667 13746 12547 11928 49888 38 44 64 123 8 11 5 20 16 24 6 21 2 4 0 1 1 4 3 1 7
+* 5p - 14 11419 13706 12875 11890 49890 34 57 71 109 9 17 8 17 14 15 5 24 6 2 1 4 3 1 1 2 7
+* 5p - 15 11799 13279 12765 12019 49862 21 43 66 116 7 11 10 18 19 10 9 22 1 6 3 3 0 1 2 2 6
+* 5p - 16 11679 13694 12658 11863 49894 29 44 62 120 15 15 7 10 23 9 10 21 4 1 4 4 1 1 0 1 5
+* 5p - 17 11672 13519 12965 11736 49892 38 48 68 102 18 20 6 10 11 9 7 21 5 0 2 2 4 1 0 0 5
+* 5p - 18 11762 13416 12692 12018 49888 30 41 78 109 12 17 10 9 19 15 5 16 2 4 4 1 1 2 4 2 5
+* 5p - 19 11673 13534 12893 11786 49886 36 46 63 119 17 13 7 12 19 13 12 19 1 2 2 0 2 1 4 4 5
+* 5p - 20 11554 13492 13024 11822 49892 40 36 60 123 12 11 11 9 18 11 8 19 2 2 1 1 2 0 0 3 5
+* 5p - 21 12049 13035 12913 11891 49888 40 45 75 120 14 15 7 18 21 18 12 26 3 4 3 2 3 1 4 1 5
+* 5p - 22 11691 13253 13005 11937 49886 43 43 64 108 17 10 10 11 28 14 8 23 6 7 4 2 2 2 3 3 5
+* 5p - 23 11697 13535 12924 11730 49886 31 42 71 118 20 18 8 15 16 15 5 18 1 2 5 0 3 1 2 3 4
+* 5p - 24 11948 13248 12795 11898 49889 41 47 57 139 16 15 8 18 22 15 7 15 1 1 1 4 2 2 0 2 3
+* 5p - 25 11842 13336 12661 12050 49889 35 40 52 109 14 16 9 13 17 14 16 14 4 2 3 2 0 2 2 1 3
+* 5p - 26 11711 13403 12859 11900 49873 36 52 69 116 22 15 12 15 18 10 11 32 1 1 2 0 2 3 3 3 3
+* 5p - 27 11855 13158 12831 12020 49864 34 49 76 144 18 20 14 14 13 16 8 24 2 5 2 1 3 0 3 3 3
+* 5p - 28 11941 13145 12840 11919 49845 58 41 70 117 14 15 14 14 17 18 11 23 4 3 5 3 3 2 2 0 3
+* 5p - 29 11862 13352 12902 11716 49832 58 45 58 127 8 13 12 19 20 11 8 29 2 3 4 5 3 2 0 1 2
+* 5p - 30 11930 12976 12961 11953 49820 40 43 80 107 12 15 10 14 15 20 14 18 2 3 1 2 3 0 2 1 2
+* 5p - 31 11889 13441 12675 11798 49803 31 35 66 134 18 13 9 15 18 14 4 25 0 0 1 4 2 0 2 0 2
+* 5p - 32 11939 13027 12944 11874 49784 48 48 66 123 16 18 10 19 29 18 12 23 3 2 1 2 4 1 1 0 2
+* 5p - 33 11949 13125 12998 11680 49752 48 50 62 111 20 12 5 13 11 14 17 14 0 1 1 3 2 2 4 2 2
+* 5p - 34 12094 13386 12621 11618 49719 63 53 80 141 22 14 7 12 22 15 10 22 2 0 6 2 1 4 2 1 2
+* 5p - 35 11836 13223 12978 11638 49675 40 42 73 145 12 18 10 15 19 17 13 19 1 1 1 3 1 4 1 2 1
+* 5p - 36 12031 13103 12955 11528 49617 41 41 66 105 18 18 9 10 18 23 7 25 2 5 1 2 3 2 1 1 1
+* 5p - 37 12037 13213 12559 11730 49539 42 45 70 131 13 12 8 17 22 28 6 24 3 4 0 1 3 1 2 2 1
+* 5p - 38 11946 13033 12865 11615 49459 54 26 75 141 15 17 6 17 23 20 11 25 4 2 1 2 1 1 2 1 1
+* 5p - 39 11918 12610 13054 11760 49342 55 31 50 146 11 11 10 13 23 18 13 26 1 2 3 2 1 1 0 1 1
+* 5p - 40 11920 13003 12652 11622 49197 46 44 57 124 6 12 10 7 26 13 12 21 1 7 1 3 1 1 2 1 1
+* 5p - 41 11974 12760 12720 11562 49016 50 44 69 126 17 19 5 12 13 14 7 18 4 2 0 6 0 3 0 0 1
+* 5p - 42 11880 12731 12618 11572 48801 47 45 53 148 9 16 11 15 15 13 8 23 1 2 3 2 2 3 3 3 1
+* 5p - 43 11861 12636 12538 11528 48563 45 50 65 125 14 11 11 18 21 15 13 18 2 4 1 0 2 1 2 2 1
+* 5p - 44 11798 12465 12685 11310 48258 50 37 57 146 15 12 16 18 24 29 7 27 1 4 4 1 3 3 4 3 1
+* 5p - 45 11791 12284 12505 11367 47947 63 43 52 107 22 18 13 20 17 16 13 22 1 5 3 4 4 1 4 1 1
+* 5p - 46 11693 12495 12221 11198 47607 58 37 52 112 20 14 15 16 13 17 11 20 2 0 3 1 2 1 2 2 0
+* 5p - 47 11519 12189 12392 11087 47187 45 33 60 136 24 16 15 18 18 13 11 14 2 2 3 1 1 2 0 1 0
+* 5p - 48 11568 11831 12127 11256 46782 45 31 42 128 14 16 12 15 19 15 14 22 3 1 2 2 2 1 1 1 0
+* 5p - 49 11293 11924 11865 11192 46274 53 23 45 150 10 13 8 16 21 19 11 25 2 3 1 1 1 2 2 1 0
+* 5p - 50 11111 11812 12190 10655 45768 46 39 62 125 8 12 11 11 21 16 7 16 4 2 1 1 2 4 1 2 0
+* 5p - 51 11276 11338 11862 10684 45160 45 44 62 114 18 14 7 7 17 14 11 22 4 3 5 1 2 0 3 6 0
+* 5p - 52 11088 11566 11555 10388 44597 49 30 51 126 20 12 9 18 16 15 8 12 5 3 3 1 3 2 3 2 0
+* 5p - 53 10623 11348 11641 10423 44035 50 36 47 143 12 13 6 12 13 19 5 21 3 2 2 7 3 2 2 2 0
+* 5p - 54 10732 11121 11225 10297 43375 49 36 56 131 17 11 13 14 14 17 11 13 1 3 3 4 0 2 2 0 0
+* 5p - 55 10499 10956 11103 10169 42727 52 27 63 130 16 15 5 4 16 14 8 15 2 2 1 3 1 1 0 1 0
+* 5p - 56 10216 10768 11033 10047 42064 49 34 44 107 13 7 12 12 17 16 11 30 2 4 4 4 1 0 0 1 0
+* 5p - 57 10352 10359 10744 9917 41372 39 37 45 127 24 9 9 10 18 14 14 12 3 2 1 1 2 2 1 2 0
+* 5p - 58 9918 10555 10454 9763 40690 48 33 43 106 17 19 10 13 21 25 14 21 1 3 2 3 0 1 1 2 0
+* 5p - 59 9853 10121 10441 9579 39994 42 27 33 111 13 15 11 6 17 14 10 20 2 2 4 2 0 2 1 0 0
+* 5p - 60 9684 10029 10204 9372 39289 38 30 55 103 14 15 17 13 19 17 9 25 4 1 4 1 4 1 0 1 0
+* 5p - 61 9474 9809 10107 9186 38576 38 23 48 98 18 15 11 14 20 11 8 20 3 1 4 2 1 2 0 0 0
+* 5p - 62 9329 9615 9895 9048 37887 40 36 28 128 14 9 16 12 20 20 11 15 4 4 2 2 0 2 2 2 0
+* 5p - 63 9315 9256 9741 8864 37176 32 28 40 108 12 13 5 16 18 14 8 20 1 3 0 2 3 0 0 3 0
+* 5p - 64 8860 9575 9245 8701 36381 34 32 51 118 14 12 10 16 10 16 10 15 0 2 2 1 1 0 0 1 0
+* 5p - 65 8787 9061 9246 8540 35634 46 28 45 90 7 10 11 12 13 21 8 9 4 3 3 3 1 0 1 2 0
+* 5p - 66 8791 8822 8931 8283 34827 48 32 28 92 24 19 11 17 15 17 8 16 6 0 0 4 3 2 0 1 0
+* 5p - 67 8546 8654 8679 8181 34060 41 31 41 115 18 9 14 14 19 24 6 19 0 3 1 2 0 0 0 1 0
+* 5p - 68 8221 8479 8645 7966 33311 28 31 42 91 5 10 6 11 14 19 11 14 3 5 4 1 0 0 0 1 0
+* 5p - 69 8123 8106 8444 7759 32432 45 37 35 85 10 10 7 6 26 17 6 11 0 3 2 0 1 1 1 0 0
+* 5p - 70 7805 8184 8095 7630 31714 31 18 30 110 10 11 12 18 16 20 5 13 3 0 1 3 0 1 1 1 0
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/3pGtoA_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/3pGtoA_freq.txt
new file mode 100644
index 0000000..2c2278e
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/3pGtoA_freq.txt
@@ -0,0 +1,26 @@
+pos 5pG>A
+1 0.00772745392658919
+2 0.00997316863261277
+3 0.00731896647321924
+4 0.00695218975388505
+5 0.00621290067273447
+6 0.00530534351145038
+7 0.00596302921884317
+8 0.0055918663761801
+9 0.00448782676988668
+10 0.0051333932853717
+11 0.00467465882530348
+12 0.00431785470797667
+13 0.00378107217729859
+14 0.00377006345651362
+15 0.00403179357216911
+16 0.00447945052073612
+17 0.00437138053427984
+18 0.00455829723111697
+19 0.00466533729636179
+20 0.0046786690122394
+21 0.00432982642713358
+22 0.0037881657701341
+23 0.00375497819078324
+24 0.00332429024492759
+25 0.00450518664344666
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/5pCtoT_freq.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/5pCtoT_freq.txt
new file mode 100644
index 0000000..a77f3b6
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/5pCtoT_freq.txt
@@ -0,0 +1,26 @@
+pos 5pC>T
+1 0.00338759222576293
+2 0.00750250964230993
+3 0.00514366796736431
+4 0.00394578203207775
+5 0.00413158715310259
+6 0.00389531805461009
+7 0.00402113064732319
+8 0.00384341637010676
+9 0.00290697674418605
+10 0.00311400937866354
+11 0.00420835025287017
+12 0.00310791582415937
+13 0.00318985114028012
+14 0.00311229907363333
+15 0.00335971330446469
+16 0.003479726017362
+17 0.00282780175621372
+18 0.00346441602476689
+19 0.00263558409740525
+20 0.00363783346806791
+21 0.00260593700430546
+22 0.00343960352097049
+23 0.00314012338837785
+24 0.00326536811330068
+25 0.0037405550983766
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf
new file mode 100644
index 0000000..f1b7717
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Fragmisincorporation_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Length_plot.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Length_plot.pdf
new file mode 100644
index 0000000..7ad2453
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Length_plot.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Runtime_log.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Runtime_log.txt
new file mode 100644
index 0000000..285c81d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Runtime_log.txt
@@ -0,0 +1,4 @@
+2013-10-24 15:12:10,949 INFO main: Started with the command: /home/mischu/bin/mapDamage/bin/mapDamage --no-stats --merge-reference-sequences -t mapDamage plot for library 'Pi1889_id_TAGCTT' -i - -d /home/mischu/scratch/bam_pipeline/5af1a3db-891c-4058-91d2-a662ba97b928 -r 000_prefixes/Pi_nucl.fasta --downsample 100000
+2013-10-24 15:13:53,769 DEBUG main: BAM read in 105.623730 seconds
+2013-10-24 15:13:54,529 INFO main: Successful run
+2013-10-24 15:13:54,529 DEBUG main: Run completed in 106.384062 seconds
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_correct_prob.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_correct_prob.csv
new file mode 100644
index 0000000..d53203c
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_correct_prob.csv
@@ -0,0 +1,25 @@
+"","Position","C.T","G.A"
+"1",1,0.0879043004632763,0.0836071104754618
+"2",2,0.039886935209767,0.0341324192954408
+"3",3,0.0234462957519323,0.0201199436536477
+"4",4,0.0156748357696076,0.0138914238235195
+"5",5,0.0116434446674029,0.01025058083045
+"6",6,0.00923052375900713,0.00804768689601317
+"7",7,0.00765961797363144,0.00662859750540502
+"8",8,0.00657751328623983,0.00564476058338746
+"9",9,0.0057741378486381,0.00494609331724558
+"10",10,0.0051504530352889,0.00443047112227744
+"11",11,0.00465577461528243,0.00402989595280368
+"12",12,0.00421283924599604,0.00375626974056187
+"13",-12,0.00367761104535901,0.00427846486948603
+"14",-11,0.00396728379805689,0.00470593683005674
+"15",-10,0.00437608021009923,0.00519352095933294
+"16",-9,0.00486760268365033,0.00583602009498782
+"17",-8,0.00541692381368254,0.00675529383190953
+"18",-7,0.00619881587100285,0.00799169709415256
+"19",-6,0.00745088244049932,0.00969336653574586
+"20",-5,0.00944281994454415,0.0122756083598161
+"21",-4,0.012677355806506,0.0166202923091534
+"22",-3,0.0184896879821862,0.0246582145795596
+"23",-2,0.0313086526809579,0.0419357282494106
+"24",-1,0.0748792634433484,0.0949098964302959
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf
new file mode 100644
index 0000000..e0e97b0
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_hist.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_iter_summ_stat.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_iter_summ_stat.csv
new file mode 100644
index 0000000..5741ce9
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_iter_summ_stat.csv
@@ -0,0 +1,45 @@
+"","Theta","DeltaD","DeltaS","Lambda","Rho","LogLik"
+"Mean",0.0267276899413452,4.59261646178454e-06,0.120023906729044,0.926946633607648,0.269067208427093,-4739.58234846016
+"Std.",0.000211070569777376,4.68375043465101e-06,0.211389602896981,0.152863256781855,0.00359381391975458,1.67627417795788
+"Acceptance ratio",0.17726,0.1155,0.2289,0.16596,0.22114,0.65628
+"0%",0.0259022019796439,5.61959569716871e-09,4.75960046606837e-06,0.00202333624637677,0.255917716903448,-4753.34235834606
+"2.5%",0.0263071629813409,1.08572155265929e-07,4.75960046606837e-06,0.42065690510347,0.262111511114179,-4743.66952996502
+"5%",0.0263769554094461,2.62058401135438e-07,3.55724762587106e-05,0.591548057828431,0.263164559011639,-4742.77850529076
+"7.5%",0.0264250482759767,3.62309213879133e-07,0.000166582859843223,0.688607838434791,0.263854830315116,-4742.25448834969
+"10%",0.026460620478246,4.90054752944718e-07,0.000219164553288225,0.757040646908249,0.264467280374352,-4741.86489297696
+"12.5%",0.0264892407759142,6.33182365626628e-07,0.000295124900708358,0.811764683464597,0.264902251055913,-4741.54662686136
+"15%",0.0265121103927374,7.52878723736473e-07,0.000550015378272595,0.849472619498446,0.265315183588417,-4741.28528613913
+"17.5%",0.0265317617218573,8.6257796606588e-07,0.000653380260284858,0.880765451185898,0.265668346949484,-4741.04819122067
+"20%",0.0265512365698577,1.01721616450001e-06,0.000888122425230385,0.906483819723061,0.266046173691264,-4740.85080970522
+"22.5%",0.026569036775646,1.14943261137768e-06,0.00115947479849327,0.924462665704289,0.266343151366041,-4740.66354917552
+"25%",0.0265873706643275,1.31457864601889e-06,0.00173630578962956,0.939505306445971,0.266642569993501,-4740.48681627829
+"27.5%",0.0266029499243427,1.43286480281433e-06,0.00197833742717748,0.950702758933402,0.266948372571654,-4740.32855571278
+"30%",0.0266170398249067,1.59944403782952e-06,0.00236517289718259,0.959541952496135,0.267191396164529,-4740.18666672942
+"32.5%",0.0266340324611633,1.73883283091395e-06,0.00286939110700162,0.967524443557714,0.267428938557699,-4740.05087911798
+"35%",0.0266470299303806,1.87698168296552e-06,0.00374563966191921,0.97326916604423,0.267640015336712,-4739.91887209544
+"37.5%",0.0266592505365866,2.07584127885416e-06,0.0044617804734908,0.978745576042179,0.267915344243931,-4739.79990780192
+"40%",0.0266723480429323,2.28526888369302e-06,0.00564810552571734,0.983099930848627,0.268165281892257,-4739.68411020384
+"42.5%",0.0266873363860792,2.47611665195512e-06,0.00688978694700587,0.987489484531557,0.268370013879467,-4739.5641061942
+"45%",0.0267001916354469,2.6764401458849e-06,0.00882993631290875,0.990253745566916,0.26857057477266,-4739.46162186964
+"47.5%",0.0267141376156555,2.90743684466084e-06,0.0113009470845546,0.991999205754203,0.268815395101726,-4739.3578082639
+"50%",0.0267279860365846,3.10612874649677e-06,0.0139170139651084,0.993416247734022,0.269057379756578,-4739.25092622926
+"52.5%",0.026742866312116,3.35062127842456e-06,0.0181620294708513,0.995052578949859,0.269314125866,-4739.14767679374
+"55%",0.0267566163898141,3.64539331892216e-06,0.0214561010790649,0.996313500744903,0.26954729553304,-4739.04514176845
+"57.5%",0.0267719737394944,3.91383810137039e-06,0.0275948806072789,0.996905210339528,0.269793706440492,-4738.94626883993
+"60%",0.0267850212490813,4.13042057523478e-06,0.0357639914589177,0.997535794327448,0.270002543917125,-4738.86149993288
+"62.5%",0.0267975981790248,4.39285887103974e-06,0.0440207731101324,0.997842626522336,0.27022501202238,-4738.77138380973
+"65%",0.0268116086261324,4.72114083211495e-06,0.0543414938681373,0.998350387142015,0.270449281533407,-4738.68596625949
+"67.5%",0.0268253694035546,4.99999095658927e-06,0.0685588169194208,0.998766793230549,0.270691694945354,-4738.60147489066
+"70%",0.0268422219130614,5.35648626132374e-06,0.0829908000842559,0.998959369259401,0.270951281436517,-4738.51591158383
+"72.5%",0.0268568206323077,5.77505744118852e-06,0.105923018019621,0.999081134818817,0.271196074318128,-4738.42103529357
+"75%",0.0268724758879511,6.20581407184946e-06,0.131342036025384,0.999282794334074,0.271483191638363,-4738.33560926478
+"77.5%",0.0268860074347799,6.73887798638295e-06,0.163293983367041,0.99946296222391,0.271762253226818,-4738.25062825565
+"80%",0.0269029545040006,7.27841892642437e-06,0.1997610447564,0.999523608477344,0.272118459638263,-4738.16135334076
+"82.5%",0.0269221230660803,7.94600490175415e-06,0.244795453322632,0.999607399916201,0.27244436004087,-4738.06782511119
+"85%",0.0269425786499011,8.78566486390559e-06,0.296093308033493,0.999713195395418,0.272780840426472,-4737.96780448075
+"87.5%",0.0269680986854934,9.68435406474668e-06,0.358948651555185,0.999746808103712,0.27319891473509,-4737.85930364966
+"90%",0.0269932133327014,1.08331006777309e-05,0.440052056254867,0.999778950524049,0.2736125976697,-4737.73939029143
+"92.5%",0.0270281669729405,1.24414308246295e-05,0.529988043417338,0.999814430384894,0.274261315011906,-4737.62079035886
+"95%",0.0270700396204653,1.42890785193369e-05,0.640666698699883,0.999867987230572,0.275004882431294,-4737.491242633
+"97.5%",0.0271382284576324,1.76308917294565e-05,0.791671764517294,0.999917702877937,0.276146804992258,-4737.30342853945
+"100%",0.0275470501113482,4.51055013643359e-05,0.999568261579627,0.999971397852597,0.282296822344932,-4736.80397875156
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf
new file mode 100644
index 0000000..336abbf
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_post_pred.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf
new file mode 100644
index 0000000..9a01c7c
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/Stats_out_MCMC_trace.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/dnacomp.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/dnacomp.txt
new file mode 100644
index 0000000..44ab525
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/dnacomp.txt
@@ -0,0 +1,324 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total
+* 3p + -70 7632 8432 8060 7627 31751
+* 3p + -69 7871 8531 8181 7898 32481
+* 3p + -68 7977 8645 8473 8117 33212
+* 3p + -67 8216 9089 8491 8150 33946
+* 3p + -66 8353 9126 8675 8441 34595
+* 3p + -65 8412 9440 8883 8595 35330
+* 3p + -64 8705 9620 9082 8711 36118
+* 3p + -63 9004 9657 9280 8924 36865
+* 3p + -62 8960 10038 9652 8950 37600
+* 3p + -61 9397 10190 9675 9037 38299
+* 3p + -60 9469 10414 9676 9478 39037
+* 3p + -59 9544 10561 10075 9578 39758
+* 3p + -58 9720 10602 10311 9846 40479
+* 3p + -57 9884 10938 10482 9908 41212
+* 3p + -56 10169 11033 10659 10056 41917
+* 3p + -55 10188 11304 10801 10239 42532
+* 3p + -54 10541 11384 10858 10433 43216
+* 3p + -53 10538 11753 11069 10488 43848
+* 3p + -52 10569 11880 11286 10695 44430
+* 3p + -51 10846 11895 11336 10920 44997
+* 3p + -50 10934 12061 11770 10746 45511
+* 3p + -49 11000 12102 11761 11151 46014
+* 3p + -48 11214 12367 12050 10882 46513
+* 3p + -47 11214 12425 12229 11115 46983
+* 3p + -46 11542 12393 12269 11189 47393
+* 3p + -45 11424 12717 12263 11361 47765
+* 3p + -44 11487 12915 12326 11355 48083
+* 3p + -43 11525 12704 12635 11509 48373
+* 3p + -42 11785 12941 12418 11486 48630
+* 3p + -41 11738 13064 12821 11222 48845
+* 3p + -40 11909 12936 12700 11492 49037
+* 3p + -39 11756 13347 12642 11457 49202
+* 3p + -38 11841 13123 12726 11637 49327
+* 3p + -37 12083 13251 12719 11392 49445
+* 3p + -36 11956 13126 12896 11558 49536
+* 3p + -35 11847 13358 13006 11400 49611
+* 3p + -34 12091 13196 13012 11375 49674
+* 3p + -33 11946 13178 13041 11540 49705
+* 3p + -32 11926 13266 13046 11486 49724
+* 3p + -31 12009 13187 12812 11739 49747
+* 3p + -30 12104 13168 12810 11654 49736
+* 3p + -29 11958 13222 13123 11482 49785
+* 3p + -28 12167 13025 13126 11485 49803
+* 3p + -27 12118 13114 13068 11528 49828
+* 3p + -26 12185 13127 13046 11488 49846
+* 3p + -25 12178 13022 13222 11417 49839
+* 3p + -24 11952 13219 13089 11603 49863
+* 3p + -23 12087 12916 13360 11508 49871
+* 3p + -22 12120 13027 13160 11531 49838
+* 3p + -21 12067 13255 13101 11440 49863
+* 3p + -20 12132 13160 13402 11175 49869
+* 3p + -19 12245 13185 13295 11146 49871
+* 3p + -18 12224 13038 13238 11372 49872
+* 3p + -17 12102 13265 13364 11136 49867
+* 3p + -16 12117 13164 13505 11083 49869
+* 3p + -15 12281 13121 12980 11206 49588
+* 3p + -14 12099 12975 13514 11281 49869
+* 3p + -13 12066 13151 13293 11360 49870
+* 3p + -12 11832 13470 13128 11439 49869
+* 3p + -11 12256 13264 13184 11165 49869
+* 3p + -10 12358 13067 13408 11039 49872
+* 3p + -9 13581 12579 13326 10387 49873
+* 3p + -8 13336 11843 13768 10926 49873
+* 3p + -7 12610 13188 12546 11528 49872
+* 3p + -6 12861 12327 13113 11572 49873
+* 3p + -5 12331 13170 12612 11760 49873
+* 3p + -4 12375 11868 13394 12236 49873
+* 3p + -3 12692 11075 13450 12654 49871
+* 3p + -2 14388 14415 9754 11314 49871
+* 3p + -1 6491 12423 16315 14644 49873
+* 3p + 1 4533 19543 4475 21317 49868
+* 3p + 2 10151 14213 14134 11367 49865
+* 3p + 3 11757 12824 13229 12054 49864
+* 3p + 4 11942 12896 13041 11985 49864
+* 3p + 5 12005 12768 13112 11977 49862
+* 3p + 6 12107 12616 13132 12005 49860
+* 3p + 7 12150 13019 12728 11962 49859
+* 3p + 8 12237 12958 12947 11716 49858
+* 3p + 9 12408 12628 12942 11880 49858
+* 3p + 10 12084 12854 12705 12214 49857
+* 3p - -70 7610 8351 7903 7779 31643
+* 3p - -69 7713 8465 8077 8052 32307
+* 3p - -68 7984 8686 8437 7994 33101
+* 3p - -67 8133 8930 8475 8320 33858
+* 3p - -66 8290 9229 8555 8568 34642
+* 3p - -65 8513 9299 9030 8528 35370
+* 3p - -64 8705 9494 9170 8748 36117
+* 3p - -63 8774 9714 9507 8897 36892
+* 3p - -62 8922 10037 9614 9061 37634
+* 3p - -61 9319 10123 9752 9205 38399
+* 3p - -60 9328 10539 9792 9488 39147
+* 3p - -59 9677 10615 10074 9497 39863
+* 3p - -58 9704 10440 10581 9790 40515
+* 3p - -57 9882 11106 10158 10111 41257
+* 3p - -56 10027 11029 10890 10009 41955
+* 3p - -55 10297 11219 10886 10248 42650
+* 3p - -54 10216 11474 11076 10523 43289
+* 3p - -53 10576 11685 11246 10424 43931
+* 3p - -52 10798 11715 11394 10658 44565
+* 3p - -51 10659 12090 11514 10830 45093
+* 3p - -50 10888 12303 11704 10750 45645
+* 3p - -49 11070 12111 11919 11077 46177
+* 3p - -48 11233 12484 11871 11104 46692
+* 3p - -47 11309 12474 12318 11019 47120
+* 3p - -46 11402 12538 12351 11255 47546
+* 3p - -45 11585 12781 12218 11332 47916
+* 3p - -44 11426 13163 12358 11308 48255
+* 3p - -43 11677 12841 12579 11475 48572
+* 3p - -42 11680 13101 12638 11431 48850
+* 3p - -41 11792 13179 12522 11586 49079
+* 3p - -40 11824 13140 12760 11548 49272
+* 3p - -39 11755 13249 12738 11674 49416
+* 3p - -38 12016 13242 12850 11443 49551
+* 3p - -37 11886 13229 12918 11625 49658
+* 3p - -36 11722 13261 13032 11749 49764
+* 3p - -35 12075 13358 12845 11555 49833
+* 3p - -34 12032 13177 13002 11701 49912
+* 3p - -33 11756 13389 12982 11828 49955
+* 3p - -32 11950 13322 13239 11474 49985
+* 3p - -31 11941 13312 13080 11675 50008
+* 3p - -30 11884 13302 13032 11793 50011
+* 3p - -29 11966 13253 13211 11616 50046
+* 3p - -28 12146 13211 13206 11503 50066
+* 3p - -27 12073 13379 13125 11504 50081
+* 3p - -26 12119 13370 13262 11346 50097
+* 3p - -25 12114 13151 13231 11585 50081
+* 3p - -24 12050 13333 13152 11576 50111
+* 3p - -23 12176 13296 13099 11546 50117
+* 3p - -22 12205 13175 13292 11415 50087
+* 3p - -21 12035 13413 13263 11401 50112
+* 3p - -20 12207 13281 13359 11273 50120
+* 3p - -19 12248 13296 13309 11271 50124
+* 3p - -18 12161 13180 13351 11429 50121
+* 3p - -17 12108 13283 13478 11258 50127
+* 3p - -16 12241 13208 13324 11351 50124
+* 3p - -15 12277 13118 13142 11316 49853
+* 3p - -14 12287 13155 13323 11357 50122
+* 3p - -13 12156 13086 13468 11414 50124
+* 3p - -12 12037 13241 13379 11466 50123
+* 3p - -11 12139 13419 13376 11190 50124
+* 3p - -10 12327 13209 13368 11220 50124
+* 3p - -9 13494 12664 13505 10463 50126
+* 3p - -8 13337 12216 13801 10772 50126
+* 3p - -7 12573 13317 12625 11611 50126
+* 3p - -6 13010 12287 13134 11696 50127
+* 3p - -5 12332 13253 12644 11898 50127
+* 3p - -4 12540 12021 13505 12061 50127
+* 3p - -3 12620 11085 13582 12836 50123
+* 3p - -2 14170 14511 9949 11496 50126
+* 3p - -1 6523 12607 16312 14685 50127
+* 3p - 1 4583 19409 4570 21563 50125
+* 3p - 2 10040 14157 14292 11636 50125
+* 3p - 3 11842 12924 13300 12059 50125
+* 3p - 4 12162 13019 12862 12081 50124
+* 3p - 5 12151 13028 13097 11847 50123
+* 3p - 6 12318 12560 13211 12033 50122
+* 3p - 7 12189 13008 12778 12147 50122
+* 3p - 8 12259 12993 12910 11960 50122
+* 3p - 9 12275 12758 13130 11958 50121
+* 3p - 10 12194 12947 12755 12225 50121
+* 5p + -10 12336 12688 12852 11985 49861
+* 5p + -9 12009 13200 12561 12093 49863
+* 5p + -8 12076 13012 12696 12079 49863
+* 5p + -7 12278 12723 12939 11926 49866
+* 5p + -6 12056 13085 12387 12339 49867
+* 5p + -5 11754 13268 12867 11979 49868
+* 5p + -4 12054 12797 12917 12101 49869
+* 5p + -3 12182 13421 12591 11677 49871
+* 5p + -2 11673 13814 14678 9707 49872
+* 5p + -1 23508 2854 21269 2241 49872
+* 5p + 1 15358 17470 11231 5814 49873
+* 5p + 2 10732 9343 14794 15004 49873
+* 5p + 3 12877 14141 10491 12363 49872
+* 5p + 4 12409 13618 11689 12147 49863
+* 5p + 5 11834 12839 13031 12166 49870
+* 5p + 6 11958 13226 12104 12585 49873
+* 5p + 7 12178 12735 12742 12218 49873
+* 5p + 8 11026 14191 11334 13321 49872
+* 5p + 9 10734 13536 12361 13242 49873
+* 5p + 10 11280 13790 12737 12066 49873
+* 5p + 11 11421 13498 13226 11728 49873
+* 5p + 12 11804 13424 12978 11667 49873
+* 5p + 13 11638 13521 12718 11995 49872
+* 5p + 14 11254 13725 13124 11770 49873
+* 5p + 15 11623 13483 13064 11641 49811
+* 5p + 16 11671 13680 12774 11748 49873
+* 5p + 17 11564 13429 13181 11699 49873
+* 5p + 18 11694 13682 12795 11702 49873
+* 5p + 19 11772 13405 12833 11863 49873
+* 5p + 20 11628 13586 13052 11607 49873
+* 5p + 21 12005 13339 13021 11508 49873
+* 5p + 22 11871 13428 12879 11694 49872
+* 5p + 23 11748 13624 12869 11631 49872
+* 5p + 24 11934 13292 12994 11651 49871
+* 5p + 25 11870 13398 12796 11807 49871
+* 5p + 26 11733 13426 12991 11698 49848
+* 5p + 27 12076 13057 13080 11618 49831
+* 5p + 28 11920 13438 12695 11753 49806
+* 5p + 29 11701 13464 12990 11633 49788
+* 5p + 30 11885 13278 12854 11750 49767
+* 5p + 31 12078 13216 12844 11608 49746
+* 5p + 32 11837 13357 12991 11540 49725
+* 5p + 33 11891 13365 13147 11303 49706
+* 5p + 34 12148 13175 12877 11474 49674
+* 5p + 35 11851 13259 13021 11476 49607
+* 5p + 36 12021 13048 13090 11379 49538
+* 5p + 37 11784 13211 12944 11506 49445
+* 5p + 38 11668 13210 12982 11467 49327
+* 5p + 39 12130 12857 12842 11373 49202
+* 5p + 40 11974 12953 12673 11439 49039
+* 5p + 41 11855 12830 12754 11406 48845
+* 5p + 42 11822 12706 12779 11323 48630
+* 5p + 43 11740 12672 12657 11305 48374
+* 5p + 44 11610 12611 12574 11286 48081
+* 5p + 45 11689 12415 12420 11241 47765
+* 5p + 46 11726 12378 12194 11092 47390
+* 5p + 47 11330 12301 12223 11129 46983
+* 5p + 48 11374 12017 12150 10975 46516
+* 5p + 49 11251 12053 11769 10941 46014
+* 5p + 50 11091 11907 11742 10772 45512
+* 5p + 51 11098 11664 11615 10620 44997
+* 5p + 52 11015 11607 11484 10323 44429
+* 5p + 53 10688 11623 11359 10178 43848
+* 5p + 54 10809 11164 11282 9962 43217
+* 5p + 55 10637 10905 10824 10166 42532
+* 5p + 56 10332 10763 11043 9781 41919
+* 5p + 57 10240 10613 10780 9579 41212
+* 5p + 58 9995 10343 10495 9646 40479
+* 5p + 59 9824 10341 10280 9314 39759
+* 5p + 60 9515 10058 10253 9212 39038
+* 5p + 61 9561 9824 9997 8917 38299
+* 5p + 62 9130 9801 9723 8946 37600
+* 5p + 63 9071 9345 9743 8706 36865
+* 5p + 64 8945 9351 9270 8552 36118
+* 5p + 65 8767 8924 9320 8319 35330
+* 5p + 66 8609 8904 9080 8002 34595
+* 5p + 67 8556 8655 8668 8066 33945
+* 5p + 68 8095 8529 8704 7885 33213
+* 5p + 69 8066 8119 8513 7699 32397
+* 5p + 70 7909 7988 8229 7625 31751
+* 5p - -10 12204 12879 12895 12131 50109
+* 5p - -9 12135 13163 12649 12163 50110
+* 5p - -8 11988 13126 12872 12125 50111
+* 5p - -7 12270 12716 13026 12099 50111
+* 5p - -6 12283 13068 12551 12211 50113
+* 5p - -5 12018 13173 12703 12221 50115
+* 5p - -4 11862 12999 13027 12228 50116
+* 5p - -3 12220 13638 12703 11557 50118
+* 5p - -2 12048 13860 14548 9664 50120
+* 5p - -1 23785 2912 21146 2278 50121
+* 5p - 1 15654 17360 11296 5817 50127
+* 5p - 2 10617 9605 14878 15027 50127
+* 5p - 3 13038 14069 10547 12471 50125
+* 5p - 4 12425 13854 11774 12065 50118
+* 5p - 5 11859 12944 12914 12410 50127
+* 5p - 6 11786 13367 12126 12848 50127
+* 5p - 7 12173 12752 12953 12249 50127
+* 5p - 8 11087 14122 11424 13493 50126
+* 5p - 9 10800 13502 12463 13362 50127
+* 5p - 10 11308 13656 12929 12233 50126
+* 5p - 11 11542 13717 13203 11665 50127
+* 5p - 12 11798 13415 13208 11705 50126
+* 5p - 13 11674 13920 12698 11834 50126
+* 5p - 14 11437 13795 12934 11960 50126
+* 5p - 15 11795 13475 12921 11884 50075
+* 5p - 16 11652 13794 12791 11890 50127
+* 5p - 17 11699 13605 13166 11657 50127
+* 5p - 18 11810 13620 12934 11763 50127
+* 5p - 19 11592 13716 12772 12047 50127
+* 5p - 20 11528 13778 13017 11804 50127
+* 5p - 21 12024 13299 13126 11678 50127
+* 5p - 22 11844 13769 12808 11703 50124
+* 5p - 23 11590 13641 13164 11727 50122
+* 5p - 24 12097 13256 12975 11792 50120
+* 5p - 25 11784 13474 12887 11973 50118
+* 5p - 26 11760 13485 13147 11709 50101
+* 5p - 27 11933 13557 12962 11632 50084
+* 5p - 28 11784 13414 12937 11924 50059
+* 5p - 29 11720 13398 13014 11915 50047
+* 5p - 30 11942 13254 13072 11763 50031
+* 5p - 31 11864 13589 12916 11642 50011
+* 5p - 32 11865 13423 13122 11575 49985
+* 5p - 33 12042 13356 13041 11518 49957
+* 5p - 34 11974 13311 12813 11815 49913
+* 5p - 35 11689 13497 12990 11657 49833
+* 5p - 36 12040 13116 12971 11640 49767
+* 5p - 37 12025 13320 12694 11618 49657
+* 5p - 38 11951 13147 12904 11547 49549
+* 5p - 39 12044 12904 13022 11447 49417
+* 5p - 40 12015 12950 12752 11554 49271
+* 5p - 41 11614 12952 13015 11498 49079
+* 5p - 42 11851 12801 12780 11417 48849
+* 5p - 43 11808 12900 12475 11390 48573
+* 5p - 44 11435 12834 12772 11215 48256
+* 5p - 45 11683 12394 12613 11228 47918
+* 5p - 46 11525 12729 12112 11180 47546
+* 5p - 47 11207 12381 12348 11185 47121
+* 5p - 48 11379 12113 12235 10965 46692
+* 5p - 49 11504 11984 11938 10751 46177
+* 5p - 50 11179 11859 12093 10514 45645
+* 5p - 51 11107 11727 11843 10416 45093
+* 5p - 52 10827 11628 11675 10435 44565
+* 5p - 53 10837 11501 11445 10148 43931
+* 5p - 54 10674 11093 11388 10135 43290
+* 5p - 55 10404 11255 11019 9971 42649
+* 5p - 56 10203 10986 10980 9786 41955
+* 5p - 57 10265 10563 10684 9745 41257
+* 5p - 58 9882 10539 10479 9615 40515
+* 5p - 59 9808 10191 10490 9375 39864
+* 5p - 60 9622 9997 10289 9239 39147
+* 5p - 61 9419 9911 9887 9182 38399
+* 5p - 62 9219 9752 9766 8898 37635
+* 5p - 63 9091 9543 9560 8698 36892
+* 5p - 64 9109 9203 9351 8454 36117
+* 5p - 65 8594 9028 9315 8433 35370
+* 5p - 66 8565 8835 9138 8105 34643
+* 5p - 67 8385 8689 8737 8048 33859
+* 5p - 68 8159 8469 8662 7811 33101
+* 5p - 69 8090 8053 8455 7618 32216
+* 5p - 70 7927 8061 8116 7540 31644
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/dnacomp_genome.csv b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/dnacomp_genome.csv
new file mode 100644
index 0000000..2fc7659
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/dnacomp_genome.csv
@@ -0,0 +1,2 @@
+A,C,G,T
+0.245290724081,0.254598401178,0.255055853499,0.245055021242
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/lgdistribution.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/lgdistribution.txt
new file mode 100644
index 0000000..e29278f
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/lgdistribution.txt
@@ -0,0 +1,328 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Std: strand of reads
+Std Length Occurences
++ 23 1
++ 24 1
++ 25 4
++ 26 4
++ 27 8
++ 28 7
++ 29 12
++ 30 12
++ 31 9
++ 32 13
++ 33 25
++ 34 47
++ 35 62
++ 36 78
++ 37 106
++ 38 120
++ 39 153
++ 40 180
++ 41 204
++ 42 249
++ 43 277
++ 44 313
++ 45 366
++ 46 407
++ 47 457
++ 48 487
++ 49 496
++ 50 514
++ 51 563
++ 52 574
++ 53 621
++ 54 674
++ 55 610
++ 56 704
++ 57 732
++ 58 710
++ 59 720
++ 60 740
++ 61 689
++ 62 732
++ 63 747
++ 64 790
++ 65 731
++ 66 642
++ 67 722
++ 68 714
++ 69 752
++ 70 712
++ 71 745
++ 72 733
++ 73 738
++ 74 708
++ 75 680
++ 76 665
++ 77 651
++ 78 645
++ 79 641
++ 80 652
++ 81 642
++ 82 648
++ 83 656
++ 84 627
++ 85 588
++ 86 615
++ 87 609
++ 88 636
++ 89 545
++ 90 582
++ 91 643
++ 92 753
++ 93 1064
++ 94 6219
++ 95 320
++ 96 264
++ 97 233
++ 98 363
++ 99 230
++ 100 234
++ 101 205
++ 102 208
++ 103 194
++ 104 216
++ 105 178
++ 106 179
++ 107 182
++ 108 195
++ 109 179
++ 110 179
++ 111 175
++ 112 173
++ 113 147
++ 114 166
++ 115 147
++ 116 156
++ 117 151
++ 118 139
++ 119 149
++ 120 136
++ 121 151
++ 122 114
++ 123 140
++ 124 117
++ 125 110
++ 126 119
++ 127 102
++ 128 108
++ 129 104
++ 130 110
++ 131 96
++ 132 90
++ 133 82
++ 134 93
++ 135 87
++ 136 75
++ 137 90
++ 138 67
++ 139 71
++ 140 71
++ 141 63
++ 142 58
++ 143 51
++ 144 59
++ 145 61
++ 146 64
++ 147 44
++ 148 53
++ 149 55
++ 150 41
++ 151 56
++ 152 42
++ 153 41
++ 154 40
++ 155 40
++ 156 45
++ 157 38
++ 158 30
++ 159 31
++ 160 21
++ 161 34
++ 162 41
++ 163 41
++ 164 29
++ 165 16
++ 166 20
++ 167 31
++ 168 27
++ 169 32
++ 170 16
++ 171 29
++ 172 20
++ 173 32
++ 174 14
++ 175 17
++ 176 15
++ 177 18
++ 178 15
++ 179 9
++ 180 17
++ 181 19
++ 182 15
++ 183 18
++ 184 15
++ 185 10
+- 25 3
+- 26 5
+- 27 8
+- 28 2
+- 29 7
+- 30 12
+- 31 21
+- 32 22
+- 33 33
+- 34 64
+- 35 55
+- 36 94
+- 37 93
+- 38 131
+- 39 135
+- 40 178
+- 41 223
+- 42 263
+- 43 311
+- 44 330
+- 45 366
+- 46 419
+- 47 418
+- 48 508
+- 49 521
+- 50 545
+- 51 523
+- 52 629
+- 53 634
+- 54 634
+- 55 698
+- 56 690
+- 57 733
+- 58 647
+- 59 707
+- 60 742
+- 61 761
+- 62 746
+- 63 764
+- 64 747
+- 65 728
+- 66 776
+- 67 752
+- 68 767
+- 69 680
+- 70 690
+- 71 722
+- 72 693
+- 73 713
+- 74 641
+- 75 700
+- 76 694
+- 77 691
+- 78 628
+- 79 657
+- 80 692
+- 81 652
+- 82 635
+- 83 607
+- 84 632
+- 85 614
+- 86 584
+- 87 595
+- 88 605
+- 89 584
+- 90 585
+- 91 659
+- 92 805
+- 93 1065
+- 94 6307
+- 95 313
+- 96 251
+- 97 270
+- 98 391
+- 99 213
+- 100 219
+- 101 250
+- 102 231
+- 103 207
+- 104 187
+- 105 187
+- 106 182
+- 107 177
+- 108 179
+- 109 160
+- 110 186
+- 111 182
+- 112 159
+- 113 168
+- 114 161
+- 115 132
+- 116 150
+- 117 147
+- 118 109
+- 119 152
+- 120 122
+- 121 130
+- 122 117
+- 123 129
+- 124 128
+- 125 125
+- 126 105
+- 127 103
+- 128 94
+- 129 85
+- 130 96
+- 131 97
+- 132 75
+- 133 78
+- 134 73
+- 135 82
+- 136 77
+- 137 67
+- 138 61
+- 139 70
+- 140 59
+- 141 62
+- 142 61
+- 143 64
+- 144 62
+- 145 52
+- 146 48
+- 147 57
+- 148 45
+- 149 46
+- 150 45
+- 151 46
+- 152 43
+- 153 32
+- 154 33
+- 155 44
+- 156 35
+- 157 27
+- 158 37
+- 159 35
+- 160 27
+- 161 22
+- 162 31
+- 163 32
+- 164 23
+- 165 27
+- 166 46
+- 167 35
+- 168 29
+- 169 24
+- 170 25
+- 171 22
+- 172 21
+- 173 22
+- 174 24
+- 175 23
+- 176 33
+- 177 13
+- 178 15
+- 179 11
+- 180 19
+- 181 12
+- 182 11
+- 183 19
+- 184 17
+- 185 16
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/misincorporation.txt b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/misincorporation.txt
new file mode 100644
index 0000000..8973ac8
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.Pi_nucl.mapDamage/Pi1889_id_TAGCTT/misincorporation.txt
@@ -0,0 +1,284 @@
+# table produced by mapDamage version 2.0.1-10-g0a7fa27-dirty
+# using mapped file - and 000_prefixes/Pi_nucl.fasta as reference file
+# Chr: reference from sam/bam header, End: from which termini of DNA sequences, Std: strand of reads
+Chr End Std Pos A C G T Total G>A C>T A>G T>C A>C A>T C>G C>A T>G T>A G>C G>T A>- T>- C>- G>- ->A ->T ->C ->G S
+* 3p + 1 6247 12451 16312 14859 49869 125 72 74 167 25 26 63 115 74 128 29 56 0 0 0 0 0 0 0 0 42
+* 3p + 2 14272 14314 9787 11494 49867 103 55 62 180 30 28 21 64 48 69 29 34 0 0 1 0 0 0 0 0 38
+* 3p + 3 12609 10983 13456 12820 49868 97 53 66 187 19 24 40 41 44 53 20 40 0 0 0 1 1 0 0 0 36
+* 3p + 4 12321 11764 13401 12385 49871 99 47 72 177 16 22 25 32 38 32 14 28 0 0 0 0 1 0 1 0 36
+* 3p + 5 12304 13017 12634 11913 49868 82 43 63 189 28 23 21 25 27 34 22 29 0 0 0 0 0 2 3 0 35
+* 3p + 6 12863 12166 13073 11759 49861 64 33 91 201 20 17 13 24 29 32 10 24 1 2 2 1 4 3 1 4 33
+* 3p + 7 12598 13054 12523 11685 49860 72 39 79 171 22 14 13 16 31 35 10 24 3 3 4 1 3 5 3 1 33
+* 3p + 8 13336 11715 13731 11082 49864 68 41 89 181 13 16 22 20 30 25 12 28 2 4 1 0 5 1 0 3 30
+* 3p + 9 13594 12483 13276 10512 49865 61 53 90 171 11 26 25 21 29 29 14 18 2 4 2 2 3 3 0 2 28
+* 3p + 10 12368 12944 13367 11184 49863 74 51 89 173 17 13 23 14 34 25 16 19 4 4 2 3 1 4 3 1 27
+* 3p + 11 12225 13103 13186 11348 49862 56 35 58 200 20 14 17 23 18 30 12 18 4 0 0 5 3 2 1 1 27
+* 3p + 12 11873 13353 13066 11568 49860 54 43 87 161 26 18 31 16 27 24 18 24 4 6 4 2 5 0 1 3 25
+* 3p + 13 12066 13040 13278 11479 49863 56 45 63 158 18 18 20 25 20 27 15 20 4 2 2 0 2 3 1 1 23
+* 3p + 14 12135 12827 13486 11417 49865 44 46 70 169 35 18 11 13 30 18 23 22 3 3 1 2 1 2 1 0 22
+* 3p + 15 12329 12979 12931 11341 49580 56 37 101 175 26 13 13 22 25 21 11 26 4 9 3 0 3 3 1 1 20
+* 3p + 16 12121 13029 13493 11217 49860 64 42 66 166 22 16 14 13 31 14 14 16 2 0 3 5 4 3 0 2 20
+* 3p + 17 12120 13134 13323 11284 49861 52 51 79 172 21 16 17 16 23 31 17 14 4 4 3 3 2 0 3 1 20
+* 3p + 18 12211 12884 13222 11550 49867 71 46 73 196 23 13 14 14 36 32 19 19 4 4 5 4 1 2 0 2 19
+* 3p + 19 12241 13051 13283 11288 49863 64 40 63 173 18 12 16 15 22 23 6 22 3 3 1 2 3 3 0 2 19
+* 3p + 20 12121 13031 13367 11341 49860 66 40 82 170 18 6 18 19 21 20 13 12 4 3 3 3 5 1 0 3 19
+* 3p + 21 12085 13118 13090 11559 49852 59 33 82 160 22 19 13 14 17 28 14 22 3 3 5 5 2 2 4 3 19
+* 3p + 22 12127 12920 13124 11658 49829 54 38 70 164 20 8 22 15 18 20 11 17 1 2 2 0 3 2 2 4 19
+* 3p + 23 12103 12799 13305 11658 49865 50 46 90 163 17 17 16 26 24 25 15 15 0 1 4 2 3 0 0 3 17
+* 3p + 24 11971 13078 13062 11741 49852 43 42 62 181 25 15 10 12 29 18 11 24 2 3 6 1 6 1 0 4 16
+* 3p + 25 12198 12908 13201 11523 49830 62 55 87 154 16 12 9 20 24 25 16 22 3 4 2 3 0 4 2 3 16
+* 3p + 26 12183 12997 13024 11635 49839 49 39 68 156 16 15 13 15 25 25 9 23 2 4 0 3 1 2 1 3 16
+* 3p + 27 12132 13016 13061 11617 49826 57 52 62 141 20 15 9 11 21 13 16 18 4 6 3 0 1 0 1 0 16
+* 3p + 28 12183 12919 13081 11617 49800 52 42 67 157 12 15 6 18 35 17 4 12 3 5 2 1 1 1 0 1 15
+* 3p + 29 12002 13099 13080 11595 49776 38 46 76 148 19 15 12 12 29 19 14 25 2 7 4 2 3 2 1 3 15
+* 3p + 30 12131 13038 12779 11778 49726 50 36 73 167 23 24 7 14 27 18 9 20 4 2 4 1 3 1 3 3 15
+* 3p + 31 12002 13097 12800 11840 49739 54 44 76 142 16 13 9 21 18 16 8 25 4 2 2 2 3 2 2 1 14
+* 3p + 32 11976 13156 12998 11590 49720 49 40 83 147 17 15 15 14 29 20 9 25 4 3 0 4 1 2 1 0 14
+* 3p + 33 11975 13069 13004 11652 49700 40 46 78 133 10 14 7 10 28 21 10 22 3 3 4 1 1 3 0 1 14
+* 3p + 34 12127 13086 12957 11496 49666 36 50 75 158 15 10 10 12 24 25 10 21 6 3 0 3 2 1 2 3 13
+* 3p + 35 11870 13229 12983 11524 49606 48 47 84 176 19 9 8 15 21 16 9 25 3 2 2 4 2 1 2 0 13
+* 3p + 36 11962 13050 12855 11661 49528 42 43 65 127 9 11 13 12 17 21 4 12 3 5 0 3 4 0 4 0 13
+* 3p + 37 12115 13136 12688 11499 49438 37 47 73 130 16 12 9 8 17 21 9 22 2 8 1 2 1 3 4 1 13
+* 3p + 38 11871 13044 12700 11709 49324 38 44 62 144 24 10 7 24 19 9 5 16 0 1 4 2 1 2 1 0 13
+* 3p + 39 11760 13238 12617 11579 49194 47 38 68 142 12 13 10 12 18 15 10 17 1 4 2 2 3 2 3 1 13
+* 3p + 40 11962 12825 12668 11577 49032 36 34 67 130 14 19 9 13 16 12 8 16 3 0 1 4 1 0 0 4 13
+* 3p + 41 11754 12975 12793 11319 48841 35 37 74 147 11 11 6 16 18 20 8 29 1 4 1 3 1 1 0 3 12
+* 3p + 42 11802 12854 12384 11588 48628 41 37 76 124 13 16 6 13 21 26 5 21 8 0 1 3 2 0 1 0 11
+* 3p + 43 11555 12574 12602 11639 48370 46 39 61 149 14 17 11 9 17 18 12 11 3 2 3 2 2 0 2 2 11
+* 3p + 44 11521 12835 12302 11422 48080 39 45 69 129 20 15 11 14 12 15 10 27 4 2 1 2 3 3 1 2 11
+* 3p + 45 11444 12634 12258 11428 47764 33 39 46 112 26 14 15 17 13 21 11 13 3 3 3 2 1 0 3 0 9
+* 3p + 46 11570 12333 12229 11261 47393 39 47 86 124 13 14 16 13 12 21 7 24 4 5 2 4 1 1 0 0 9
+* 3p + 47 11254 12291 12209 11225 46979 35 42 58 152 16 16 7 13 16 13 15 17 5 2 3 3 0 2 5 0 9
+* 3p + 48 11242 12288 12004 10971 46505 34 29 79 114 15 13 14 11 17 22 8 20 1 4 2 4 1 2 4 3 8
+* 3p + 49 10995 12022 11743 11247 46007 41 40 51 117 9 6 4 16 15 22 7 13 5 6 2 0 4 3 2 4 8
+* 3p + 50 10953 11998 11753 10801 45505 36 39 63 104 15 12 10 19 18 18 2 23 3 1 4 3 1 5 2 4 7
+* 3p + 51 10853 11839 11317 10984 44993 38 53 56 112 10 13 9 9 15 19 6 16 0 1 3 3 3 3 1 1 7
+* 3p + 52 10621 11794 11239 10775 44429 32 35 57 122 15 17 7 8 11 9 8 19 5 2 5 2 2 1 2 2 6
+* 3p + 53 10579 11669 11057 10542 43847 29 40 59 108 17 17 9 9 18 12 4 21 3 1 3 0 1 3 2 0 6
+* 3p + 54 10584 11309 10818 10506 43217 28 36 66 120 20 12 11 11 6 19 7 15 7 0 8 1 2 2 0 0 5
+* 3p + 55 10183 11220 10786 10347 42536 36 41 45 118 13 10 7 11 22 18 6 19 1 3 4 1 1 1 0 1 5
+* 3p + 56 10174 11003 10659 10088 41924 35 40 41 92 17 10 9 11 14 10 10 15 1 4 1 2 1 0 1 0 4
+* 3p + 57 9922 10866 10446 9981 41215 31 40 49 101 5 18 7 14 20 20 4 22 3 2 2 3 0 2 1 0 3
+* 3p + 58 9728 10553 10307 9891 40479 31 42 58 98 13 10 9 9 9 10 3 22 3 2 3 2 2 1 1 0 3
+* 3p + 59 9569 10511 10059 9616 39755 30 34 47 72 12 13 4 8 11 14 8 18 6 1 2 1 2 2 2 3 2
+* 3p + 60 9487 10376 9662 9509 39034 31 30 54 93 11 12 4 15 19 9 5 17 0 6 6 2 1 3 3 3 2
+* 3p + 61 9399 10150 9652 9100 38301 30 37 38 77 18 7 7 14 17 12 6 19 0 2 4 3 2 0 0 1 2
+* 3p + 62 8971 9989 9643 9002 37605 31 29 53 93 12 8 5 11 11 13 7 17 1 0 4 2 2 2 0 3 2
+* 3p + 63 9028 9589 9270 8985 36872 23 30 42 72 10 10 6 4 10 14 11 7 1 1 3 1 3 1 0 0 2
+* 3p + 64 8737 9591 9057 8740 36125 27 36 46 73 8 11 7 11 15 7 9 11 1 0 2 2 1 0 1 0 2
+* 3p + 65 8443 9375 8892 8624 35334 28 31 42 88 11 16 4 10 7 10 9 15 1 1 5 2 1 2 0 0 1
+* 3p + 66 8345 9070 8676 8505 34596 25 28 42 85 12 8 2 12 10 15 7 15 2 1 4 0 2 1 0 1 1
+* 3p + 67 8217 9044 8475 8211 33947 23 28 48 76 7 8 5 18 10 13 6 9 1 0 4 6 2 1 2 2 1
+* 3p + 68 7996 8603 8457 8166 33222 26 30 36 76 13 12 5 9 11 16 5 15 2 5 2 1 2 0 1 1 1
+* 3p + 69 7858 8488 8188 7948 32482 36 38 34 73 12 11 7 13 11 13 9 12 1 2 2 3 1 2 1 0 1
+* 3p + 70 7670 8387 8030 7668 31755 27 31 44 71 14 6 5 6 12 8 7 13 5 4 2 1 0 3 0 2 1
+* 3p - 1 6317 12601 16299 14909 50126 127 70 69 197 30 42 86 100 74 119 35 54 0 0 0 0 0 0 0 0 45
+* 3p - 2 14068 14451 9966 11638 50123 94 44 78 150 37 25 39 79 26 69 34 33 0 0 0 0 0 1 0 1 42
+* 3p - 3 12530 10982 13597 13008 50117 101 58 62 200 26 24 40 43 37 53 19 35 0 0 1 0 4 1 0 1 39
+* 3p - 4 12509 11896 13497 12215 50117 88 50 79 188 24 24 25 36 32 31 21 23 2 0 0 0 4 0 2 4 37
+* 3p - 5 12322 13137 12636 12024 50119 75 60 76 185 30 24 27 32 33 27 21 34 2 4 2 1 5 1 1 1 34
+* 3p - 6 12989 12169 13127 11832 50117 75 42 64 166 27 22 22 28 23 29 12 18 4 4 4 1 3 2 3 2 30
+* 3p - 7 12554 13161 12632 11771 50118 78 46 67 198 24 18 17 21 26 27 19 23 1 1 3 0 3 1 2 2 30
+* 3p - 8 13312 12066 13809 10928 50115 86 35 72 180 20 22 13 20 25 26 16 17 3 2 5 4 4 1 4 2 26
+* 3p - 9 13495 12525 13463 10634 50117 59 42 78 177 24 15 23 14 29 37 13 17 2 6 4 2 4 2 1 2 24
+* 3p - 10 12326 13075 13321 11395 50117 63 45 86 182 26 17 18 23 36 40 12 20 1 4 6 4 4 0 1 2 24
+* 3p - 11 12145 13258 13340 11370 50113 68 37 85 203 17 11 18 19 33 19 17 23 2 4 5 2 3 5 1 2 22
+* 3p - 12 12044 13136 13336 11598 50114 60 47 79 166 15 19 23 17 27 24 11 20 2 5 5 5 1 3 3 2 22
+* 3p - 13 12183 12960 13434 11535 50112 45 35 81 161 23 14 13 22 21 25 11 24 4 6 7 4 5 5 2 0 22
+* 3p - 14 12295 13051 13304 11459 50109 57 55 74 167 28 19 20 20 17 28 12 26 7 3 7 4 2 5 2 4 22
+* 3p - 15 12297 12965 13112 11467 49841 49 38 72 177 19 16 12 16 26 17 11 21 1 4 3 7 4 3 4 3 22
+* 3p - 16 12251 13067 13296 11497 50111 56 38 69 178 22 17 12 17 23 27 8 20 5 5 1 4 1 2 3 5 22
+* 3p - 17 12112 13130 13442 11435 50119 65 46 87 201 12 19 12 14 33 28 10 24 2 2 1 4 3 1 2 2 22
+* 3p - 18 12181 13028 13323 11579 50111 50 34 80 170 20 10 9 18 18 25 13 22 2 3 4 4 3 2 3 2 21
+* 3p - 19 12260 13162 13296 11397 50115 60 34 74 156 22 21 16 15 20 22 13 16 1 1 3 5 4 4 0 1 18
+* 3p - 20 12208 13140 13350 11415 50113 59 39 71 178 20 16 12 15 17 25 17 23 3 4 3 3 1 5 0 1 16
+* 3p - 21 12065 13302 13239 11495 50101 55 48 82 149 22 17 10 13 24 17 13 24 2 4 2 4 2 2 5 2 14
+* 3p - 22 12212 13044 13274 11541 50071 46 29 78 155 19 11 7 30 21 22 9 32 4 6 7 4 6 4 4 2 14
+* 3p - 23 12208 13176 13060 11662 50106 49 40 73 156 26 19 19 17 20 18 16 20 4 5 2 1 3 3 2 3 12
+* 3p - 24 12075 13198 13109 11723 50105 44 50 77 170 16 16 12 15 26 23 11 15 3 5 5 1 4 0 1 1 12
+* 3p - 25 12121 13015 13213 11727 50076 57 41 61 176 25 20 15 20 25 22 24 20 2 6 2 1 0 3 1 2 12
+* 3p - 26 12158 13237 13228 11468 50091 40 42 72 165 20 11 10 16 19 11 9 16 2 4 2 4 2 0 1 2 11
+* 3p - 27 12064 13287 13110 11613 50074 48 43 70 150 12 14 17 15 27 23 8 15 2 3 5 7 2 5 0 0 11
+* 3p - 28 12186 13080 13164 11629 50059 48 43 78 159 20 15 5 12 30 32 8 32 5 3 4 1 2 3 0 2 11
+* 3p - 29 11984 13154 13192 11709 50039 47 45 78 150 14 12 8 15 29 17 11 30 2 2 2 4 2 3 2 0 11
+* 3p - 30 11899 13201 13006 11899 50005 51 50 71 139 19 22 15 12 31 18 15 23 4 2 0 3 6 0 0 0 10
+* 3p - 31 11952 13189 13069 11793 50003 65 36 68 154 21 13 10 18 25 19 5 25 3 2 0 3 1 1 1 2 10
+* 3p - 32 11993 13216 13191 11576 49976 39 51 84 142 19 16 7 11 20 17 13 18 3 3 4 3 2 5 2 1 10
+* 3p - 33 11789 13316 12947 11894 49946 44 55 70 130 17 25 11 13 16 19 5 16 6 2 2 4 1 3 4 2 10
+* 3p - 34 12045 13074 12984 11801 49904 45 43 58 140 17 21 4 20 21 25 4 22 3 6 2 2 2 2 1 4 9
+* 3p - 35 12083 13246 12834 11662 49825 57 45 80 163 17 16 9 19 7 23 9 25 4 3 4 2 2 0 5 2 9
+* 3p - 36 11732 13147 13007 11869 49755 37 48 78 141 12 8 9 10 15 32 11 16 2 1 2 3 5 2 1 2 9
+* 3p - 37 11904 13142 12883 11721 49650 36 43 69 138 15 9 9 13 14 20 8 15 7 1 3 4 5 0 1 2 9
+* 3p - 38 12061 13126 12805 11549 49541 35 50 82 160 14 19 12 17 21 22 11 22 0 5 5 5 1 1 7 2 8
+* 3p - 39 11797 13152 12686 11772 49407 43 34 86 137 15 14 7 20 19 16 9 21 5 1 2 2 1 4 3 1 7
+* 3p - 40 11844 13074 12732 11615 49265 45 51 63 126 17 22 10 13 25 19 10 22 0 2 4 0 2 2 4 0 7
+* 3p - 41 11817 13081 12484 11695 49077 37 37 72 124 10 15 17 13 24 25 14 16 8 2 3 2 2 1 0 0 7
+* 3p - 42 11714 12981 12619 11532 48846 34 47 64 151 17 13 10 10 20 16 10 27 7 3 3 3 1 2 1 0 7
+* 3p - 43 11692 12745 12557 11571 48565 36 47 60 142 14 10 7 13 20 11 13 22 3 0 2 2 3 2 3 2 7
+* 3p - 44 11463 13108 12287 11390 48248 36 43 76 118 18 12 11 18 21 17 8 20 5 3 2 2 2 6 2 0 7
+* 3p - 45 11602 12651 12219 11440 47912 44 37 67 141 19 12 11 10 22 19 11 26 3 6 3 1 1 3 2 1 7
+* 3p - 46 11429 12464 12331 11322 47546 45 53 56 136 14 15 9 12 18 11 8 21 4 1 2 1 1 0 2 2 7
+* 3p - 47 11329 12362 12311 11109 47111 50 34 58 122 14 16 12 16 20 13 6 18 2 0 2 5 4 3 3 0 7
+* 3p - 48 11283 12409 11849 11148 46689 32 41 66 111 11 11 11 8 16 7 13 19 1 3 2 2 1 2 1 1 5
+* 3p - 49 11044 12058 11908 11160 46170 44 43 57 110 12 12 8 13 18 15 10 20 2 9 2 3 1 2 1 4 5
+* 3p - 50 10922 12175 11673 10871 45641 24 39 52 142 18 14 8 10 20 13 9 15 3 7 2 2 1 2 1 2 5
+* 3p - 51 10668 12016 11485 10920 45089 38 48 67 135 8 12 6 10 12 19 6 21 4 5 1 5 1 3 1 2 4
+* 3p - 52 10825 11660 11368 10708 44561 38 39 62 105 13 10 5 17 18 15 7 20 0 3 4 4 3 4 0 1 4
+* 3p - 53 10618 11598 11239 10468 43923 28 50 60 103 13 9 5 4 17 16 9 27 1 1 2 1 4 5 1 2 4
+* 3p - 54 10247 11411 11054 10578 43290 35 41 55 106 18 10 7 10 20 16 8 18 4 1 0 4 0 4 1 1 4
+* 3p - 55 10297 11157 10877 10321 42652 33 38 56 104 10 13 5 10 16 10 7 18 3 0 2 0 1 2 1 2 4
+* 3p - 56 10044 10993 10873 10050 41960 40 39 52 82 12 10 7 11 17 13 19 15 0 2 2 3 1 0 0 0 4
+* 3p - 57 9949 11048 10115 10150 41262 27 49 58 96 9 14 10 12 15 10 9 17 1 1 1 0 0 1 0 1 4
+* 3p - 58 9733 10373 10561 9847 40514 30 40 66 106 8 12 6 8 12 12 8 16 2 4 2 1 2 3 1 0 4
+* 3p - 59 9687 10519 10093 9567 39866 26 47 44 113 13 10 4 6 15 13 11 20 2 2 1 0 0 2 1 1 3
+* 3p - 60 9341 10481 9780 9549 39151 25 32 43 92 12 13 2 13 16 17 6 25 1 3 0 1 2 3 1 1 3
+* 3p - 61 9327 10085 9748 9247 38407 40 41 48 82 16 12 4 12 15 12 5 13 1 5 1 1 2 3 1 2 3
+* 3p - 62 8947 9993 9588 9105 37633 32 52 48 90 11 11 10 7 8 16 10 14 1 1 4 2 3 4 3 3 2
+* 3p - 63 8793 9685 9496 8913 36887 24 32 38 73 15 11 11 10 18 11 5 23 3 1 3 0 7 3 1 2 2
+* 3p - 64 8720 9453 9163 8787 36123 30 40 51 78 8 14 11 14 9 13 5 24 0 1 1 2 3 0 1 0 2
+* 3p - 65 8518 9275 9021 8564 35378 30 28 35 68 9 12 7 5 16 13 11 19 4 3 2 1 1 0 1 0 2
+* 3p - 66 8318 9184 8558 8584 34644 25 33 45 63 9 15 4 10 14 13 6 23 3 1 2 1 1 2 0 2 2
+* 3p - 67 8137 8881 8477 8368 33863 33 38 43 73 16 6 8 11 11 13 9 11 3 3 5 1 0 2 1 0 2
+* 3p - 68 7993 8658 8417 8043 33111 23 33 39 73 8 8 9 11 11 14 8 19 2 2 1 2 1 1 0 0 2
+* 3p - 69 7710 8447 8076 8083 32316 27 37 25 65 16 7 6 12 15 13 7 13 1 3 0 1 0 0 1 0 2
+* 3p - 70 7612 8344 7893 7807 31656 21 35 33 56 4 14 9 14 19 12 7 11 2 2 0 1 0 0 1 1 2
+* 5p + 1 15388 17503 11166 5815 49872 49 61 104 55 26 16 27 43 12 24 16 13 0 0 0 0 0 0 0 0 30
+* 5p + 2 10780 9333 14753 15005 49871 36 70 80 80 19 24 10 15 13 24 5 22 2 0 0 1 1 0 0 0 23
+* 5p + 3 12885 14104 10488 12390 49867 48 61 56 90 30 14 10 27 18 20 10 23 3 0 1 1 0 1 3 0 19
+* 5p + 4 12406 13560 11699 12192 49857 51 48 60 101 20 13 9 20 18 25 12 36 3 3 0 1 2 1 2 1 19
+* 5p + 5 11863 12766 13009 12228 49866 38 54 59 118 22 14 13 12 18 14 8 22 0 3 1 1 1 2 0 1 18
+* 5p + 6 11997 13164 12067 12641 49869 34 52 72 121 15 27 8 15 15 20 3 23 4 3 2 2 1 0 2 1 17
+* 5p + 7 12214 12679 12705 12268 49866 42 49 82 108 14 15 4 18 11 16 2 16 3 0 5 0 2 0 3 2 16
+* 5p + 8 11066 14104 11304 13387 49861 28 56 63 123 20 10 12 7 17 19 10 21 2 1 4 5 2 3 4 2 16
+* 5p + 9 10791 13436 12288 13350 49865 16 43 81 147 7 14 3 8 21 17 4 21 0 2 4 3 2 3 1 2 15
+* 5p + 10 11332 13722 12690 12125 49869 32 42 77 114 16 17 11 12 17 10 4 20 3 2 1 5 1 1 1 1 15
+* 5p + 11 11445 13443 13171 11807 49866 39 62 79 126 13 14 10 21 30 19 9 18 2 4 0 2 5 2 0 0 15
+* 5p + 12 11835 13352 12937 11746 49870 38 40 84 100 15 8 5 14 20 18 11 20 0 5 2 0 0 0 1 2 15
+* 5p + 13 11677 13444 12669 12075 49865 31 45 82 120 15 10 12 12 12 24 9 21 2 1 1 3 1 2 1 3 14
+* 5p + 14 11303 13610 13098 11855 49866 33 46 82 151 17 11 6 14 14 11 13 26 2 3 6 2 1 1 3 2 14
+* 5p + 15 11663 13396 13006 11739 49804 37 47 87 137 8 10 13 13 12 16 8 12 2 2 6 4 1 3 1 2 14
+* 5p + 16 11696 13591 12748 11830 49865 41 48 75 138 14 13 9 13 11 14 12 19 3 8 2 4 3 4 1 0 14
+* 5p + 17 11620 13344 13117 11790 49871 30 32 75 123 16 16 4 12 26 10 5 19 4 1 6 1 0 0 0 2 14
+* 5p + 18 11726 13579 12756 11803 49864 26 39 68 128 12 14 13 10 27 19 8 23 7 3 1 1 5 0 3 1 14
+* 5p + 19 11819 13302 12795 11948 49864 32 29 72 133 14 21 9 16 17 12 6 25 3 3 4 5 2 1 5 1 14
+* 5p + 20 11673 13493 13011 11688 49865 32 51 76 140 12 12 6 11 22 17 5 24 0 2 0 4 4 2 1 1 13
+* 5p + 21 12029 13261 12969 11605 49864 42 31 78 124 15 10 17 18 25 12 8 19 2 2 3 3 2 5 2 0 13
+* 5p + 22 11893 13356 12848 11770 49867 35 42 70 124 8 9 7 20 15 14 13 25 2 4 3 1 3 0 2 1 13
+* 5p + 23 11762 13516 12838 11745 49861 31 44 78 141 9 7 11 17 23 25 21 19 1 0 4 4 4 3 3 1 13
+* 5p + 24 11981 13169 12946 11763 49859 33 44 78 150 16 11 5 12 17 15 10 16 1 3 2 2 0 5 2 5 13
+* 5p + 25 11886 13340 12773 11864 49863 43 55 67 111 21 13 16 16 18 17 13 20 0 5 1 1 3 2 1 2 13
+* 5p + 26 11768 13356 12938 11778 49840 37 52 75 129 13 19 16 13 21 23 12 22 5 4 1 2 3 4 1 0 12
+* 5p + 27 12098 12964 13057 11703 49822 46 41 75 121 18 13 8 15 19 17 12 17 3 3 4 1 5 1 2 1 12
+* 5p + 28 11957 13347 12632 11863 49799 40 45 78 144 18 12 14 11 27 16 8 13 6 3 3 3 4 1 3 0 12
+* 5p + 29 11708 13373 12980 11720 49781 44 45 59 126 23 10 11 18 20 19 12 19 0 4 3 1 2 3 0 2 12
+* 5p + 30 11925 13204 12808 11827 49764 37 47 71 132 14 12 17 14 23 10 10 22 1 1 3 6 1 0 0 2 12
+* 5p + 31 12137 13137 12772 11694 49740 33 43 94 114 13 12 13 14 20 13 16 14 2 6 3 1 1 4 1 0 11
+* 5p + 32 11857 13251 12968 11640 49716 42 36 81 139 16 15 8 16 19 24 15 21 3 4 3 4 1 7 0 1 11
+* 5p + 33 11958 13247 13112 11382 49699 51 37 102 129 16 24 8 9 21 15 10 28 0 3 1 0 1 3 2 1 11
+* 5p + 34 12139 13090 12842 11600 49671 47 42 67 133 16 13 6 18 23 30 10 15 4 1 3 0 2 1 0 0 11
+* 5p + 35 11880 13185 12996 11539 49600 40 41 64 121 12 15 12 13 20 17 8 24 3 2 2 3 0 3 3 1 11
+* 5p + 36 12061 12970 13027 11472 49530 41 46 81 128 22 12 14 15 33 17 13 21 2 3 8 4 1 1 0 6 11
+* 5p + 37 11835 13100 12894 11610 49439 39 54 80 150 18 9 6 11 26 11 7 20 3 4 4 3 2 2 3 1 11
+* 5p + 38 11680 13095 12959 11589 49323 50 37 73 158 14 18 13 15 27 21 14 23 1 2 2 1 2 1 2 0 11
+* 5p + 39 12146 12765 12844 11442 49197 59 47 77 128 18 13 7 15 15 16 9 21 3 4 3 4 1 4 1 0 10
+* 5p + 40 11943 12897 12650 11541 49031 55 42 55 122 11 13 13 14 21 21 9 23 1 8 0 4 3 2 2 1 9
+* 5p + 41 11879 12720 12735 11507 48841 39 43 63 131 18 15 7 13 22 22 9 23 2 6 0 3 1 2 0 2 9
+* 5p + 42 11865 12580 12729 11451 48625 43 36 74 154 16 19 13 9 32 30 9 19 2 1 4 3 0 1 1 4 8
+* 5p + 43 11759 12574 12616 11417 48366 40 41 71 148 19 19 11 17 21 17 5 20 4 3 0 2 3 6 0 2 7
+* 5p + 44 11618 12524 12571 11368 48081 44 53 57 126 16 7 11 13 25 19 10 21 4 3 1 4 2 0 2 2 7
+* 5p + 45 11698 12318 12388 11360 47764 42 38 73 137 23 8 18 19 23 21 20 23 3 1 6 6 1 2 1 0 7
+* 5p + 46 11754 12271 12184 11177 47386 43 33 60 148 15 19 11 17 28 16 13 29 2 1 3 3 0 2 3 1 7
+* 5p + 47 11321 12199 12215 11243 46978 55 44 63 150 17 16 5 22 22 24 13 20 1 5 2 2 2 3 1 1 7
+* 5p + 48 11398 11905 12112 11092 46507 45 50 64 138 16 15 8 16 27 23 9 12 5 2 0 1 2 1 5 3 7
+* 5p + 49 11260 11943 11774 11039 46016 51 41 59 138 18 17 9 15 19 13 16 19 2 3 1 3 1 3 0 1 7
+* 5p + 50 11076 11843 11737 10857 45513 54 51 49 116 8 15 10 17 24 17 7 16 4 3 4 3 2 2 1 0 7
+* 5p + 51 11092 11606 11601 10693 44992 46 38 66 126 15 8 14 16 14 15 9 24 3 3 5 3 4 3 1 1 6
+* 5p + 52 11031 11485 11468 10443 44427 53 28 57 137 18 16 17 18 19 11 6 12 6 4 2 0 3 2 2 1 6
+* 5p + 53 10701 11540 11327 10279 43847 44 43 69 139 14 18 12 11 21 31 8 25 0 1 5 3 5 0 1 0 6
+* 5p + 54 10809 11084 11238 10088 43219 40 49 55 123 19 12 11 16 20 29 12 14 3 1 0 4 1 1 0 1 6
+* 5p + 55 10627 10836 10845 10228 42536 46 40 37 129 12 14 13 18 16 15 12 17 4 5 4 1 1 0 0 2 6
+* 5p + 56 10344 10637 11043 9897 41921 45 41 57 131 17 9 10 14 23 18 8 20 5 3 2 2 0 1 1 3 6
+* 5p + 57 10242 10526 10762 9681 41211 50 30 50 126 24 10 8 12 15 19 13 17 2 2 2 1 0 3 0 2 6
+* 5p + 58 10005 10236 10502 9730 40473 36 35 48 122 11 15 10 12 22 13 14 13 0 1 1 3 2 5 1 1 5
+* 5p + 59 9803 10297 10275 9390 39765 53 33 54 109 16 7 9 13 21 14 8 17 1 1 7 4 0 0 0 0 5
+* 5p + 60 9510 9965 10269 9297 39041 52 40 50 120 12 14 10 17 23 11 17 18 0 1 3 4 2 1 0 1 5
+* 5p + 61 9576 9770 9976 8975 38297 47 44 48 103 13 15 14 15 11 17 8 9 2 3 2 0 2 3 2 0 4
+* 5p + 62 9116 9731 9726 9036 37609 56 31 50 104 17 11 14 13 22 20 12 20 1 1 1 0 2 0 1 0 3
+* 5p + 63 9098 9244 9719 8812 36873 43 27 48 121 23 12 10 13 19 15 5 16 4 2 1 2 3 0 0 0 2
+* 5p + 64 8941 9267 9254 8661 36123 49 28 50 109 22 12 10 16 25 23 8 17 5 0 2 1 1 1 1 1 2
+* 5p + 65 8761 8842 9344 8383 35330 43 30 42 111 13 10 6 15 16 15 9 28 4 5 1 2 2 1 2 2 1
+* 5p + 66 8608 8803 9095 8091 34597 30 20 35 106 15 15 14 15 12 15 19 10 3 1 3 1 1 0 1 1 1
+* 5p + 67 8553 8595 8644 8157 33949 33 28 43 86 15 11 6 11 21 20 9 9 0 3 1 1 0 3 0 0 1
+* 5p + 68 8049 8498 8729 7944 33220 47 30 29 85 12 9 10 17 17 22 10 15 2 1 4 0 2 1 1 1 1
+* 5p + 69 8068 8046 8551 7735 32400 50 33 35 101 14 12 8 11 12 15 8 29 7 1 1 3 1 1 0 2 1
+* 5p + 70 7903 7922 8213 7718 31756 37 26 41 94 10 11 10 19 15 16 16 11 1 1 1 2 0 0 2 2 0
+* 5p - 1 15710 17330 11242 5841 50123 51 57 109 68 38 10 12 29 27 21 20 25 0 0 0 0 0 0 0 0 26
+* 5p - 2 10685 9594 14829 15016 50124 34 72 91 78 16 23 14 15 27 13 18 32 0 1 0 0 0 0 0 0 21
+* 5p - 3 13042 14086 10553 12442 50123 46 84 51 67 24 20 9 22 25 20 6 37 0 0 0 2 0 0 0 0 19
+* 5p - 4 12423 13811 11787 12093 50114 54 60 46 97 21 20 17 14 20 17 17 24 1 1 3 0 1 2 0 0 16
+* 5p - 5 11889 12890 12874 12467 50120 30 52 70 105 12 11 7 15 17 18 7 18 0 0 1 1 2 0 4 1 15
+* 5p - 6 11814 13278 12089 12938 50119 38 51 84 138 9 17 7 17 19 24 17 21 4 0 1 4 3 2 2 1 14
+* 5p - 7 12205 12687 12912 12313 50117 38 53 83 126 9 10 9 18 16 11 11 18 3 1 6 1 2 3 2 3 13
+* 5p - 8 11137 13996 11382 13604 50119 34 52 75 161 24 18 8 14 17 25 9 18 3 3 3 1 0 3 2 2 13
+* 5p - 9 10835 13396 12449 13436 50116 34 35 66 126 15 15 4 7 12 18 6 31 3 0 1 3 3 4 1 3 12
+* 5p - 10 11346 13574 12904 12295 50119 36 43 71 123 11 13 6 9 14 10 10 27 2 2 3 0 4 0 3 0 12
+* 5p - 11 11598 13646 13140 11735 50119 28 52 88 121 14 12 13 15 22 16 7 21 4 2 1 2 2 4 0 2 12
+* 5p - 12 11866 13354 13129 11762 50111 32 43 109 108 10 10 11 10 23 12 15 28 2 2 2 1 4 5 3 3 11
+* 5p - 13 11731 13830 12654 11900 50115 33 42 82 115 23 12 16 12 17 16 11 22 3 3 1 4 3 3 4 1 11
+* 5p - 14 11453 13701 12912 12052 50118 36 39 59 126 10 11 13 9 17 15 10 25 0 2 1 1 0 4 2 2 11
+* 5p - 15 11862 13392 12833 11983 50070 31 43 107 135 17 10 15 17 23 14 6 18 1 0 2 1 1 2 1 1 11
+* 5p - 16 11680 13710 12777 11957 50124 45 47 72 132 13 14 5 10 20 12 8 33 2 3 2 1 1 0 1 1 10
+* 5p - 17 11723 13532 13136 11734 50125 42 44 62 118 19 15 12 12 16 19 6 19 1 3 2 1 0 1 1 0 10
+* 5p - 18 11839 13554 12901 11829 50123 32 55 67 120 14 11 12 10 25 17 8 21 1 4 0 1 1 2 0 1 10
+* 5p - 19 11636 13637 12736 12112 50121 44 42 76 116 20 16 6 18 21 14 8 21 3 3 0 1 1 2 1 2 10
+* 5p - 20 11554 13721 12974 11873 50122 40 48 74 102 16 10 10 18 19 15 7 22 3 5 2 4 2 1 1 1 10
+* 5p - 21 12059 13217 13083 11762 50121 37 38 87 118 14 11 8 16 16 13 8 18 2 3 1 2 1 1 1 3 10
+* 5p - 22 11877 13682 12771 11787 50117 34 51 77 133 12 16 7 16 18 18 11 19 2 5 1 3 2 2 1 2 10
+* 5p - 23 11657 13553 13102 11806 50118 31 41 88 129 18 14 9 15 19 21 9 17 2 1 0 5 2 1 1 0 10
+* 5p - 24 12135 13168 12949 11862 50114 32 42 84 122 13 11 9 16 11 14 10 28 3 6 4 0 1 4 1 0 10
+* 5p - 25 11832 13394 12822 12061 50109 40 45 87 127 16 19 12 13 17 21 10 14 2 6 3 1 4 2 1 2 10
+* 5p - 26 11787 13366 13096 11846 50095 38 37 73 152 11 10 4 16 20 17 6 18 8 7 1 2 1 0 3 2 9
+* 5p - 27 11952 13464 12948 11716 50080 32 43 61 130 21 15 12 13 23 22 9 25 1 1 2 6 2 1 1 0 9
+* 5p - 28 11810 13326 12919 12001 50056 54 52 81 131 19 13 5 17 26 18 13 26 2 1 1 2 1 2 0 0 9
+* 5p - 29 11744 13329 12962 12004 50039 36 52 79 140 13 13 11 20 30 16 7 30 2 2 1 0 2 2 2 2 9
+* 5p - 30 11984 13150 13043 11849 50026 39 48 68 138 13 16 13 10 21 19 18 18 9 3 2 1 1 2 1 1 9
+* 5p - 31 11898 13500 12879 11733 50010 50 51 78 132 15 15 10 13 26 17 5 25 4 2 3 4 0 1 0 0 9
+* 5p - 32 11905 13341 13068 11659 49973 49 40 92 124 21 20 11 20 23 19 10 22 6 1 2 0 7 3 2 1 9
+* 5p - 33 12067 13235 13022 11625 49949 40 40 68 149 14 16 8 13 19 20 10 26 2 3 4 2 1 1 5 2 9
+* 5p - 34 11984 13228 12800 11892 49904 53 50 63 126 19 17 7 16 24 20 9 29 2 4 1 0 4 2 3 1 9
+* 5p - 35 11696 13377 12996 11753 49822 44 41 66 143 18 12 10 13 20 20 15 23 4 2 3 3 5 3 1 3 9
+* 5p - 36 12044 13030 12946 11736 49756 46 56 70 146 15 12 15 9 28 20 11 30 3 3 5 5 4 2 2 3 9
+* 5p - 37 12071 13233 12640 11709 49653 36 41 72 141 12 20 8 15 21 19 6 21 6 5 5 3 1 3 1 0 8
+* 5p - 38 11980 13049 12873 11637 49539 44 57 80 149 16 11 7 15 21 19 10 27 4 1 1 4 5 4 1 1 8
+* 5p - 39 12042 12803 13030 11538 49413 57 46 59 141 14 23 10 20 21 18 12 19 6 1 4 4 1 0 1 2 8
+* 5p - 40 12052 12860 12712 11640 49264 39 41 68 128 18 14 7 15 29 21 11 18 4 1 2 1 2 4 2 0 7
+* 5p - 41 11601 12866 13014 11587 49068 45 45 58 130 12 12 14 13 23 19 12 26 3 2 3 4 3 7 2 0 7
+* 5p - 42 11897 12687 12735 11521 48840 43 44 80 143 17 13 12 11 27 15 14 28 3 3 1 3 2 3 1 3 7
+* 5p - 43 11840 12797 12448 11488 48573 42 41 82 140 20 17 5 18 12 23 16 15 0 1 5 2 1 1 0 1 6
+* 5p - 44 11466 12752 12737 11298 48253 40 47 83 139 15 21 10 15 23 16 9 24 1 4 4 1 4 1 1 0 6
+* 5p - 45 11699 12298 12588 11333 47918 42 41 61 139 19 7 12 10 20 20 11 23 1 3 0 4 1 1 0 1 6
+* 5p - 46 11514 12623 12108 11300 47545 50 51 54 147 18 15 8 16 17 32 8 19 1 1 3 2 1 1 3 1 6
+* 5p - 47 11221 12282 12353 11259 47115 46 48 57 128 18 13 12 17 20 12 16 26 4 6 5 3 2 1 2 2 6
+* 5p - 48 11375 12021 12232 11057 46685 47 39 42 132 18 19 11 17 28 18 13 29 2 2 3 2 4 3 0 2 6
+* 5p - 49 11497 11872 11933 10869 46171 42 33 58 145 20 16 10 14 18 27 10 15 3 2 7 1 2 2 2 1 6
+* 5p - 50 11197 11761 12068 10614 45640 49 37 55 129 22 16 15 15 11 23 15 19 2 3 4 3 2 2 1 2 6
+* 5p - 51 11107 11614 11852 10517 45090 38 33 50 134 12 13 12 20 11 18 12 24 3 4 5 2 3 1 2 0 6
+* 5p - 52 10790 11550 11676 10547 44563 63 41 54 131 19 13 17 15 19 23 14 22 3 8 5 0 0 2 3 1 6
+* 5p - 53 10846 11393 11471 10212 43922 50 31 41 134 18 22 11 13 15 20 10 27 3 4 2 3 5 4 1 3 6
+* 5p - 54 10696 10991 11339 10267 43293 34 25 53 137 14 15 16 14 24 20 13 16 1 2 3 1 2 0 1 1 6
+* 5p - 55 10402 11159 11022 10064 42647 48 38 60 129 15 8 9 11 25 14 8 28 3 4 3 6 1 4 4 1 4
+* 5p - 56 10179 10899 10993 9883 41954 59 38 52 129 13 15 9 22 21 23 11 26 2 0 3 2 2 2 1 2 4
+* 5p - 57 10259 10497 10686 9816 41258 46 39 53 114 22 14 15 13 26 25 7 20 1 3 1 5 1 1 3 1 4
+* 5p - 58 9884 10460 10493 9675 40512 58 41 47 112 14 18 15 12 9 22 16 13 1 2 0 1 3 0 2 3 4
+* 5p - 59 9824 10103 10469 9466 39862 47 45 58 122 12 12 15 9 13 17 9 16 4 2 1 3 2 2 4 1 4
+* 5p - 60 9607 9912 10297 9334 39150 47 35 43 118 11 14 13 15 19 15 21 14 1 1 2 1 3 2 2 1 3
+* 5p - 61 9411 9835 9896 9261 38403 46 36 40 128 11 16 17 17 11 22 8 27 4 2 3 1 1 6 2 3 3
+* 5p - 62 9211 9704 9757 8967 37639 50 46 50 109 14 16 13 11 15 17 5 14 2 2 1 1 2 3 2 1 2
+* 5p - 63 9097 9450 9564 8777 36888 47 36 39 117 16 12 14 6 16 19 12 22 0 1 1 1 2 5 4 1 2
+* 5p - 64 9071 9130 9378 8540 36119 50 32 39 104 10 11 10 19 17 21 11 21 2 3 3 1 0 1 4 3 2
+* 5p - 65 8597 8960 9310 8506 35373 39 38 40 121 14 12 9 10 18 11 13 14 3 2 3 5 4 1 0 2 2
+* 5p - 66 8548 8755 9158 8182 34643 54 34 30 112 17 14 9 18 20 10 7 16 1 0 4 2 2 2 2 2 2
+* 5p - 67 8353 8601 8760 8149 33863 43 23 22 107 11 7 8 13 19 16 9 19 3 4 2 4 0 2 1 0 2
+* 5p - 68 8154 8396 8649 7909 33108 34 23 40 115 11 9 15 13 17 14 5 17 1 4 1 2 3 1 0 1 1
+* 5p - 69 8060 7987 8451 7723 32221 45 16 31 94 8 5 9 16 22 20 6 10 2 2 3 1 1 3 1 0 1
+* 5p - 70 7902 8017 8124 7612 31655 42 19 26 83 9 12 14 15 10 16 11 14 0 4 1 3 1 1 1 1 1
diff --git a/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.summary b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.summary
new file mode 100644
index 0000000..20f8e81
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/alignment/Pi1889.summary
@@ -0,0 +1,236 @@
+# Command:
+# /home/mischu/bin/pypeline/bin/bam_pipeline run 000_makefile.yaml --max-threads 16 --bwa-max-threads 4
+#
+# Directory:
+# /net/franklin/disk/franklin/data/mischu/projects/2013_09_nature_protocols/FINAL/alignment
+#
+# Makefile:
+# Filename: 000_makefile.yaml
+# SHA1Sum: ee7644dd0ecfee606a2873441020247def0e2355
+# MTime: 2013-10-21 17:37:32.317334
+#
+# Genomes:
+# Name Label Contigs Size Prefix
+# Pi_mito mitochondrial 1 37922 000_prefixes/Pi_mito.fasta
+# Pi_nucl nuclear 4921 228543505 000_prefixes/Pi_nucl.fasta
+#
+# Regions Of Interest:
+# Genome ROI Size NFeatures NIntervals Path
+#
+#
+Target Sample Library Measure Value # Description
+Pi1889 * * lib_type * # SE, PE, or * (for both)
+Pi1889 * * seq_reads_se 373965151 # Total number of single-ended reads
+Pi1889 * * seq_trash_se 33208573 # Total number of trashed reads
+Pi1889 * * seq_trash_se_frac 0.0888012503604 # Fraction of SE reads trashed
+Pi1889 * * seq_reads_pairs 403930795 # Total number of reads
+Pi1889 * * seq_trash_pe_1 121544 # Total number of reads
+Pi1889 * * seq_trash_pe_1_frac 0.000300903029689 # Fraction of PE mate 1 reads trashed
+Pi1889 * * seq_trash_pe_2 27737549 # Total number of reads
+Pi1889 * * seq_trash_pe_2_frac 0.0686690624814 # Fraction of PE mate 2 reads trashed
+Pi1889 * * seq_collapsed 364857043 # Total number of pairs collapsed into one read
+Pi1889 * * seq_collapsed_frac 0.903266221631 # Fraction of PE pairs collapsed into one read
+Pi1889 * * seq_retained_reads 755902032 # Total number of retained reads
+Pi1889 * * seq_retained_nts 59693699645 # Total number of NTs in retained reads
+Pi1889 * * seq_retained_length 78.9701536945 # Average number of NTs in retained reads
+
+Pi1889 * * hits_raw(endogenous) 43409647 # Total number of hits against the nuclear and mitochondrial genome
+Pi1889 * * hits_raw_frac(endogenous) 0.0574276098784 # Total number of hits vs. total number of reads retained
+Pi1889 * * hits_clonality(endogenous) 0.637265122197 # Fraction of hits that were PCR duplicates
+Pi1889 * * hits_unique(endogenous) 15746193 # Total number of unique reads (PCR duplicates removed)
+Pi1889 * * hits_unique_frac(endogenous) 0.0208309970517 # Total number of unique hits vs. total number of reads retained
+Pi1889 * * hits_coverage(endogenous) 5.46270395801 # Estimated coverage from unique hits
+Pi1889 * * hits_length(endogenous) 79.2999721266 # Average number of aligned bases per unique hit
+Pi1889 * * ratio_reads(nuc,mito) 315.036307803 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1889 * * ratio_genome(mito,nuc) 41.1313169889 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1889 * * ratio_genome(nuc,mito) 0.02431237493 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1889 * * hits_raw(mitochondrial) 163719 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 * * hits_raw_frac(mitochondrial) 0.000216587590811 # Total number of hits vs. total number of reads retained
+Pi1889 * * hits_clonality(mitochondrial) 0.695673684789 # Fraction of hits that were PCR duplicates
+Pi1889 * * hits_unique(mitochondrial) 49824 # Total number of hits (excluding any PCR duplicates)
+Pi1889 * * hits_unique_frac(mitochondrial) 6.59133034319e-05 # Total number of unique hits vs. total number of reads retained
+Pi1889 * * hits_coverage(mitochondrial) 111.980618111 # Estimated coverage from unique hits
+Pi1889 * * hits_length(mitochondrial) 85.2305916827 # Average number of aligned bases per unique hit
+
+Pi1889 * * hits_raw(nuclear) 43245928 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 * * hits_raw_frac(nuclear) 0.0572110222876 # Total number of hits vs. total number of reads retained
+Pi1889 * * hits_clonality(nuclear) 0.637044000998 # Fraction of hits that were PCR duplicates
+Pi1889 * * hits_unique(nuclear) 15696369 # Total number of hits (excluding any PCR duplicates)
+Pi1889 * * hits_unique_frac(nuclear) 0.0207650837483 # Total number of unique hits vs. total number of reads retained
+Pi1889 * * hits_coverage(nuclear) 5.44502954481 # Estimated coverage from unique hits
+Pi1889 * * hits_length(nuclear) 79.2811469328 # Average number of aligned bases per unique hit
+
+
+Pi1889 Pi1889 * lib_type * # SE, PE, or * (for both)
+Pi1889 Pi1889 * seq_reads_se 373965151 # Total number of single-ended reads
+Pi1889 Pi1889 * seq_trash_se 33208573 # Total number of trashed reads
+Pi1889 Pi1889 * seq_trash_se_frac 0.0888012503604 # Fraction of SE reads trashed
+Pi1889 Pi1889 * seq_reads_pairs 403930795 # Total number of reads
+Pi1889 Pi1889 * seq_trash_pe_1 121544 # Total number of reads
+Pi1889 Pi1889 * seq_trash_pe_1_frac 0.000300903029689 # Fraction of PE mate 1 reads trashed
+Pi1889 Pi1889 * seq_trash_pe_2 27737549 # Total number of reads
+Pi1889 Pi1889 * seq_trash_pe_2_frac 0.0686690624814 # Fraction of PE mate 2 reads trashed
+Pi1889 Pi1889 * seq_collapsed 364857043 # Total number of pairs collapsed into one read
+Pi1889 Pi1889 * seq_collapsed_frac 0.903266221631 # Fraction of PE pairs collapsed into one read
+Pi1889 Pi1889 * seq_retained_reads 755902032 # Total number of retained reads
+Pi1889 Pi1889 * seq_retained_nts 59693699645 # Total number of NTs in retained reads
+Pi1889 Pi1889 * seq_retained_length 78.9701536945 # Average number of NTs in retained reads
+
+Pi1889 Pi1889 * hits_raw(endogenous) 43409647 # Total number of hits against the nuclear and mitochondrial genome
+Pi1889 Pi1889 * hits_raw_frac(endogenous) 0.0574276098784 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 * hits_clonality(endogenous) 0.637265122197 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 * hits_unique(endogenous) 15746193 # Total number of unique reads (PCR duplicates removed)
+Pi1889 Pi1889 * hits_unique_frac(endogenous) 0.0208309970517 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 * hits_coverage(endogenous) 5.46270395801 # Estimated coverage from unique hits
+Pi1889 Pi1889 * hits_length(endogenous) 79.2999721266 # Average number of aligned bases per unique hit
+Pi1889 Pi1889 * ratio_reads(nuc,mito) 315.036307803 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1889 Pi1889 * ratio_genome(mito,nuc) 41.1313169889 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1889 Pi1889 * ratio_genome(nuc,mito) 0.02431237493 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1889 Pi1889 * hits_raw(mitochondrial) 163719 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 * hits_raw_frac(mitochondrial) 0.000216587590811 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 * hits_clonality(mitochondrial) 0.695673684789 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 * hits_unique(mitochondrial) 49824 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 * hits_unique_frac(mitochondrial) 6.59133034319e-05 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 * hits_coverage(mitochondrial) 111.980618111 # Estimated coverage from unique hits
+Pi1889 Pi1889 * hits_length(mitochondrial) 85.2305916827 # Average number of aligned bases per unique hit
+
+Pi1889 Pi1889 * hits_raw(nuclear) 43245928 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 * hits_raw_frac(nuclear) 0.0572110222876 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 * hits_clonality(nuclear) 0.637044000998 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 * hits_unique(nuclear) 15696369 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 * hits_unique_frac(nuclear) 0.0207650837483 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 * hits_coverage(nuclear) 5.44502954481 # Estimated coverage from unique hits
+Pi1889 Pi1889 * hits_length(nuclear) 79.2811469328 # Average number of aligned bases per unique hit
+
+
+Pi1889 Pi1889 Pi1889_id_CTTGTA lib_type * # SE, PE, or * (for both)
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_reads_se 107630753 # Total number of single-ended reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_trash_se 12221068 # Total number of trashed reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_trash_se_frac 0.113546246397 # Fraction of SE reads trashed
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_reads_pairs 111164719 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_trash_pe_1 38064 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_trash_pe_1_frac 0.00034241079672 # Fraction of PE mate 1 reads trashed
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_trash_pe_2 9470730 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_trash_pe_2_frac 0.0851954656585 # Fraction of PE mate 2 reads trashed
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_collapsed 98243284 # Total number of pairs collapsed into one read
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_collapsed_frac 0.883763165902 # Fraction of PE pairs collapsed into one read
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_retained_reads 209987045 # Total number of retained reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_retained_nts 15689307033 # Total number of NTs in retained reads
+Pi1889 Pi1889 Pi1889_id_CTTGTA seq_retained_length 74.7155951121 # Average number of NTs in retained reads
+
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_raw(endogenous) 12103744 # Total number of hits against the nuclear and mitochondrial genome
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_raw_frac(endogenous) 0.0576404320562 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_clonality(endogenous) 0.567883788686 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_unique(endogenous) 5230224 # Total number of unique reads (PCR duplicates removed)
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_unique_frac(endogenous) 0.0249073651186 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_coverage(endogenous) 1.77168965263 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_length(endogenous) 77.429828818 # Average number of aligned bases per unique hit
+Pi1889 Pi1889 Pi1889_id_CTTGTA ratio_reads(nuc,mito) 352.560738187 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1889 Pi1889 Pi1889_id_CTTGTA ratio_genome(mito,nuc) 36.4662593864 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1889 Pi1889 Pi1889_id_CTTGTA ratio_genome(nuc,mito) 0.0274226097446 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_raw(mitochondrial) 38485 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_raw_frac(mitochondrial) 0.000183273210974 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_clonality(mitochondrial) 0.615616473951 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_unique(mitochondrial) 14793 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_unique_frac(mitochondrial) 7.04472030644e-05 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_coverage(mitochondrial) 32.2113548863 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_length(mitochondrial) 82.574122896 # Average number of aligned bases per unique hit
+
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_raw(nuclear) 12065259 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_raw_frac(nuclear) 0.0574571588452 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_clonality(nuclear) 0.567731533985 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_unique(nuclear) 5215431 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_unique_frac(nuclear) 0.0248369179156 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_coverage(nuclear) 1.76663882879 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_CTTGTA hits_length(nuclear) 77.4152375901 # Average number of aligned bases per unique hit
+
+
+Pi1889 Pi1889 Pi1889_id_GGCTAC lib_type * # SE, PE, or * (for both)
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_reads_se 119254408 # Total number of single-ended reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_trash_se 9113800 # Total number of trashed reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_trash_se_frac 0.0764231708735 # Fraction of SE reads trashed
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_reads_pairs 126617217 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_trash_pe_1 33752 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_trash_pe_1_frac 0.000266567223634 # Fraction of PE mate 1 reads trashed
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_trash_pe_2 7448312 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_trash_pe_2_frac 0.0588254281406 # Fraction of PE mate 2 reads trashed
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_collapsed 116109710 # Total number of pairs collapsed into one read
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_collapsed_frac 0.917013600133 # Fraction of PE pairs collapsed into one read
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_retained_reads 239783268 # Total number of retained reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_retained_nts 18989407027 # Total number of NTs in retained reads
+Pi1889 Pi1889 Pi1889_id_GGCTAC seq_retained_length 79.1940454619 # Average number of NTs in retained reads
+
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_raw(endogenous) 13970375 # Total number of hits against the nuclear and mitochondrial genome
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_raw_frac(endogenous) 0.0582625097928 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_clonality(endogenous) 0.598061183039 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_unique(endogenous) 5615236 # Total number of unique reads (PCR duplicates removed)
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_unique_frac(endogenous) 0.0234179642593 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_coverage(endogenous) 1.9662827549 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_length(endogenous) 80.0421777464 # Average number of aligned bases per unique hit
+Pi1889 Pi1889 Pi1889_id_GGCTAC ratio_reads(nuc,mito) 313.843622091 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1889 Pi1889 Pi1889_id_GGCTAC ratio_genome(mito,nuc) 41.4130658588 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1889 Pi1889 Pi1889_id_GGCTAC ratio_genome(nuc,mito) 0.0241469685777 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_raw(mitochondrial) 51414 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_raw_frac(mitochondrial) 0.000214418630744 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_clonality(mitochondrial) 0.653110047847 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_unique(mitochondrial) 17835 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_unique_frac(mitochondrial) 7.43796685597e-05 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_coverage(mitochondrial) 40.5822214018 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_length(mitochondrial) 86.2887019905 # Average number of aligned bases per unique hit
+
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_raw(nuclear) 13918961 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_raw_frac(nuclear) 0.0580480911621 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_clonality(nuclear) 0.597857842981 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_unique(nuclear) 5597401 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_unique_frac(nuclear) 0.0233435845907 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_coverage(nuclear) 1.95987525001 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_GGCTAC hits_length(nuclear) 80.0222744449 # Average number of aligned bases per unique hit
+
+
+Pi1889 Pi1889 Pi1889_id_TAGCTT lib_type * # SE, PE, or * (for both)
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_reads_se 147079990 # Total number of single-ended reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_trash_se 11873705 # Total number of trashed reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_trash_se_frac 0.0807295744309 # Fraction of SE reads trashed
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_reads_pairs 166148859 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_trash_pe_1 49728 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_trash_pe_1_frac 0.000299297872398 # Fraction of PE mate 1 reads trashed
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_trash_pe_2 10818507 # Total number of reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_trash_pe_2_frac 0.0651133391172 # Fraction of PE mate 2 reads trashed
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_collapsed 150504049 # Total number of pairs collapsed into one read
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_collapsed_frac 0.905838595016 # Fraction of PE pairs collapsed into one read
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_retained_reads 306131719 # Total number of retained reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_retained_nts 25014985585 # Total number of NTs in retained reads
+Pi1889 Pi1889 Pi1889_id_TAGCTT seq_retained_length 81.7131451348 # Average number of NTs in retained reads
+
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_raw(endogenous) 17335528 # Total number of hits against the nuclear and mitochondrial genome
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_raw_frac(endogenous) 0.0566276766636 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_clonality(endogenous) 0.717301197864 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_unique(endogenous) 4900733 # Total number of unique reads (PCR duplicates removed)
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_unique_frac(endogenous) 0.0160085763606 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_coverage(endogenous) 1.72473155048 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_length(endogenous) 80.4454352033 # Average number of aligned bases per unique hit
+Pi1889 Pi1889 Pi1889_id_TAGCTT ratio_reads(nuc,mito) 283.992614562 # Ratio of unique hits: Hits(nuc) / H(mito)
+Pi1889 Pi1889 Pi1889_id_TAGCTT ratio_genome(mito,nuc) 45.6056900245 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))
+Pi1889 Pi1889 Pi1889_id_TAGCTT ratio_genome(nuc,mito) 0.0219270884722 # Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))
+
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_raw(mitochondrial) 73820 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_raw_frac(mitochondrial) 0.000241138031175 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_clonality(mitochondrial) 0.767054998645 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_unique(mitochondrial) 17196 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_unique_frac(mitochondrial) 5.61718989988e-05 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_coverage(mitochondrial) 39.1870418227 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_length(mitochondrial) 86.4184112584 # Average number of aligned bases per unique hit
+
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_raw(nuclear) 17261708 # Total number of hits (prior to PCR duplicate filtering)
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_raw_frac(nuclear) 0.0563865386324 # Total number of hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_clonality(nuclear) 0.717088424853 # Fraction of hits that were PCR duplicates
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_unique(nuclear) 4883537 # Total number of hits (excluding any PCR duplicates)
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_unique_frac(nuclear) 0.0159524044616 # Total number of unique hits vs. total number of reads retained
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_coverage(nuclear) 1.71851546602 # Estimated coverage from unique hits
+Pi1889 Pi1889 Pi1889_id_TAGCTT hits_length(nuclear) 80.4244030505 # Average number of aligned bases per unique hit
+
+
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.NO.UDG.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.NO.UDG.txt
new file mode 100644
index 0000000..3a57988
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.NO.UDG.txt
@@ -0,0 +1,14 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 100.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 100.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 85.78209
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 14.21791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 85.78209
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 14.21791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 85.78209
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 14.21791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 76.23356
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 13.464
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 7.46662
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 2.08192
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 0.75391
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDG.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDG.txt
new file mode 100644
index 0000000..b253da4
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDG.txt
@@ -0,0 +1,14 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 100.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 100.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 86.23041
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 13.76959
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 86.23041
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 13.76959
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 86.23041
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 13.76959
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 79.32706
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 12.716
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 5.48006
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 1.42328
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 1.05359
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGa.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGa.txt
new file mode 100644
index 0000000..738d636
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGa.txt
@@ -0,0 +1,29 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 99.95101
+k__Bacteria|p__Actinobacteria 0.04899
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 99.90939
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 0.04899
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.04162
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 83.76256
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 16.14682
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 0.04899
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.04162
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 83.76256
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 16.14682
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.04162
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 0.04013
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.00887
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 83.76256
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 16.14682
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.04162
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 0.04013
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.00887
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 76.48866
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 14.88738
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 5.96167
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 1.31223
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 1.25851
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.04162
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 0.04013
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.00887
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_ananatis 0.00094
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGb.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGb.txt
new file mode 100644
index 0000000..3f449fa
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGb.txt
@@ -0,0 +1,21 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 99.9716
+k__Bacteria|p__Actinobacteria 0.0284
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 99.9716
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 0.0284
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 85.3317
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 14.6399
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 0.0284
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 85.3317
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 14.6399
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 0.0284
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 85.3317
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 14.6399
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 0.0284
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 78.29363
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 13.49337
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 5.49172
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 1.54634
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 1.1439
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 0.0284
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_ananatis 0.00263
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGc.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGc.txt
new file mode 100644
index 0000000..1b842d3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/M.0182896.UDGc.txt
@@ -0,0 +1,37 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 99.94623
+k__Bacteria|p__Actinobacteria 0.03984
+k__Bacteria|p__Bacteroidetes 0.01393
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 99.89578
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.05045
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 0.03984
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.01393
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 82.93364
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 16.95444
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.05045
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 0.03984
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.01393
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales 0.0077
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 82.93364
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 16.95444
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.05045
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 0.03019
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.01393
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.00965
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae 0.0077
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 82.93364
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 16.95444
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.05045
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 0.03019
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.01393
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.00965
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio 0.0077
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 75.21452
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 15.67433
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 6.12355
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 1.59558
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 1.28011
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.05045
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 0.03019
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.00965
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio|s__Thioalkalivibrio_unclassified 0.0077
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1845A.id.CGCTAT.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1845A.id.CGCTAT.txt
new file mode 100644
index 0000000..64c4553
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1845A.id.CGCTAT.txt
@@ -0,0 +1,75 @@
+k__Bacteria 100.0
+k__Bacteria|p__Actinobacteria 65.96496
+k__Bacteria|p__Proteobacteria 32.89385
+k__Bacteria|p__Bacteroidetes 1.14119
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 65.96496
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 15.74193
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 14.31079
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria 2.84114
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria 1.14119
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 45.69488
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales 19.98936
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 12.12615
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 10.07859
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 4.2322
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales 2.84114
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales 1.48299
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales 1.20722
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales 1.14119
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales 0.92557
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales 0.28072
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae 26.96627
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae 19.98936
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 10.63463
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 10.07859
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 6.02952
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 5.29625
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 4.2322
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae 3.21437
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 3.18891
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae 2.84114
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales|f__Erythrobacteraceae 1.48299
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae 1.20722
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 1.17813
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl 1.14119
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae 0.92557
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylocystaceae 0.80038
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 0.51258
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae 0.28072
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae|g__Brevibacterium 26.96627
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella 19.98936
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 10.63463
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 10.07859
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 6.02952
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 5.29625
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia 3.21437
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 3.18891
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae|g__Candidatus_Zinderia 2.84114
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Candidatus_Carsonella 2.13362
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia 2.09858
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales|f__Erythrobacteraceae|g__Erythrobacteraceae_unclassified 1.48299
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae|g__Caulobacter 1.20722
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 1.17813
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl|g__Candidatus_Sulcia 1.14119
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae|g__Acetobacteraceae_unclassified 0.92557
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylocystaceae|g__Methylocystaceae_unclassified 0.80038
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium 0.51258
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium 0.28072
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae|g__Brevibacterium|s__Brevibacterium_linens 26.96627
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_intestinalis 19.98936
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 10.07859
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_unclassified 10.00086
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 5.3647
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia|s__Frankia_unclassified 3.21437
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 3.18891
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae|g__Candidatus_Zinderia|s__Candidatus_Zinderia_unclassified 2.84114
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii 2.13362
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli 2.09858
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae|g__Caulobacter|s__Caulobacter_unclassified 1.20722
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 1.17813
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl|g__Candidatus_Sulcia|s__Candidatus_Sulcia_muelleri 1.14119
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_radiotolerans 0.66482
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_phenanthrenivorans 0.63377
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_acnes 0.31543
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_unclassified 0.28072
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_unclassified 0.19714
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.CTTGTA.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.CTTGTA.txt
new file mode 100644
index 0000000..ca644f8
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.CTTGTA.txt
@@ -0,0 +1,61 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 84.5005
+k__Bacteria|p__Firmicutes 11.00357
+k__Bacteria|p__Actinobacteria 4.45534
+k__Bacteria|p__Bacteroidetes 0.04059
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 84.33874
+k__Bacteria|p__Firmicutes|c__Bacilli 11.00357
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 4.45534
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.16176
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.04059
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 73.90616
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales 11.00357
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 10.43258
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 4.45534
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.16176
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.04059
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 70.46446
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl 11.00357
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 10.43258
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 4.39765
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae 3.4417
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.08949
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 0.07227
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.05769
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.04059
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 70.46446
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium 11.00357
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium 5.6553
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 4.31343
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter 3.38248
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 3.37568
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella 0.91885
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.42319
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.08949
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus 0.08422
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 0.07227
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter 0.05956
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter 0.05922
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.05769
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.04059
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 70.13223
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium|s__Exiguobacterium_sibiricum 11.00357
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_arilaitensis 4.31343
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_unclassified 4.18123
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 3.13912
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_unclassified 3.13194
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_carotovorum 1.47407
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella|s__Rahnella_unclassified 0.91885
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.42319
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 0.23656
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 0.20104
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_johnsonii 0.17158
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.08949
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus 0.08422
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_lwoffii 0.07895
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 0.06906
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_unclassified 0.05956
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter|s__Psychrobacter_unclassified 0.05922
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_putida 0.05879
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.05769
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_mendocina 0.00335
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.GGCTAC.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.GGCTAC.txt
new file mode 100644
index 0000000..0e332ac
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.GGCTAC.txt
@@ -0,0 +1,61 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 84.12097
+k__Bacteria|p__Firmicutes 11.52692
+k__Bacteria|p__Actinobacteria 4.20686
+k__Bacteria|p__Bacteroidetes 0.14525
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 84.0085
+k__Bacteria|p__Firmicutes|c__Bacilli 11.52692
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 4.20686
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.14525
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.11248
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 73.85794
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales 11.52692
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 10.15055
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 4.20686
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.14525
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.11248
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 70.19604
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl 11.52692
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 10.15055
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 4.1946
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae 3.66191
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.14525
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.08435
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 0.02813
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 0.01226
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 70.19604
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium 11.52692
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium 5.61134
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 4.14533
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter 3.60764
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 3.15824
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella 0.98379
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.39718
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.14525
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.08435
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter 0.05427
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus 0.04927
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 0.02813
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium 0.01226
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 69.62544
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium|s__Exiguobacterium_sibiricum 11.52692
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_unclassified 4.20381
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_arilaitensis 4.14533
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_unclassified 3.31128
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 2.9336
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_carotovorum 1.40753
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella|s__Rahnella_unclassified 0.98379
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.38739
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 0.37799
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 0.22464
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_johnsonii 0.20383
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 0.14251
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_lwoffii 0.09253
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.08435
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter|s__Psychrobacter_unclassified 0.05427
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus 0.04927
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_putida 0.04539
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_acnes 0.01226
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_billingiae 0.00506
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_tasmaniensis 0.00474
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_mendocina 0.00471
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.TAGCTT.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.TAGCTT.txt
new file mode 100644
index 0000000..a634e66
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/Pi1889.id.TAGCTT.txt
@@ -0,0 +1,71 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 84.53615
+k__Bacteria|p__Firmicutes 10.9544
+k__Bacteria|p__Actinobacteria 4.47192
+k__Bacteria|p__Chloroflexi 0.03753
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 84.21373
+k__Bacteria|p__Firmicutes|c__Bacilli 10.9544
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 4.47192
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.32243
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia 0.03753
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 73.96409
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales 10.9544
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 10.2167
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 4.45915
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.32243
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia|o__Thermomicrobia_unclassified 0.03753
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales 0.02397
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales 0.01277
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales 0.00896
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 70.15228
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl 10.9544
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 10.2167
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 4.45241
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae 3.81181
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 0.19676
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.12566
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae 0.02397
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae 0.01277
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae 0.00896
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 0.00674
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 70.15228
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium 10.9544
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium 5.3445
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 4.39643
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter 3.761
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 3.25615
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella 1.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.46413
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 0.19676
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter 0.14384
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.12566
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus 0.05598
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter 0.05081
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio 0.02397
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium 0.01277
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas 0.00896
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium 0.00674
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 69.7506
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium|s__Exiguobacterium_sibiricum 10.9544
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_arilaitensis 4.39643
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_unclassified 3.79739
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_unclassified 3.51115
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 3.05066
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_carotovorum 1.54711
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella|s__Rahnella_unclassified 1.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.45852
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 0.2438
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 0.20548
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_johnsonii 0.1717
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_unclassified 0.14384
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.12566
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 0.08718
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_lwoffii 0.07815
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_putida 0.0707
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus 0.05598
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter|s__Psychrobacter_unclassified 0.05081
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio|s__Thioalkalivibrio_unclassified 0.02397
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_unclassified 0.01277
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas|s__Stenotrophomonas_maltophilia 0.00896
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_acnes 0.00674
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_billingiae 0.00561
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_M.0182896.all.html b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_M.0182896.all.html
new file mode 100644
index 0000000..4f32613
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_M.0182896.all.html
@@ -0,0 +1,177 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta charset="utf-8"/>
+ <link rel="shortcut icon" href="http://krona.sourceforge.net/img/favicon.ico"/>
+ <script id="notfound">window.onload=function(){document.body.innerHTML="Could not get resources from \"http://krona.sourceforge.net\"."}</script>
+ <script src="http://krona.sourceforge.net/src/krona-2.0.js"></script>
+ </head>
+ <body>
+ <img id="hiddenImage" src="http://krona.sourceforge.net/img/hidden.png" style="display:none"/>
+ <img id="loadingImage" src="http://krona.sourceforge.net/img/loading.gif" style="display:none"/>
+ <img id="logo" src="http://krona.sourceforge.net/img/logo.png" style="display:none"/>
+ <noscript>Javascript must be enabled to view this page.</noscript>
+ <div style="display:none">
+ <krona collapse="true" key="true">
+ <attributes magnitude="magnitude">
+ <list>members</list>
+ <attribute display="Total">magnitude</attribute>
+ </attributes>
+ <datasets>
+ <dataset>Krona_M.0182896.all</dataset>
+ </datasets>
+<node name="all">
+ <magnitude><val>99.99998</val></magnitude>
+ <node name="Bacteria">
+ <magnitude><val>99.99998</val></magnitude>
+ <node name="Actinobacteria">
+ <magnitude><val>0.04733</val></magnitude>
+ <node name="Actinobacteria">
+ <magnitude><val>0.04733</val></magnitude>
+ <node name="Actinomycetales">
+ <magnitude><val>0.04733</val></magnitude>
+ <node name="Mycobacteriaceae">
+ <magnitude><val>0.00583</val></magnitude>
+ <node name="Mycobacterium">
+ <magnitude><val>0.00583</val></magnitude>
+ <node name="Mycobacterium_unclassified">
+ <magnitude><val>0.00583</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Microbacteriaceae">
+ <magnitude><val>0.00273</val></magnitude>
+ <node name="Clavibacter">
+ <magnitude><val>0.00273</val></magnitude>
+ <node name="Clavibacter_michiganensis">
+ <magnitude><val>0.00273</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Kineosporiaceae">
+ <magnitude><val>0.03877</val></magnitude>
+ <node name="Kineococcus">
+ <magnitude><val>0.03877</val></magnitude>
+ <node name="Kineococcus_radiotolerans">
+ <magnitude><val>0.03877</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Proteobacteria">
+ <magnitude><val>99.94002</val></magnitude>
+ <node name="Alphaproteobacteria">
+ <magnitude><val>0.03042</val></magnitude>
+ <node name="Rhizobiales">
+ <magnitude><val>0.03042</val></magnitude>
+ <node name="Methylobacteriaceae">
+ <magnitude><val>0.03042</val></magnitude>
+ <node name="Methylobacterium">
+ <magnitude><val>0.03042</val></magnitude>
+ <node name="Methylobacterium_unclassified">
+ <magnitude><val>0.03042</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Gammaproteobacteria">
+ <magnitude><val>99.90628</val></magnitude>
+ <node name="Chromatiales">
+ <magnitude><val>0.00931</val></magnitude>
+ <node name="Ectothiorhodospiraceae">
+ <magnitude><val>0.00931</val></magnitude>
+ <node name="Thioalkalivibrio">
+ <magnitude><val>0.00931</val></magnitude>
+ <node name="Thioalkalivibrio_unclassified">
+ <magnitude><val>0.00931</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Enterobacteriales">
+ <magnitude><val>16.33988</val></magnitude>
+ <node name="Enterobacteriaceae">
+ <magnitude><val>16.33988</val></magnitude>
+ <node name="Erwinia">
+ <magnitude><val>0.00436</val></magnitude>
+ <node name="Erwinia_billingiae">
+ <magnitude><val>0.00037</val></magnitude>
+ </node>
+ <node name="Erwinia_unclassified">
+ <magnitude><val>0.00399</val></magnitude>
+ </node>
+ </node>
+ <node name="Pantoea">
+ <magnitude><val>16.33552</val></magnitude>
+ <node name="Pantoea_unclassified">
+ <magnitude><val>15.0525</val></magnitude>
+ </node>
+ <node name="Pantoea_ananatis">
+ <magnitude><val>0.00472</val></magnitude>
+ </node>
+ <node name="Pantoea_vagans">
+ <magnitude><val>1.2783</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Pseudomonadales">
+ <magnitude><val>83.55709</val></magnitude>
+ <node name="Pseudomonadaceae">
+ <magnitude><val>83.55709</val></magnitude>
+ <node name="Pseudomonas">
+ <magnitude><val>83.55709</val></magnitude>
+ <node name="Pseudomonas_unclassified">
+ <magnitude><val>75.87271</val></magnitude>
+ </node>
+ <node name="Pseudomonas_syringae">
+ <magnitude><val>6.12101</val></magnitude>
+ </node>
+ <node name="Pseudomonas_savastanoi">
+ <magnitude><val>0.00141</val></magnitude>
+ </node>
+ <node name="Pseudomonas_fluorescens">
+ <magnitude><val>1.56196</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Betaproteobacteria">
+ <magnitude><val>0.00332</val></magnitude>
+ <node name="Burkholderiales">
+ <magnitude><val>0.00332</val></magnitude>
+ <node name="Burkholderiaceae">
+ <magnitude><val>0.00332</val></magnitude>
+ <node name="Burkholderia">
+ <magnitude><val>0.00332</val></magnitude>
+ <node name="Burkholderia_unclassified">
+ <magnitude><val>0.00332</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Bacteroidetes">
+ <magnitude><val>0.01263</val></magnitude>
+ <node name="Sphingobacteria">
+ <magnitude><val>0.01263</val></magnitude>
+ <node name="Sphingobacteriales">
+ <magnitude><val>0.01263</val></magnitude>
+ <node name="Sphingobacteriaceae">
+ <magnitude><val>0.01263</val></magnitude>
+ <node name="Sphingobacteriaceae_unclassified">
+ <magnitude><val>0.01263</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+</node>
+ </krona>
+</div></body></html>
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_Pi1845A.all.html b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_Pi1845A.all.html
new file mode 100644
index 0000000..9b688d3
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_Pi1845A.all.html
@@ -0,0 +1,252 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta charset="utf-8"/>
+ <link rel="shortcut icon" href="http://krona.sourceforge.net/img/favicon.ico"/>
+ <script id="notfound">window.onload=function(){document.body.innerHTML="Could not get resources from \"http://krona.sourceforge.net\"."}</script>
+ <script src="http://krona.sourceforge.net/src/krona-2.0.js"></script>
+ </head>
+ <body>
+ <img id="hiddenImage" src="http://krona.sourceforge.net/img/hidden.png" style="display:none"/>
+ <img id="loadingImage" src="http://krona.sourceforge.net/img/loading.gif" style="display:none"/>
+ <img id="logo" src="http://krona.sourceforge.net/img/logo.png" style="display:none"/>
+ <noscript>Javascript must be enabled to view this page.</noscript>
+ <div style="display:none">
+ <krona collapse="true" key="true">
+ <attributes magnitude="magnitude">
+ <list>members</list>
+ <attribute display="Total">magnitude</attribute>
+ </attributes>
+ <datasets>
+ <dataset>Krona_Pi1845A.all</dataset>
+ </datasets>
+<node name="all">
+ <magnitude><val>100.00001</val></magnitude>
+ <node name="Bacteria">
+ <magnitude><val>100.00001</val></magnitude>
+ <node name="Actinobacteria">
+ <magnitude><val>65.96496</val></magnitude>
+ <node name="Actinobacteria">
+ <magnitude><val>65.96496</val></magnitude>
+ <node name="Actinomycetales">
+ <magnitude><val>45.69488</val></magnitude>
+ <node name="Micrococcaceae">
+ <magnitude><val>10.63463</val></magnitude>
+ <node name="Arthrobacter">
+ <magnitude><val>10.63463</val></magnitude>
+ <node name="Arthrobacter_phenanthrenivorans">
+ <magnitude><val>0.63377</val></magnitude>
+ </node>
+ <node name="Arthrobacter_unclassified">
+ <magnitude><val>10.00086</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Propionibacteriaceae">
+ <magnitude><val>0.51257</val></magnitude>
+ <node name="Propionibacterium">
+ <magnitude><val>0.51257</val></magnitude>
+ <node name="Propionibacterium_acnes">
+ <magnitude><val>0.31543</val></magnitude>
+ </node>
+ <node name="Propionibacterium_unclassified">
+ <magnitude><val>0.19714</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Mycobacteriaceae">
+ <magnitude><val>3.18891</val></magnitude>
+ <node name="Mycobacterium">
+ <magnitude><val>3.18891</val></magnitude>
+ <node name="Mycobacterium_unclassified">
+ <magnitude><val>3.18891</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Brevibacteriaceae">
+ <magnitude><val>26.96627</val></magnitude>
+ <node name="Brevibacterium">
+ <magnitude><val>26.96627</val></magnitude>
+ <node name="Brevibacterium_linens">
+ <magnitude><val>26.96627</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Kineosporiaceae">
+ <magnitude><val>1.17813</val></magnitude>
+ <node name="Kineococcus">
+ <magnitude><val>1.17813</val></magnitude>
+ <node name="Kineococcus_radiotolerans">
+ <magnitude><val>1.17813</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Frankiaceae">
+ <magnitude><val>3.21437</val></magnitude>
+ <node name="Frankia">
+ <magnitude><val>3.21437</val></magnitude>
+ <node name="Frankia_unclassified">
+ <magnitude><val>3.21437</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Bifidobacteriales">
+ <magnitude><val>0.28072</val></magnitude>
+ <node name="Bifidobacteriaceae">
+ <magnitude><val>0.28072</val></magnitude>
+ <node name="Bifidobacterium">
+ <magnitude><val>0.28072</val></magnitude>
+ <node name="Bifidobacterium_unclassified">
+ <magnitude><val>0.28072</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Coriobacteriales">
+ <magnitude><val>19.98936</val></magnitude>
+ <node name="Coriobacteriaceae">
+ <magnitude><val>19.98936</val></magnitude>
+ <node name="Collinsella">
+ <magnitude><val>19.98936</val></magnitude>
+ <node name="Collinsella_intestinalis">
+ <magnitude><val>19.98936</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Bacteroidetes">
+ <magnitude><val>1.14119</val></magnitude>
+ <node name="Flavobacteria">
+ <magnitude><val>1.14119</val></magnitude>
+ <node name="Flavobacteriales">
+ <magnitude><val>1.14119</val></magnitude>
+ <node name="Flavobacteriales_uncl">
+ <magnitude><val>1.14119</val></magnitude>
+ <node name="Candidatus_Sulcia">
+ <magnitude><val>1.14119</val></magnitude>
+ <node name="Candidatus_Sulcia_muelleri">
+ <magnitude><val>1.14119</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Proteobacteria">
+ <magnitude><val>32.89386</val></magnitude>
+ <node name="Alphaproteobacteria">
+ <magnitude><val>15.74193</val></magnitude>
+ <node name="Rhizobiales">
+ <magnitude><val>12.12615</val></magnitude>
+ <node name="Methylocystaceae">
+ <magnitude><val>0.80038</val></magnitude>
+ <node name="Methylocystaceae_unclassified">
+ <magnitude><val>0.80038</val></magnitude>
+ </node>
+ </node>
+ <node name="Methylobacteriaceae">
+ <magnitude><val>6.02952</val></magnitude>
+ <node name="Methylobacterium">
+ <magnitude><val>6.02952</val></magnitude>
+ <node name="Methylobacterium_radiotolerans">
+ <magnitude><val>0.66482</val></magnitude>
+ </node>
+ <node name="Methylobacterium_unclassified">
+ <magnitude><val>5.3647</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Aurantimonadaceae">
+ <magnitude><val>5.29625</val></magnitude>
+ <node name="Aurantimonadaceae_unclassified">
+ <magnitude><val>5.29625</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Caulobacterales">
+ <magnitude><val>1.20722</val></magnitude>
+ <node name="Caulobacteraceae">
+ <magnitude><val>1.20722</val></magnitude>
+ <node name="Caulobacter">
+ <magnitude><val>1.20722</val></magnitude>
+ <node name="Caulobacter_unclassified">
+ <magnitude><val>1.20722</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Rhodospirillales">
+ <magnitude><val>0.92557</val></magnitude>
+ <node name="Acetobacteraceae">
+ <magnitude><val>0.92557</val></magnitude>
+ <node name="Acetobacteraceae_unclassified">
+ <magnitude><val>0.92557</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Sphingomonadales">
+ <magnitude><val>1.48299</val></magnitude>
+ <node name="Erythrobacteraceae">
+ <magnitude><val>1.48299</val></magnitude>
+ <node name="Erythrobacteraceae_unclassified">
+ <magnitude><val>1.48299</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Gammaproteobacteria">
+ <magnitude><val>14.31079</val></magnitude>
+ <node name="Enterobacteriales">
+ <magnitude><val>4.2322</val></magnitude>
+ <node name="Enterobacteriaceae">
+ <magnitude><val>4.2322</val></magnitude>
+ <node name="Escherichia">
+ <magnitude><val>2.09858</val></magnitude>
+ <node name="Escherichia_coli">
+ <magnitude><val>2.09858</val></magnitude>
+ </node>
+ </node>
+ <node name="Candidatus_Carsonella">
+ <magnitude><val>2.13362</val></magnitude>
+ <node name="Candidatus_Carsonella_ruddii">
+ <magnitude><val>2.13362</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Pseudomonadales">
+ <magnitude><val>10.07859</val></magnitude>
+ <node name="Pseudomonadaceae">
+ <magnitude><val>10.07859</val></magnitude>
+ <node name="Pseudomonas">
+ <magnitude><val>10.07859</val></magnitude>
+ <node name="Pseudomonas_unclassified">
+ <magnitude><val>10.07859</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Betaproteobacteria">
+ <magnitude><val>2.84114</val></magnitude>
+ <node name="Burkholderiales">
+ <magnitude><val>2.84114</val></magnitude>
+ <node name="Oxalobacteraceae">
+ <magnitude><val>2.84114</val></magnitude>
+ <node name="Candidatus_Zinderia">
+ <magnitude><val>2.84114</val></magnitude>
+ <node name="Candidatus_Zinderia_unclassified">
+ <magnitude><val>2.84114</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+</node>
+ </krona>
+</div></body></html>
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_Pi1889.all.html b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_Pi1889.all.html
new file mode 100644
index 0000000..38ffcdd
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Figure_Krona_Pi1889.all.html
@@ -0,0 +1,327 @@
+<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
+ <head>
+ <meta charset="utf-8"/>
+ <link rel="shortcut icon" href="http://krona.sourceforge.net/img/favicon.ico"/>
+ <script id="notfound">window.onload=function(){document.body.innerHTML="Could not get resources from \"http://krona.sourceforge.net\"."}</script>
+ <script src="http://krona.sourceforge.net/src/krona-2.0.js"></script>
+ </head>
+ <body>
+ <img id="hiddenImage" src="http://krona.sourceforge.net/img/hidden.png" style="display:none"/>
+ <img id="loadingImage" src="http://krona.sourceforge.net/img/loading.gif" style="display:none"/>
+ <img id="logo" src="http://krona.sourceforge.net/img/logo.png" style="display:none"/>
+ <noscript>Javascript must be enabled to view this page.</noscript>
+ <div style="display:none">
+ <krona collapse="true" key="true">
+ <attributes magnitude="magnitude">
+ <list>members</list>
+ <attribute display="Total">magnitude</attribute>
+ </attributes>
+ <datasets>
+ <dataset>Krona_Pi1889.all</dataset>
+ </datasets>
+<node name="all">
+ <magnitude><val>100.00001</val></magnitude>
+ <node name="Bacteria">
+ <magnitude><val>100.00001</val></magnitude>
+ <node name="Actinobacteria">
+ <magnitude><val>4.52966</val></magnitude>
+ <node name="Actinobacteria">
+ <magnitude><val>4.52966</val></magnitude>
+ <node name="Actinomycetales">
+ <magnitude><val>4.52158</val></magnitude>
+ <node name="Micrococcaceae">
+ <magnitude><val>4.44789</val></magnitude>
+ <node name="Micrococcus">
+ <magnitude><val>0.08361</val></magnitude>
+ <node name="Micrococcus_luteus">
+ <magnitude><val>0.08361</val></magnitude>
+ </node>
+ </node>
+ <node name="Arthrobacter">
+ <magnitude><val>4.36428</val></magnitude>
+ <node name="Arthrobacter_arilaitensis">
+ <magnitude><val>4.36428</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Mycobacteriaceae">
+ <magnitude><val>0.01895</val></magnitude>
+ <node name="Mycobacterium">
+ <magnitude><val>0.01895</val></magnitude>
+ <node name="Mycobacterium_unclassified">
+ <magnitude><val>0.01895</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Propionibacteriaceae">
+ <magnitude><val>0.02818</val></magnitude>
+ <node name="Propionibacterium">
+ <magnitude><val>0.02818</val></magnitude>
+ <node name="Propionibacterium_acnes">
+ <magnitude><val>0.02818</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Sanguibacteraceae">
+ <magnitude><val>0.01449</val></magnitude>
+ <node name="Sanguibacter">
+ <magnitude><val>0.01449</val></magnitude>
+ <node name="Sanguibacter_keddieii">
+ <magnitude><val>0.01449</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Frankiaceae">
+ <magnitude><val>0.01207</val></magnitude>
+ <node name="Frankia">
+ <magnitude><val>0.01207</val></magnitude>
+ <node name="Frankia_unclassified">
+ <magnitude><val>0.01207</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Bifidobacteriales">
+ <magnitude><val>0.00808</val></magnitude>
+ <node name="Bifidobacteriaceae">
+ <magnitude><val>0.00808</val></magnitude>
+ <node name="Bifidobacterium">
+ <magnitude><val>0.00808</val></magnitude>
+ <node name="Bifidobacterium_unclassified">
+ <magnitude><val>0.00808</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Bacteroidetes">
+ <magnitude><val>0.07999</val></magnitude>
+ <node name="Sphingobacteria">
+ <magnitude><val>0.07999</val></magnitude>
+ <node name="Sphingobacteriales">
+ <magnitude><val>0.07999</val></magnitude>
+ <node name="Sphingobacteriaceae">
+ <magnitude><val>0.07999</val></magnitude>
+ <node name="Sphingobacteriaceae_unclassified">
+ <magnitude><val>0.07999</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Proteobacteria">
+ <magnitude><val>84.15166</val></magnitude>
+ <node name="Alphaproteobacteria">
+ <magnitude><val>0.22196</val></magnitude>
+ <node name="Rhizobiales">
+ <magnitude><val>0.21038</val></magnitude>
+ <node name="Methylobacteriaceae">
+ <magnitude><val>0.11727</val></magnitude>
+ <node name="Methylobacterium">
+ <magnitude><val>0.11727</val></magnitude>
+ <node name="Methylobacterium_radiotolerans">
+ <magnitude><val>0.02073</val></magnitude>
+ </node>
+ <node name="Methylobacterium_unclassified">
+ <magnitude><val>0.09654</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Aurantimonadaceae">
+ <magnitude><val>0.09311</val></magnitude>
+ <node name="Aurantimonadaceae_unclassified">
+ <magnitude><val>0.09311</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Rhodospirillales">
+ <magnitude><val>0.01158</val></magnitude>
+ <node name="Acetobacteraceae">
+ <magnitude><val>0.01158</val></magnitude>
+ <node name="Acetobacteraceae_unclassified">
+ <magnitude><val>0.01158</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Gammaproteobacteria">
+ <magnitude><val>83.9297</val></magnitude>
+ <node name="Xanthomonadales">
+ <magnitude><val>0.02516</val></magnitude>
+ <node name="Xanthomonadaceae">
+ <magnitude><val>0.02516</val></magnitude>
+ <node name="Stenotrophomonas">
+ <magnitude><val>0.02516</val></magnitude>
+ <node name="Stenotrophomonas_maltophilia">
+ <magnitude><val>0.02516</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Chromatiales">
+ <magnitude><val>0.00791</val></magnitude>
+ <node name="Ectothiorhodospiraceae">
+ <magnitude><val>0.00791</val></magnitude>
+ <node name="Thioalkalivibrio">
+ <magnitude><val>0.00791</val></magnitude>
+ <node name="Thioalkalivibrio_unclassified">
+ <magnitude><val>0.00791</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Enterobacteriales">
+ <magnitude><val>10.33468</val></magnitude>
+ <node name="Enterobacteriaceae">
+ <magnitude><val>10.33468</val></magnitude>
+ <node name="Serratia">
+ <magnitude><val>0.00271</val></magnitude>
+ <node name="Serratia_unclassified">
+ <magnitude><val>0.00271</val></magnitude>
+ </node>
+ </node>
+ <node name="Enterobacter">
+ <magnitude><val>0.10093</val></magnitude>
+ <node name="Enterobacter_cloacae">
+ <magnitude><val>0.00126</val></magnitude>
+ </node>
+ <node name="Enterobacter_unclassified">
+ <magnitude><val>0.09967</val></magnitude>
+ </node>
+ </node>
+ <node name="Erwinia">
+ <magnitude><val>0.42828</val></magnitude>
+ <node name="Erwinia_billingiae">
+ <magnitude><val>0.01755</val></magnitude>
+ </node>
+ <node name="Erwinia_tasmaniensis">
+ <magnitude><val>0.00498</val></magnitude>
+ </node>
+ <node name="Erwinia_unclassified">
+ <magnitude><val>0.40575</val></magnitude>
+ </node>
+ </node>
+ <node name="Rahnella">
+ <magnitude><val>1.02689</val></magnitude>
+ <node name="Rahnella_unclassified">
+ <magnitude><val>1.02689</val></magnitude>
+ </node>
+ </node>
+ <node name="Pectobacterium">
+ <magnitude><val>5.56943</val></magnitude>
+ <node name="Pectobacterium_unclassified">
+ <magnitude><val>4.02044</val></magnitude>
+ </node>
+ <node name="Pectobacterium_carotovorum">
+ <magnitude><val>1.54588</val></magnitude>
+ </node>
+ <node name="Pectobacterium_wasabiae">
+ <magnitude><val>0.00311</val></magnitude>
+ </node>
+ </node>
+ <node name="Pantoea">
+ <magnitude><val>3.20644</val></magnitude>
+ <node name="Pantoea_unclassified">
+ <magnitude><val>2.94967</val></magnitude>
+ </node>
+ <node name="Pantoea_ananatis">
+ <magnitude><val>0.00794</val></magnitude>
+ </node>
+ <node name="Pantoea_vagans">
+ <magnitude><val>0.24883</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Pseudomonadales">
+ <magnitude><val>73.56195</val></magnitude>
+ <node name="Pseudomonadaceae">
+ <magnitude><val>69.80932</val></magnitude>
+ <node name="Pseudomonas">
+ <magnitude><val>69.80932</val></magnitude>
+ <node name="Pseudomonas_aeruginosa">
+ <magnitude><val>0.00066</val></magnitude>
+ </node>
+ <node name="Pseudomonas_unclassified">
+ <magnitude><val>69.25646</val></magnitude>
+ </node>
+ <node name="Pseudomonas_mendocina">
+ <magnitude><val>0.01106</val></magnitude>
+ </node>
+ <node name="Pseudomonas_syringae">
+ <magnitude><val>0.15187</val></magnitude>
+ </node>
+ <node name="Pseudomonas_fluorescens">
+ <magnitude><val>0.31989</val></magnitude>
+ </node>
+ <node name="Pseudomonas_putida">
+ <magnitude><val>0.06938</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Moraxellaceae">
+ <magnitude><val>3.75263</val></magnitude>
+ <node name="Psychrobacter">
+ <magnitude><val>0.06983</val></magnitude>
+ <node name="Psychrobacter_unclassified">
+ <magnitude><val>0.06983</val></magnitude>
+ </node>
+ </node>
+ <node name="Acinetobacter">
+ <magnitude><val>3.6828</val></magnitude>
+ <node name="Acinetobacter_unclassified">
+ <magnitude><val>3.36356</val></magnitude>
+ </node>
+ <node name="Acinetobacter_lwoffii">
+ <magnitude><val>0.10669</val></magnitude>
+ </node>
+ <node name="Acinetobacter_johnsonii">
+ <magnitude><val>0.21255</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Firmicutes">
+ <magnitude><val>11.22683</val></magnitude>
+ <node name="Bacilli">
+ <magnitude><val>11.22683</val></magnitude>
+ <node name="Bacillales">
+ <magnitude><val>11.22683</val></magnitude>
+ <node name="Bacillaceae">
+ <magnitude><val>0.01886</val></magnitude>
+ <node name="Bacillus">
+ <magnitude><val>0.01886</val></magnitude>
+ <node name="Bacillus_pumilus">
+ <magnitude><val>0.01886</val></magnitude>
+ </node>
+ </node>
+ </node>
+ <node name="Bacillales_uncl">
+ <magnitude><val>11.20797</val></magnitude>
+ <node name="Exiguobacterium">
+ <magnitude><val>11.20797</val></magnitude>
+ <node name="Exiguobacterium_sibiricum">
+ <magnitude><val>11.20797</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ </node>
+ <node name="Chloroflexi">
+ <magnitude><val>0.01187</val></magnitude>
+ <node name="Thermomicrobia">
+ <magnitude><val>0.01187</val></magnitude>
+ <node name="Thermomicrobia_unclassified">
+ <magnitude><val>0.01187</val></magnitude>
+ </node>
+ </node>
+ </node>
+ </node>
+</node>
+ </krona>
+</div></body></html>
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_M.0182896.all.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_M.0182896.all.txt
new file mode 100644
index 0000000..d68a640
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_M.0182896.all.txt
@@ -0,0 +1,16 @@
+0.01263 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales Sphingobacteriaceae Sphingobacteriaceae_unclassified
+75.87271 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_unclassified
+15.0525 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pantoea Pantoea_unclassified
+6.12101 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_syringae
+1.56196 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_fluorescens
+1.2783 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pantoea Pantoea_vagans
+0.03877 Bacteria Actinobacteria Actinobacteria Actinomycetales Kineosporiaceae Kineococcus Kineococcus_radiotolerans
+0.03042 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylobacteriaceae Methylobacterium Methylobacterium_unclassified
+0.00931 Bacteria Proteobacteria Gammaproteobacteria Chromatiales Ectothiorhodospiraceae Thioalkalivibrio Thioalkalivibrio_unclassified
+0.00583 Bacteria Actinobacteria Actinobacteria Actinomycetales Mycobacteriaceae Mycobacterium Mycobacterium_unclassified
+0.00472 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pantoea Pantoea_ananatis
+0.00399 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Erwinia Erwinia_unclassified
+0.00332 Bacteria Proteobacteria Betaproteobacteria Burkholderiales Burkholderiaceae Burkholderia Burkholderia_unclassified
+0.00273 Bacteria Actinobacteria Actinobacteria Actinomycetales Microbacteriaceae Clavibacter Clavibacter_michiganensis
+0.00141 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_savastanoi
+0.00037 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Erwinia Erwinia_billingiae
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1845A.all.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1845A.all.txt
new file mode 100644
index 0000000..51e7906
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1845A.all.txt
@@ -0,0 +1,22 @@
+5.29625 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Aurantimonadaceae Aurantimonadaceae_unclassified
+1.48299 Bacteria Proteobacteria Alphaproteobacteria Sphingomonadales Erythrobacteraceae Erythrobacteraceae_unclassified
+0.92557 Bacteria Proteobacteria Alphaproteobacteria Rhodospirillales Acetobacteraceae Acetobacteraceae_unclassified
+0.80038 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylocystaceae Methylocystaceae_unclassified
+26.96627 Bacteria Actinobacteria Actinobacteria Actinomycetales Brevibacteriaceae Brevibacterium Brevibacterium_linens
+19.98936 Bacteria Actinobacteria Actinobacteria Coriobacteriales Coriobacteriaceae Collinsella Collinsella_intestinalis
+10.07859 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_unclassified
+10.00086 Bacteria Actinobacteria Actinobacteria Actinomycetales Micrococcaceae Arthrobacter Arthrobacter_unclassified
+5.3647 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylobacteriaceae Methylobacterium Methylobacterium_unclassified
+3.21437 Bacteria Actinobacteria Actinobacteria Actinomycetales Frankiaceae Frankia Frankia_unclassified
+3.18891 Bacteria Actinobacteria Actinobacteria Actinomycetales Mycobacteriaceae Mycobacterium Mycobacterium_unclassified
+2.84114 Bacteria Proteobacteria Betaproteobacteria Burkholderiales Oxalobacteraceae Candidatus_Zinderia Candidatus_Zinderia_unclassified
+2.13362 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Candidatus_Carsonella Candidatus_Carsonella_ruddii
+2.09858 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Escherichia Escherichia_coli
+1.20722 Bacteria Proteobacteria Alphaproteobacteria Caulobacterales Caulobacteraceae Caulobacter Caulobacter_unclassified
+1.17813 Bacteria Actinobacteria Actinobacteria Actinomycetales Kineosporiaceae Kineococcus Kineococcus_radiotolerans
+1.14119 Bacteria Bacteroidetes Flavobacteria Flavobacteriales Flavobacteriales_uncl Candidatus_Sulcia Candidatus_Sulcia_muelleri
+0.66482 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylobacteriaceae Methylobacterium Methylobacterium_radiotolerans
+0.63377 Bacteria Actinobacteria Actinobacteria Actinomycetales Micrococcaceae Arthrobacter Arthrobacter_phenanthrenivorans
+0.31543 Bacteria Actinobacteria Actinobacteria Actinomycetales Propionibacteriaceae Propionibacterium Propionibacterium_acnes
+0.28072 Bacteria Actinobacteria Actinobacteria Bifidobacteriales Bifidobacteriaceae Bifidobacterium Bifidobacterium_unclassified
+0.19714 Bacteria Actinobacteria Actinobacteria Actinomycetales Propionibacteriaceae Propionibacterium Propionibacterium_unclassified
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1845A.pdf b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1845A.pdf
new file mode 100644
index 0000000..d394e06
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1845A.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1889.all.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1889.all.txt
new file mode 100644
index 0000000..ba28051
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Krona_Pi1889.all.txt
@@ -0,0 +1,40 @@
+0.01187 Bacteria Chloroflexi Thermomicrobia Thermomicrobia_unclassified
+0.09311 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Aurantimonadaceae Aurantimonadaceae_unclassified
+0.07999 Bacteria Bacteroidetes Sphingobacteria Sphingobacteriales Sphingobacteriaceae Sphingobacteriaceae_unclassified
+0.01158 Bacteria Proteobacteria Alphaproteobacteria Rhodospirillales Acetobacteraceae Acetobacteraceae_unclassified
+69.25646 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_unclassified
+11.20797 Bacteria Firmicutes Bacilli Bacillales Bacillales_uncl Exiguobacterium Exiguobacterium_sibiricum
+4.36428 Bacteria Actinobacteria Actinobacteria Actinomycetales Micrococcaceae Arthrobacter Arthrobacter_arilaitensis
+4.02044 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pectobacterium Pectobacterium_unclassified
+3.36356 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Acinetobacter Acinetobacter_unclassified
+2.94967 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pantoea Pantoea_unclassified
+1.54588 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pectobacterium Pectobacterium_carotovorum
+1.02689 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Rahnella Rahnella_unclassified
+0.40575 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Erwinia Erwinia_unclassified
+0.31989 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_fluorescens
+0.24883 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pantoea Pantoea_vagans
+0.21255 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Acinetobacter Acinetobacter_johnsonii
+0.15187 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_syringae
+0.10669 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Acinetobacter Acinetobacter_lwoffii
+0.09967 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Enterobacter Enterobacter_unclassified
+0.09654 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylobacteriaceae Methylobacterium Methylobacterium_unclassified
+0.08361 Bacteria Actinobacteria Actinobacteria Actinomycetales Micrococcaceae Micrococcus Micrococcus_luteus
+0.06983 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Moraxellaceae Psychrobacter Psychrobacter_unclassified
+0.06938 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_putida
+0.02818 Bacteria Actinobacteria Actinobacteria Actinomycetales Propionibacteriaceae Propionibacterium Propionibacterium_acnes
+0.02516 Bacteria Proteobacteria Gammaproteobacteria Xanthomonadales Xanthomonadaceae Stenotrophomonas Stenotrophomonas_maltophilia
+0.02073 Bacteria Proteobacteria Alphaproteobacteria Rhizobiales Methylobacteriaceae Methylobacterium Methylobacterium_radiotolerans
+0.01895 Bacteria Actinobacteria Actinobacteria Actinomycetales Mycobacteriaceae Mycobacterium Mycobacterium_unclassified
+0.01886 Bacteria Firmicutes Bacilli Bacillales Bacillaceae Bacillus Bacillus_pumilus
+0.01755 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Erwinia Erwinia_billingiae
+0.01449 Bacteria Actinobacteria Actinobacteria Actinomycetales Sanguibacteraceae Sanguibacter Sanguibacter_keddieii
+0.01207 Bacteria Actinobacteria Actinobacteria Actinomycetales Frankiaceae Frankia Frankia_unclassified
+0.01106 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_mendocina
+0.00808 Bacteria Actinobacteria Actinobacteria Bifidobacteriales Bifidobacteriaceae Bifidobacterium Bifidobacterium_unclassified
+0.00794 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pantoea Pantoea_ananatis
+0.00791 Bacteria Proteobacteria Gammaproteobacteria Chromatiales Ectothiorhodospiraceae Thioalkalivibrio Thioalkalivibrio_unclassified
+0.00498 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Erwinia Erwinia_tasmaniensis
+0.00311 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Pectobacterium Pectobacterium_wasabiae
+0.00271 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Serratia Serratia_unclassified
+0.00126 Bacteria Proteobacteria Gammaproteobacteria Enterobacteriales Enterobacteriaceae Enterobacter Enterobacter_cloacae
+0.00066 Bacteria Proteobacteria Gammaproteobacteria Pseudomonadales Pseudomonadaceae Pseudomonas Pseudomonas_aeruginosa
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/M.0182896.all.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/M.0182896.all.txt
new file mode 100644
index 0000000..6fae74e
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/M.0182896.all.txt
@@ -0,0 +1,50 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 99.94004
+k__Bacteria|p__Actinobacteria 0.04733
+k__Bacteria|p__Bacteroidetes 0.01263
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 99.9063
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 0.04733
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.03042
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.01263
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria 0.00332
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 83.5571
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 16.33989
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 0.04733
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.03042
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.01263
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales 0.00931
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales 0.00332
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 83.5571
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 16.33989
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 0.03877
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.03042
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.01263
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae 0.00931
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.00583
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae 0.00332
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae 0.00273
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 83.5571
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 16.33552
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 0.03877
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.03042
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.01263
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio 0.00931
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.00583
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.00437
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia 0.00332
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae|g__Clavibacter 0.00273
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 75.87271
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 15.0525
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 6.12101
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 1.56196
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 1.2783
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 0.03877
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.03042
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio|s__Thioalkalivibrio_unclassified 0.00931
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.00583
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_ananatis 0.00472
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.00399
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia|s__Burkholderia_unclassified 0.00332
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae|g__Clavibacter|s__Clavibacter_michiganensis 0.00273
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_savastanoi 0.00141
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_billingiae 0.00037
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Pi1889.all.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Pi1889.all.txt
new file mode 100644
index 0000000..5623983
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/krona/Pi1889.all.txt
@@ -0,0 +1,100 @@
+k__Bacteria 100.0
+k__Bacteria|p__Proteobacteria 84.15166
+k__Bacteria|p__Firmicutes 11.22683
+k__Bacteria|p__Actinobacteria 4.52965
+k__Bacteria|p__Bacteroidetes 0.07999
+k__Bacteria|p__Chloroflexi 0.01187
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 83.92971
+k__Bacteria|p__Firmicutes|c__Bacilli 11.22683
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 4.52965
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.22195
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.07999
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia 0.01187
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 73.56195
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales 11.22683
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 10.33469
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 4.52158
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.21038
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.07999
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales 0.02516
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia|o__Thermomicrobia_unclassified 0.01187
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales 0.01158
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales 0.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 69.80932
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl 11.20797
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 10.33469
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 4.44789
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae 3.75263
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.11727
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 0.09311
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.07999
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 0.02818
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae 0.02516
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.01895
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae 0.01886
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Sanguibacteraceae 0.01449
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae 0.01207
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae 0.01158
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae 0.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 69.80932
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium 11.20797
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium 5.56944
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 4.36428
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter 3.6828
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 3.20645
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella 1.02689
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.42828
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.11727
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter 0.10093
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 0.09311
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus 0.08361
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.07999
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter 0.06983
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium 0.02818
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas 0.02516
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.01895
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus 0.01886
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Sanguibacteraceae|g__Sanguibacter 0.01449
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia 0.01207
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae|g__Acetobacteraceae_unclassified 0.01158
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium 0.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Serratia 0.00271
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 69.25646
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium|s__Exiguobacterium_sibiricum 11.20797
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_arilaitensis 4.36428
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_unclassified 4.02044
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_unclassified 3.36356
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 2.94967
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_carotovorum 1.54588
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella|s__Rahnella_unclassified 1.02689
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.40575
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 0.31989
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 0.24883
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_johnsonii 0.21255
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 0.15187
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_lwoffii 0.10669
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_unclassified 0.09967
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.09654
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus 0.08361
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter|s__Psychrobacter_unclassified 0.06983
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_putida 0.06938
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_acnes 0.02818
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas|s__Stenotrophomonas_maltophilia 0.02516
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_radiotolerans 0.02073
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.01895
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus|s__Bacillus_pumilus 0.01886
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_billingiae 0.01755
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Sanguibacteraceae|g__Sanguibacter|s__Sanguibacter_keddieii 0.01449
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia|s__Frankia_unclassified 0.01207
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_mendocina 0.01106
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_unclassified 0.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_ananatis 0.00794
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio|s__Thioalkalivibrio_unclassified 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_tasmaniensis 0.00498
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_wasabiae 0.00311
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Serratia|s__Serratia_unclassified 0.00271
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_cloacae 0.00126
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_aeruginosa 0.00066
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Barplot_class.pdf b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Barplot_class.pdf
new file mode 100644
index 0000000..58c78de
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Barplot_class.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Clustering_genus.pdf b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Clustering_genus.pdf
new file mode 100644
index 0000000..df1eb47
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Clustering_genus.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Distances_genus.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Distances_genus.txt
new file mode 100644
index 0000000..c54d333
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Distances_genus.txt
@@ -0,0 +1,10 @@
+M.0182896.NO.UDG M.0182896.UDG M.0182896.UDGa M.0182896.UDGb M.0182896.UDGc Pi1845A.id.CGCTAT Pi1889.id.CTTGTA Pi1889.id.GGCTAC Pi1889.id.TAGCTT
+0.000000000 0.004483200 0.020195300 0.004503900 0.028484500 0.899214110 0.261598600 0.266457237 0.265777887
+0.004483200 0.000000000 0.024678500 0.008987100 0.032967700 0.899214110 0.261598600 0.266457237 0.265777887
+0.020195300 0.024678500 0.000000000 0.015691400 0.008388600 0.898307910 0.261093700 0.266041037 0.265361608
+0.004503900 0.008987100 0.015691400 0.000000000 0.023980600 0.898930110 0.261598600 0.266457237 0.265777887
+0.028484500 0.032967700 0.008388600 0.023980600 0.000000000 0.898311210 0.260858300 0.265813437 0.265196277
+0.899214110 0.899214110 0.898307910 0.898930110 0.898311210 0.000000000 0.853885315 0.856513422 0.851802698
+0.261598600 0.261598600 0.261093700 0.261598600 0.260858300 0.853885315 0.000000000 0.009303650 0.009080104
+0.266457237 0.266457237 0.266041037 0.266457237 0.265813437 0.856513422 0.009303650 0.000000000 0.010187662
+0.265777887 0.265777887 0.265361608 0.265777887 0.265196277 0.851802698 0.009080104 0.010187662 0.000000000
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Diversity_genus.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Diversity_genus.txt
new file mode 100644
index 0000000..32f19b6
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Diversity_genus.txt
@@ -0,0 +1,9 @@
+"M.0182896.NO.UDG" 0.408899548817149
+"M.0182896.UDG" 0.400758726494359
+"M.0182896.UDGa" 0.450048541093465
+"M.0182896.UDGb" 0.418969892590388
+"M.0182896.UDGc" 0.46520912982916
+"Pi1845A.id.CGCTAT" 2.29916013722066
+"Pi1889.id.CTTGTA" 1.11647899599897
+"Pi1889.id.GGCTAC" 1.11412227336574
+"Pi1889.id.TAGCTT" 1.13380402565392
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Heatmap_genus.pdf b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Heatmap_genus.pdf
new file mode 100644
index 0000000..5bff484
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Heatmap_genus.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/PCA_genus.pdf b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/PCA_genus.pdf
new file mode 100644
index 0000000..5ae87ef
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/PCA_genus.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/PCOA_genus.pdf b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/PCOA_genus.pdf
new file mode 100644
index 0000000..de95f0d
Binary files /dev/null and b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/PCOA_genus.pdf differ
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Potato_merged.csv b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Potato_merged.csv
new file mode 100644
index 0000000..97a022a
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Potato_merged.csv
@@ -0,0 +1,126 @@
+ID M.0182896.NO.UDG M.0182896.UDG M.0182896.UDGa M.0182896.UDGb M.0182896.UDGc Pi1845A.id.CGCTAT Pi1889.id.CTTGTA Pi1889.id.GGCTAC Pi1889.id.TAGCTT
+k__Bacteria 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0 100.0
+k__Bacteria|p__Actinobacteria 0.0 0.0 0.04899 0.0284 0.03984 65.96496 4.45534 4.20686 4.47192
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 0.0 0.0 0.04899 0.0284 0.03984 65.96496 4.45534 4.20686 4.47192
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 0.0 0.0 0.04899 0.0284 0.03984 45.69488 4.45534 4.20686 4.45915
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae 0.0 0.0 0.0 0.0 0.0 26.96627 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae|g__Brevibacterium 0.0 0.0 0.0 0.0 0.0 26.96627 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae|g__Brevibacterium|s__Brevibacterium_linens 0.0 0.0 0.0 0.0 0.0 26.96627 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae 0.0 0.0 0.0 0.0 0.0 3.21437 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia 0.0 0.0 0.0 0.0 0.0 3.21437 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia|s__Frankia_unclassified 0.0 0.0 0.0 0.0 0.0 3.21437 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 0.0 0.0 0.04013 0.0284 0.03019 1.17813 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 0.0 0.0 0.04013 0.0284 0.03019 1.17813 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 0.0 0.0 0.04013 0.0284 0.03019 1.17813 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 0.0 0.0 0.0 0.0 0.0 10.63463 4.39765 4.1946 4.45241
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 0.0 0.0 0.0 0.0 0.0 10.63463 4.31343 4.14533 4.39643
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_arilaitensis 0.0 0.0 0.0 0.0 0.0 0.0 4.31343 4.14533 4.39643
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_phenanthrenivorans 0.0 0.0 0.0 0.0 0.0 0.63377 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_unclassified 0.0 0.0 0.0 0.0 0.0 10.00086 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus 0.0 0.0 0.0 0.0 0.0 0.0 0.08422 0.04927 0.05598
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus 0.0 0.0 0.0 0.0 0.0 0.0 0.08422 0.04927 0.05598
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.0 0.0 0.00887 0.0 0.00965 3.18891 0.05769 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.0 0.0 0.00887 0.0 0.00965 3.18891 0.05769 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.0 0.0 0.00887 0.0 0.00965 3.18891 0.05769 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 0.0 0.0 0.0 0.0 0.0 0.51258 0.0 0.01226 0.00674
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium 0.0 0.0 0.0 0.0 0.0 0.51258 0.0 0.01226 0.00674
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_acnes 0.0 0.0 0.0 0.0 0.0 0.31543 0.0 0.01226 0.00674
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_unclassified 0.0 0.0 0.0 0.0 0.0 0.19714 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales 0.0 0.0 0.0 0.0 0.0 0.28072 0.0 0.0 0.01277
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae 0.0 0.0 0.0 0.0 0.0 0.28072 0.0 0.0 0.01277
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium 0.0 0.0 0.0 0.0 0.0 0.28072 0.0 0.0 0.01277
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_unclassified 0.0 0.0 0.0 0.0 0.0 0.28072 0.0 0.0 0.01277
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales 0.0 0.0 0.0 0.0 0.0 19.98936 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae 0.0 0.0 0.0 0.0 0.0 19.98936 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella 0.0 0.0 0.0 0.0 0.0 19.98936 0.0 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_intestinalis 0.0 0.0 0.0 0.0 0.0 19.98936 0.0 0.0 0.0
+k__Bacteria|p__Bacteroidetes 0.0 0.0 0.0 0.0 0.01393 1.14119 0.04059 0.14525 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria 0.0 0.0 0.0 0.0 0.0 1.14119 0.0 0.0 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales 0.0 0.0 0.0 0.0 0.0 1.14119 0.0 0.0 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl 0.0 0.0 0.0 0.0 0.0 1.14119 0.0 0.0 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl|g__Candidatus_Sulcia 0.0 0.0 0.0 0.0 0.0 1.14119 0.0 0.0 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl|g__Candidatus_Sulcia|s__Candidatus_Sulcia_muelleri 0.0 0.0 0.0 0.0 0.0 1.14119 0.0 0.0 0.0
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.0 0.0 0.0 0.0 0.01393 0.0 0.04059 0.14525 0.0
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.0 0.0 0.0 0.0 0.01393 0.0 0.04059 0.14525 0.0
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.0 0.0 0.0 0.0 0.01393 0.0 0.04059 0.14525 0.0
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.0 0.0 0.0 0.0 0.01393 0.0 0.04059 0.14525 0.0
+k__Bacteria|p__Chloroflexi 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.03753
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.03753
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia|o__Thermomicrobia_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.03753
+k__Bacteria|p__Firmicutes 0.0 0.0 0.0 0.0 0.0 0.0 11.00357 11.52692 10.9544
+k__Bacteria|p__Firmicutes|c__Bacilli 0.0 0.0 0.0 0.0 0.0 0.0 11.00357 11.52692 10.9544
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales 0.0 0.0 0.0 0.0 0.0 0.0 11.00357 11.52692 10.9544
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl 0.0 0.0 0.0 0.0 0.0 0.0 11.00357 11.52692 10.9544
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium 0.0 0.0 0.0 0.0 0.0 0.0 11.00357 11.52692 10.9544
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium|s__Exiguobacterium_sibiricum 0.0 0.0 0.0 0.0 0.0 0.0 11.00357 11.52692 10.9544
+k__Bacteria|p__Proteobacteria 100.0 100.0 99.95101 99.9716 99.94623 32.89385 84.5005 84.12097 84.53615
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.0 0.0 0.04162 0.0 0.05045 15.74193 0.16176 0.11248 0.32243
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales 0.0 0.0 0.0 0.0 0.0 1.20722 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae 0.0 0.0 0.0 0.0 0.0 1.20722 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae|g__Caulobacter 0.0 0.0 0.0 0.0 0.0 1.20722 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae|g__Caulobacter|s__Caulobacter_unclassified 0.0 0.0 0.0 0.0 0.0 1.20722 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.0 0.0 0.04162 0.0 0.05045 12.12615 0.16176 0.11248 0.32243
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 0.0 0.0 0.0 0.0 0.0 5.29625 0.07227 0.02813 0.19676
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 0.0 0.0 0.0 0.0 0.0 5.29625 0.07227 0.02813 0.19676
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.0 0.0 0.04162 0.0 0.05045 6.02952 0.08949 0.08435 0.12566
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.0 0.0 0.04162 0.0 0.05045 6.02952 0.08949 0.08435 0.12566
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_radiotolerans 0.0 0.0 0.0 0.0 0.0 0.66482 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.0 0.0 0.04162 0.0 0.05045 5.3647 0.08949 0.08435 0.12566
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylocystaceae 0.0 0.0 0.0 0.0 0.0 0.80038 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylocystaceae|g__Methylocystaceae_unclassified 0.0 0.0 0.0 0.0 0.0 0.80038 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales 0.0 0.0 0.0 0.0 0.0 0.92557 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae 0.0 0.0 0.0 0.0 0.0 0.92557 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae|g__Acetobacteraceae_unclassified 0.0 0.0 0.0 0.0 0.0 0.92557 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales 0.0 0.0 0.0 0.0 0.0 1.48299 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales|f__Erythrobacteraceae 0.0 0.0 0.0 0.0 0.0 1.48299 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales|f__Erythrobacteraceae|g__Erythrobacteraceae_unclassified 0.0 0.0 0.0 0.0 0.0 1.48299 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria 0.0 0.0 0.0 0.0 0.0 2.84114 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales 0.0 0.0 0.0 0.0 0.0 2.84114 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae 0.0 0.0 0.0 0.0 0.0 2.84114 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae|g__Candidatus_Zinderia 0.0 0.0 0.0 0.0 0.0 2.84114 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae|g__Candidatus_Zinderia|s__Candidatus_Zinderia_unclassified 0.0 0.0 0.0 0.0 0.0 2.84114 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 100.0 100.0 99.90939 99.9716 99.89578 14.31079 84.33874 84.0085 84.21373
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales 0.0 0.0 0.0 0.0 0.0077 0.0 0.0 0.0 0.02397
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae 0.0 0.0 0.0 0.0 0.0077 0.0 0.0 0.0 0.02397
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio 0.0 0.0 0.0 0.0 0.0077 0.0 0.0 0.0 0.02397
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio|s__Thioalkalivibrio_unclassified 0.0 0.0 0.0 0.0 0.0077 0.0 0.0 0.0 0.02397
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 14.21791 13.76959 16.14682 14.6399 16.95444 4.2322 10.43258 10.15055 10.2167
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 14.21791 13.76959 16.14682 14.6399 16.95444 4.2322 10.43258 10.15055 10.2167
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Candidatus_Carsonella 0.0 0.0 0.0 0.0 0.0 2.13362 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii 0.0 0.0 0.0 0.0 0.0 2.13362 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter 0.0 0.0 0.0 0.0 0.0 0.0 0.05956 0.0 0.14384
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 0.05956 0.0 0.14384
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.0 0.0 0.0 0.0 0.0 0.0 0.42319 0.39718 0.46413
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_billingiae 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00506 0.00561
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_tasmaniensis 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00474 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 0.42319 0.38739 0.45852
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia 0.0 0.0 0.0 0.0 0.0 2.09858 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli 0.0 0.0 0.0 0.0 0.0 2.09858 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 14.21791 13.76959 16.14682 14.6399 16.95444 0.0 3.37568 3.15824 3.25615
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_ananatis 0.0 0.0 0.00094 0.00263 0.0 0.0 0.0 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 13.464 12.716 14.88738 13.49337 15.67433 0.0 3.13912 2.9336 3.05066
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 0.75391 1.05359 1.25851 1.1439 1.28011 0.0 0.23656 0.22464 0.20548
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium 0.0 0.0 0.0 0.0 0.0 0.0 5.6553 5.61134 5.3445
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_carotovorum 0.0 0.0 0.0 0.0 0.0 0.0 1.47407 1.40753 1.54711
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 4.18123 4.20381 3.79739
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella 0.0 0.0 0.0 0.0 0.0 0.0 0.91885 0.98379 1.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella|s__Rahnella_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 0.91885 0.98379 1.00808
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 85.78209 86.23041 83.76256 85.3317 82.93364 10.07859 73.90616 73.85794 73.96409
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae 0.0 0.0 0.0 0.0 0.0 0.0 3.4417 3.66191 3.81181
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter 0.0 0.0 0.0 0.0 0.0 0.0 3.38248 3.60764 3.761
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_johnsonii 0.0 0.0 0.0 0.0 0.0 0.0 0.17158 0.20383 0.1717
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_lwoffii 0.0 0.0 0.0 0.0 0.0 0.0 0.07895 0.09253 0.07815
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 3.13194 3.31128 3.51115
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter 0.0 0.0 0.0 0.0 0.0 0.0 0.05922 0.05427 0.05081
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter|s__Psychrobacter_unclassified 0.0 0.0 0.0 0.0 0.0 0.0 0.05922 0.05427 0.05081
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 85.78209 86.23041 83.76256 85.3317 82.93364 10.07859 70.46446 70.19604 70.15228
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 85.78209 86.23041 83.76256 85.3317 82.93364 10.07859 70.46446 70.19604 70.15228
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 2.08192 1.42328 1.31223 1.54634 1.59558 0.0 0.20104 0.37799 0.2438
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_mendocina 0.0 0.0 0.0 0.0 0.0 0.0 0.00335 0.00471 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_putida 0.0 0.0 0.0 0.0 0.0 0.0 0.05879 0.04539 0.0707
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 7.46662 5.48006 5.96167 5.49172 6.12355 0.0 0.06906 0.14251 0.08718
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 76.23356 79.32706 76.48866 78.29363 75.21452 10.07859 70.13223 69.62544 69.7506
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00896
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00896
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00896
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas|s__Stenotrophomonas_maltophilia 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0 0.00896
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Potato_merged_profiles.csv b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Potato_merged_profiles.csv
new file mode 100644
index 0000000..26a15f9
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Potato_merged_profiles.csv
@@ -0,0 +1,144 @@
+ID M.0182896.all Pi1845A.id.CGCTAT Pi1889.all
+k__Bacteria 100.0 100.0 100.0
+k__Bacteria|p__Actinobacteria 0.04733 65.96496 4.52965
+k__Bacteria|p__Actinobacteria|c__Actinobacteria 0.04733 65.96496 4.52965
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales 0.04733 45.69488 4.52158
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae 0.0 26.96627 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae|g__Brevibacterium 0.0 26.96627 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Brevibacteriaceae|g__Brevibacterium|s__Brevibacterium_linens 0.0 26.96627 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae 0.0 3.21437 0.01207
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia 0.0 3.21437 0.01207
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Frankiaceae|g__Frankia|s__Frankia_unclassified 0.0 3.21437 0.01207
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae 0.03877 1.17813 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus 0.03877 1.17813 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Kineosporiaceae|g__Kineococcus|s__Kineococcus_radiotolerans 0.03877 1.17813 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae 0.00273 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae|g__Clavibacter 0.00273 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Microbacteriaceae|g__Clavibacter|s__Clavibacter_michiganensis 0.00273 0.0 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae 0.0 10.63463 4.44789
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter 0.0 10.63463 4.36428
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_arilaitensis 0.0 0.0 4.36428
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_phenanthrenivorans 0.0 0.63377 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Arthrobacter|s__Arthrobacter_unclassified 0.0 10.00086 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus 0.0 0.0 0.08361
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Micrococcaceae|g__Micrococcus|s__Micrococcus_luteus 0.0 0.0 0.08361
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae 0.00583 3.18891 0.01895
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium 0.00583 3.18891 0.01895
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Mycobacteriaceae|g__Mycobacterium|s__Mycobacterium_unclassified 0.00583 3.18891 0.01895
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae 0.0 0.51258 0.02818
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium 0.0 0.51258 0.02818
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_acnes 0.0 0.31543 0.02818
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Propionibacteriaceae|g__Propionibacterium|s__Propionibacterium_unclassified 0.0 0.19714 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Sanguibacteraceae 0.0 0.0 0.01449
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Sanguibacteraceae|g__Sanguibacter 0.0 0.0 0.01449
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Actinomycetales|f__Sanguibacteraceae|g__Sanguibacter|s__Sanguibacter_keddieii 0.0 0.0 0.01449
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales 0.0 0.28072 0.00808
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae 0.0 0.28072 0.00808
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium 0.0 0.28072 0.00808
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Bifidobacteriales|f__Bifidobacteriaceae|g__Bifidobacterium|s__Bifidobacterium_unclassified 0.0 0.28072 0.00808
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales 0.0 19.98936 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae 0.0 19.98936 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella 0.0 19.98936 0.0
+k__Bacteria|p__Actinobacteria|c__Actinobacteria|o__Coriobacteriales|f__Coriobacteriaceae|g__Collinsella|s__Collinsella_intestinalis 0.0 19.98936 0.0
+k__Bacteria|p__Bacteroidetes 0.01263 1.14119 0.07999
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria 0.0 1.14119 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales 0.0 1.14119 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl 0.0 1.14119 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl|g__Candidatus_Sulcia 0.0 1.14119 0.0
+k__Bacteria|p__Bacteroidetes|c__Flavobacteria|o__Flavobacteriales|f__Flavobacteriales_uncl|g__Candidatus_Sulcia|s__Candidatus_Sulcia_muelleri 0.0 1.14119 0.0
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria 0.01263 0.0 0.07999
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales 0.01263 0.0 0.07999
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae 0.01263 0.0 0.07999
+k__Bacteria|p__Bacteroidetes|c__Sphingobacteria|o__Sphingobacteriales|f__Sphingobacteriaceae|g__Sphingobacteriaceae_unclassified 0.01263 0.0 0.07999
+k__Bacteria|p__Chloroflexi 0.0 0.0 0.01187
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia 0.0 0.0 0.01187
+k__Bacteria|p__Chloroflexi|c__Thermomicrobia|o__Thermomicrobia_unclassified 0.0 0.0 0.01187
+k__Bacteria|p__Firmicutes 0.0 0.0 11.22683
+k__Bacteria|p__Firmicutes|c__Bacilli 0.0 0.0 11.22683
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales 0.0 0.0 11.22683
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae 0.0 0.0 0.01886
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus 0.0 0.0 0.01886
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillaceae|g__Bacillus|s__Bacillus_pumilus 0.0 0.0 0.01886
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl 0.0 0.0 11.20797
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium 0.0 0.0 11.20797
+k__Bacteria|p__Firmicutes|c__Bacilli|o__Bacillales|f__Bacillales_uncl|g__Exiguobacterium|s__Exiguobacterium_sibiricum 0.0 0.0 11.20797
+k__Bacteria|p__Proteobacteria 99.94004 32.89385 84.15166
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria 0.03042 15.74193 0.22195
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales 0.0 1.20722 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae 0.0 1.20722 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae|g__Caulobacter 0.0 1.20722 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Caulobacterales|f__Caulobacteraceae|g__Caulobacter|s__Caulobacter_unclassified 0.0 1.20722 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales 0.03042 12.12615 0.21038
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae 0.0 5.29625 0.09311
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Aurantimonadaceae|g__Aurantimonadaceae_unclassified 0.0 5.29625 0.09311
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae 0.03042 6.02952 0.11727
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium 0.03042 6.02952 0.11727
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_radiotolerans 0.0 0.66482 0.02073
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylobacteriaceae|g__Methylobacterium|s__Methylobacterium_unclassified 0.03042 5.3647 0.09654
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylocystaceae 0.0 0.80038 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhizobiales|f__Methylocystaceae|g__Methylocystaceae_unclassified 0.0 0.80038 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales 0.0 0.92557 0.01158
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae 0.0 0.92557 0.01158
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Rhodospirillales|f__Acetobacteraceae|g__Acetobacteraceae_unclassified 0.0 0.92557 0.01158
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales 0.0 1.48299 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales|f__Erythrobacteraceae 0.0 1.48299 0.0
+k__Bacteria|p__Proteobacteria|c__Alphaproteobacteria|o__Sphingomonadales|f__Erythrobacteraceae|g__Erythrobacteraceae_unclassified 0.0 1.48299 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria 0.00332 2.84114 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales 0.00332 2.84114 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae 0.00332 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia 0.00332 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Burkholderiaceae|g__Burkholderia|s__Burkholderia_unclassified 0.00332 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae 0.0 2.84114 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae|g__Candidatus_Zinderia 0.0 2.84114 0.0
+k__Bacteria|p__Proteobacteria|c__Betaproteobacteria|o__Burkholderiales|f__Oxalobacteraceae|g__Candidatus_Zinderia|s__Candidatus_Zinderia_unclassified 0.0 2.84114 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria 99.9063 14.31079 83.92971
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales 0.00931 0.0 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae 0.00931 0.0 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio 0.00931 0.0 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Chromatiales|f__Ectothiorhodospiraceae|g__Thioalkalivibrio|s__Thioalkalivibrio_unclassified 0.00931 0.0 0.00791
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales 16.33989 4.2322 10.33469
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae 16.33989 4.2322 10.33469
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Candidatus_Carsonella 0.0 2.13362 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Candidatus_Carsonella|s__Candidatus_Carsonella_ruddii 0.0 2.13362 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter 0.0 0.0 0.10093
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_cloacae 0.0 0.0 0.00126
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Enterobacter|s__Enterobacter_unclassified 0.0 0.0 0.09967
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia 0.00437 0.0 0.42828
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_billingiae 0.00037 0.0 0.01755
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_tasmaniensis 0.0 0.0 0.00498
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Erwinia|s__Erwinia_unclassified 0.00399 0.0 0.40575
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia 0.0 2.09858 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Escherichia|s__Escherichia_coli 0.0 2.09858 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea 16.33552 0.0 3.20645
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_ananatis 0.00472 0.0 0.00794
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_unclassified 15.0525 0.0 2.94967
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pantoea|s__Pantoea_vagans 1.2783 0.0 0.24883
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium 0.0 0.0 5.56944
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_carotovorum 0.0 0.0 1.54588
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_unclassified 0.0 0.0 4.02044
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Pectobacterium|s__Pectobacterium_wasabiae 0.0 0.0 0.00311
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella 0.0 0.0 1.02689
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Rahnella|s__Rahnella_unclassified 0.0 0.0 1.02689
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Serratia 0.0 0.0 0.00271
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Enterobacteriales|f__Enterobacteriaceae|g__Serratia|s__Serratia_unclassified 0.0 0.0 0.00271
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales 83.5571 10.07859 73.56195
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae 0.0 0.0 3.75263
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter 0.0 0.0 3.6828
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_johnsonii 0.0 0.0 0.21255
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_lwoffii 0.0 0.0 0.10669
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Acinetobacter|s__Acinetobacter_unclassified 0.0 0.0 3.36356
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter 0.0 0.0 0.06983
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Moraxellaceae|g__Psychrobacter|s__Psychrobacter_unclassified 0.0 0.0 0.06983
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae 83.5571 10.07859 69.80932
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas 83.5571 10.07859 69.80932
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_aeruginosa 0.0 0.0 0.00066
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_fluorescens 1.56196 0.0 0.31989
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_mendocina 0.0 0.0 0.01106
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_putida 0.0 0.0 0.06938
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_savastanoi 0.00141 0.0 0.0
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_syringae 6.12101 0.0 0.15187
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Pseudomonadales|f__Pseudomonadaceae|g__Pseudomonas|s__Pseudomonas_unclassified 75.87271 10.07859 69.25646
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales 0.0 0.0 0.02516
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae 0.0 0.0 0.02516
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas 0.0 0.0 0.02516
+k__Bacteria|p__Proteobacteria|c__Gammaproteobacteria|o__Xanthomonadales|f__Xanthomonadaceae|g__Stenotrophomonas|s__Stenotrophomonas_maltophilia 0.0 0.0 0.02516
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Relative_abundances_genus.csv b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Relative_abundances_genus.csv
new file mode 100644
index 0000000..0ecbf29
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Relative_abundances_genus.csv
@@ -0,0 +1,32 @@
+"","M.0182896.NO.UDG","M.0182896.UDG","M.0182896.UDGa","M.0182896.UDGb","M.0182896.UDGc","Pi1845A.id.CGCTAT","Pi1889.id.CTTGTA","Pi1889.id.GGCTAC","Pi1889.id.TAGCTT"
+"Brevibacterium",0,0,0,0,0,26.96627,0,0,0
+"Frankia",0,0,0,0,0,3.21437,0,0,0
+"Kineococcus",0,0,0.04013,0.0284,0.03019,1.17813,0,0,0
+"Arthrobacter",0,0,0,0,0,10.63463,4.31343,4.14533,4.39643
+"Micrococcus",0,0,0,0,0,0,0.08422,0.04927,0.05598
+"Mycobacterium",0,0,0.00887,0,0.00965,3.18891,0.05769,0,0
+"Propionibacterium",0,0,0,0,0,0.51258,0,0.01226,0.00674
+"Bifidobacterium",0,0,0,0,0,0.28072,0,0,0.01277
+"Collinsella",0,0,0,0,0,19.98936,0,0,0
+"Candidatus_Sulcia",0,0,0,0,0,1.14119,0,0,0
+"Sphingobacteriaceae_unclassified",0,0,0,0,0.01393,0,0.04059,0.14525,0
+"Exiguobacterium",0,0,0,0,0,0,11.00357,11.52692,10.9544
+"Caulobacter",0,0,0,0,0,1.20722,0,0,0
+"Aurantimonadaceae_unclassified",0,0,0,0,0,5.29625,0.07227,0.02813,0.19676
+"Methylobacterium",0,0,0.04162,0,0.05045,6.02952,0.08949,0.08435,0.12566
+"Methylocystaceae_unclassified",0,0,0,0,0,0.80038,0,0,0
+"Acetobacteraceae_unclassified",0,0,0,0,0,0.92557,0,0,0
+"Erythrobacteraceae_unclassified",0,0,0,0,0,1.48299,0,0,0
+"Candidatus_Zinderia",0,0,0,0,0,2.84114,0,0,0
+"Thioalkalivibrio",0,0,0,0,0.0077,0,0,0,0.02397
+"Candidatus_Carsonella",0,0,0,0,0,2.13362,0,0,0
+"Enterobacter",0,0,0,0,0,0,0.05956,0,0.14384
+"Erwinia",0,0,0,0,0,0,0.42319,0.39718,0.46413
+"Escherichia",0,0,0,0,0,2.09858,0,0,0
+"Pantoea",14.21791,13.76959,16.14682,14.6399,16.95444,0,3.37568,3.15824,3.25615
+"Pectobacterium",0,0,0,0,0,0,5.6553,5.61134,5.3445
+"Rahnella",0,0,0,0,0,0,0.91885,0.98379,1.00808
+"Acinetobacter",0,0,0,0,0,0,3.38248,3.60764,3.761
+"Psychrobacter",0,0,0,0,0,0,0.05922,0.05427,0.05081
+"Pseudomonas",85.78209,86.23041,83.76256,85.3317,82.93364,10.07859,70.46446,70.19604,70.15228
+"Stenotrophomonas",0,0,0,0,0,0,0,0,0.00896
diff --git a/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Taxon_count_genus.txt b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Taxon_count_genus.txt
new file mode 100644
index 0000000..a8ea925
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/example_results/profiling/results/Taxon_count_genus.txt
@@ -0,0 +1,9 @@
+"M.0182896.NO.UDG" 2
+"M.0182896.UDG" 2
+"M.0182896.UDGa" 5
+"M.0182896.UDGb" 3
+"M.0182896.UDGc" 7
+"Pi1845A.id.CGCTAT" 19
+"Pi1889.id.CTTGTA" 15
+"Pi1889.id.GGCTAC" 14
+"Pi1889.id.TAGCTT" 17
diff --git a/paleomix/resources/examples/nature_protocols/phylogeny/000_makefile.yaml b/paleomix/resources/examples/nature_protocols/phylogeny/000_makefile.yaml
new file mode 100644
index 0000000..31e3141
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/phylogeny/000_makefile.yaml
@@ -0,0 +1,178 @@
+# -*- mode: Yaml; -*-
+Project:
+ Title: P_infestans
+
+ # List of samples to be included in the analytical steps, which may be
+ # grouped using any arbitrary number of (levels of) groups. (Sub)groups
+ # are not required, but may be used instead of listing individual samples
+ # in 'ExcludeSamples' and 'FilterSingletons'.
+ Samples:
+ 06_3928A:
+ Gender: NA
+ DDR7602:
+ Gender: NA
+ LBUS5:
+ Gender: NA
+ M-0182896:
+ Gender: NA
+ NL07434:
+ Gender: NA
+ P13527:
+ Gender: NA
+ P13626:
+ Gender: NA
+ P17777:
+ Gender: NA
+ Pi1845A:
+ Gender: NA
+ Pi1889:
+ Gender: NA
+
+ # Specifies a set of regions of interest, each representing one or more
+ # named regions in a reference sequence (e.g. genes) in BED format.
+ RegionsOfInterest:
+ Ensembl.v20.protein_coding.CDS:
+ # Name of the prefix; is expected to correspond to the filename
+ # of the FASTA file without the extension / the name of the
+ # prefix used in the BAM pipeline.
+ Prefix: Pi_nucl
+ # If true, BAM files are expected to have the postfix ".realigned";
+ # allows easier interoperability with the BAM pipeline.
+ Realigned: yes
+ # Specifies whether or not the sequences are protein coding; if true
+ # indels are only included in the final sequence if the length is
+ # divisible by 3.
+ ProteinCoding: yes
+ # Do not include indels in final sequence; note that indels are still
+ # called, and used to filter SNPs. Requires that
+ # 'MultipleSequenceAlignment' is enabled
+ IncludeIndels: yes
+ # List of contigs for which heterozygous SNPs should be filtered
+ # (site set to 'N'); e.g. chrX for 'Male' humans, or chrM, etc.
+ HomozygousContigs:
+ NA: []
+
+ # Filter sites in a sample, replacing any nucleotide not observed
+ # in the specified list of samples or groups with 'N'.
+# FilterSingletons:
+# NAME_OF_SAMPLE:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+
+Genotyping:
+ # Default settings for all regions of interest
+ Defaults:
+ # Regions of interest are expanded by this number of bases when calling
+ # SNPs, in order to ensure that adjacent indels can be used during filtering
+ # (VCF_filter --min-distance-to-indels and --min-distance-between-indels).
+ # The final sequences does not include the padding.
+ Padding: 10
+
+ # Settings for genotyping by random sampling of nucleotides at each site
+ Random:
+ # Min distance of variants to indels
+ --min-distance-to-indels: 2
+
+ MPileup:
+ -E: # extended BAQ for higher sensitivity but lower specificity
+ -A: # count anomalous read pairs
+
+ BCFTools:
+ -g: # Call genotypes at variant sites
+
+ VCF_Filter:
+ # Maximum coverage acceptable for genotyping calls
+ # If zero, the default vcf_filter value is used
+ MaxReadDepth:
+ 06_3928A: 110
+ DDR7602: 73
+ LBUS5: 85
+ M-0182896: 93
+ NL07434: 133
+ P13527: 87
+ P13626: 117
+ P17777: 114
+ Pi1845A: 17
+ Pi1889: 41
+
+ # Minimum coverage acceptable for genotyping calls
+ --min-read-depth: 6
+ # Min RMS mapping quality
+ --min-mapping-quality: 10
+ # Min QUAL score (Phred) for genotyping calls
+ --min-quality: 20
+ # Min distance of variants to indels
+ --min-distance-to-indels: 2
+ # Min distance between indels
+ --min-distance-between-indels: 10
+ # Min P-value for strand bias (given PV4)
+ --min-strand-bias: 1.0e-4
+ # Min P-value for baseQ bias (given PV4)
+ --min-baseq-bias: 1.0e-4
+ # Min P-value for mapQ bias (given PV4)
+ --min-mapq-bias: 1.0e-4
+ # Min P-value for end distance bias (given PV4)
+ --min-end-distance-bias: 1.0e-4
+ # Max frequency of the major allele at heterozygous sites
+ --min-allele-frequency: 0.2
+ # Minimum number of alternative bases observed for variants
+ --min-num-alt-bases: 2
+
+# Add / overwrite default settings for a set of regions
+# NAME_OF_REGIONS:
+# ...
+
+
+MultipleSequenceAlignment:
+ # Default settings for all regions of interest
+ Defaults:
+ Enabled: yes
+
+ # Multiple sequence alignment using MAFFT
+ MAFFT:
+ # Select alignment algorithm; valid values are 'mafft', 'auto', 'fft-ns-1',
+ # 'fft-ns-2', 'fft-ns-i', 'nw-ns-i', 'l-ins-i', 'e-ins-i', and 'g-ins-i'.
+ Algorithm: G-INS-i
+
+ # Parameters for mafft algorithm; see above for example of how to specify
+ --maxiterate: 1000
+
+# Add / overwrite default settings for a set of regions
+# NAME_OF_REGIONS:
+# ...
+
+
+PhylogeneticInference:
+ HighlyCoveredGenes:
+ # Root the final tree(s) on one or more samples; if no samples
+ # are specified, the tree(s) will be rooted on the midpoint(s)
+ RootTreesOn:
+ - P17777
+
+ # If 'yes', a tree is generated per named sequence in the areas of
+ # interest; otherwise a super-matrix is created from the combined set
+ # of regions specified below.
+ PerGeneTrees: no
+
+ # Which Regions Of Interest to build the phylogeny from.
+ RegionsOfInterest:
+ Ensembl.v20.protein_coding.CDS:
+ # Partitioning scheme for sequences: Numbers specify which group a
+ # position belongs to, while 'X' excludes the position from the final
+ # partitioned sequence; thus "123" splits sequences by codon-positions,
+ # while "111" produces a single partition per gene. If set to 'no',
+ # a single partition is used for the entire set of regions.
+ Partitions: "112"
+ # Limit analysis to a subset of a RegionOfInterest; subsets are expected to be
+ # located at <genome root>/<prefix>.<region name>.<subset name>.names, and
+ # contain single name (corresponding to column 4 in the BED file) per line.
+ SubsetRegions: HighlyCovered
+
+ ExaML:
+ # Number of times to perform full phylogenetic inference
+ Replicates: 1
+ # Number of bootstraps to compute
+ Bootstraps: 100
+ # Model of rate heterogeneity (GAMMA or PSR)
+ Model: GAMMA
diff --git a/paleomix/resources/examples/nature_protocols/phylogeny/select_highly_covered_genes.py b/paleomix/resources/examples/nature_protocols/phylogeny/select_highly_covered_genes.py
new file mode 100755
index 0000000..561bf72
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/phylogeny/select_highly_covered_genes.py
@@ -0,0 +1,68 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import optparse
+
+from paleomix.common.formats.msa import \
+ MSA
+
+
+def _is_sufficently_covered(filepath, min_coverage):
+ msa = MSA.from_file(filepath)
+ if msa.seqlen() % 3:
+ return False
+
+ total_bases_not_covered = 0
+ for fasta_record in msa:
+ total_bases_not_covered += fasta_record.sequence.upper().count("N")
+ total_bases_not_covered += fasta_record.sequence.count("-")
+
+ total_bases = float(len(msa) * msa.seqlen())
+ frac_covered = 1.0 - total_bases_not_covered / total_bases
+ return frac_covered >= min_coverage
+
+
+def main(argv):
+ usage = "%prog [options] <PATH>"
+ parser = optparse.OptionParser(usage=usage)
+ parser.add_option("--min-coverage", default=0.8, type=float,
+ help="Minimum fraction of called bases in a MSA "
+ "[%default]")
+
+ config, args = parser.parse_args(argv)
+ if not args:
+ parser.print_usage()
+ return 1
+
+ for root_dir in args:
+ for filename in os.listdir(root_dir):
+ if filename.endswith(".afa"):
+ fpath = os.path.join(root_dir, filename)
+ if _is_sufficently_covered(fpath, config.min_coverage):
+ sequence_name, _ = filename.rsplit(".", 1)
+ print(sequence_name)
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/resources/examples/nature_protocols/phylogeny/summarize_heterozygosity.py b/paleomix/resources/examples/nature_protocols/phylogeny/summarize_heterozygosity.py
new file mode 100755
index 0000000..0b8e701
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/phylogeny/summarize_heterozygosity.py
@@ -0,0 +1,129 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import sys
+import pysam
+
+from paleomix.common.vcfwrap import \
+ get_ml_genotype
+
+import paleomix.common.timer as timer
+
+
+def read_bed_records(filename):
+ """Reads a bed-file (i.e. for a set of regions of interest), and returns
+ a sorted list containing each line as a tuple containing the contig name,
+ the start position, and the end position."""
+ regions = []
+ bed_parser = pysam.asBed()
+ with open(filename) as bed_file:
+ for line in bed_file:
+ line = line.strip()
+ if not line or line.startswith('#'):
+ continue
+ regions.append(bed_parser(line, len(line)))
+ return regions
+
+
+def select_vcf_records(bed_records, vcf_records):
+ """Returns an iterable of VCF records, corresponding to the contents of each
+ region specified by the BED records. Records are returned at most once, even
+ if covered by multiple BED records."""
+ contigs = frozenset(vcf_records.contigs)
+ vcf_parser = pysam.asVCF()
+
+ # Timer class used to report processing progress; meant primarily for BAM files
+ progress = timer.BAMTimer(None)
+
+ # Cache of positions observed for this contig, to prevent returning
+ # positions in overlapping regions multiple times
+ contig_cache = None
+ contig_cache_name = None
+
+ for bed in sorted(bed_records):
+ if bed.contig not in contigs:
+ # Skip contigs for which no calls have been made (e.g. due to
+ # low coverage). Otherwise Pysam raises an exception.
+ continue
+ elif contig_cache_name != bed.contig:
+ # Reset cache per contig, to save memory
+ contig_cache = set()
+ contig_cache_name = bed.contig
+
+ for record in vcf_records.fetch(bed.contig, bed.start, bed.end, parser = vcf_parser):
+ progress.increment()
+
+ if record.pos in contig_cache:
+ # We've already reported this VCF record
+ continue
+
+ contig_cache.add(record.pos)
+ # Skip records filtered by VCF_filter
+ if record.filter in ('.', "PASS"):
+ yield record
+ progress.finalize()
+
+
+def main(argv):
+ if len(argv) != 2:
+ sys.stderr.write("Usage: %s <BED-file> <VCF.bgz>\n")
+ return 1
+
+ sites = 0
+ sites_non_ref = 0
+ sites_homo_non_ref = 0
+ sites_het_one_non_ref = 0
+ sites_het_two_non_ref = 0
+
+ vcf_records = pysam.Tabixfile(argv[1])
+ bed_records = read_bed_records(argv[0])
+
+ for record in select_vcf_records(bed_records, vcf_records):
+ if record.alt != '.':
+ # Get the most likely diploid genotype
+ nt_a, nt_b = get_ml_genotype(record)
+ if (nt_a, nt_b) == ('N', 'N'):
+ # Skip sites with no most likely genotype
+ continue
+
+ sites += 1
+ sites_non_ref += 1
+ if nt_a == nt_b:
+ sites_homo_non_ref += 1
+ elif record.ref not in (nt_a, nt_b):
+ sites_het_two_non_ref += 1
+ else:
+ sites_het_one_non_ref += 1
+ else:
+ # Heterozygous for the reference allele
+ sites += 1
+
+ print
+ print "%i sites kept after filtering:" % (sites,)
+ print " % 10i homozygous sites containing the reference allele (%.2f%%)" % (sites - sites_non_ref, 100.0 * (sites - sites_non_ref) / float(sites))
+ print " % 10i heterozygous sites containing the reference and a non-reference allele (%.2f%%)" % (sites_het_one_non_ref, (100.0 * sites_het_one_non_ref) / sites)
+ print " % 10i homozygous sites containing a single non-reference allele (%.2f%%)" % (sites_homo_non_ref, (100.0 * sites_homo_non_ref) / sites)
+ print " % 10i heterozygous sites containing two different non-reference alleles (%.2f%%)" % (sites_het_two_non_ref, (100.0 * sites_het_two_non_ref) / sites)
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/resources/examples/nature_protocols/profiling/build_all_profiles.sh b/paleomix/resources/examples/nature_protocols/profiling/build_all_profiles.sh
new file mode 100755
index 0000000..a44dcd9
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/profiling/build_all_profiles.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+{
+## Pi1845A
+echo Pi1845A.id.CGCTAT ../alignment/Pi1845A/reads/Pi1845A/Pi1845A_id_CGCTAT/*/reads.collapsed.bz2
+
+## Pi1889
+echo Pi1889.id.CTTGTA ../alignment/Pi1889/reads/Pi1889/Pi1889_id_CTTGTA/*/reads.collapsed.bz2
+echo Pi1889.id.TAGCTT ../alignment/Pi1889/reads/Pi1889/Pi1889_id_TAGCTT/*/reads.collapsed.bz2
+echo Pi1889.id.GGCTAC ../alignment/Pi1889/reads/Pi1889/Pi1889_id_GGCTAC/*/reads.collapsed.bz2
+
+## M-0182896
+echo M.0182896_UDG ../alignment/M-0182896/reads/M-0182896/M-0182896_UDG/*/reads.collapsed.bz2
+echo M.0182896_UDGa ../alignment/M-0182896/reads/M-0182896/M-0182896_UDGa/*/reads.collapsed.bz2
+echo M.0182896_UDGb ../alignment/M-0182896/reads/M-0182896/M-0182896_UDGb/*/reads.collapsed.bz2
+echo M.0182896_UDGc ../alignment/M-0182896/reads/M-0182896/M-0182896_UDGc/*/reads.collapsed.bz2
+echo M.0182896_NO_UDG ../alignment/M-0182896/reads/M-0182896/M-0182896_NO_UDG/*/reads.collapsed.bz2
+} | xargs -P 16 -L1 ./build_profile.sh
diff --git a/paleomix/resources/examples/nature_protocols/profiling/build_profile.sh b/paleomix/resources/examples/nature_protocols/profiling/build_profile.sh
new file mode 100755
index 0000000..b867827
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/profiling/build_profile.sh
@@ -0,0 +1,103 @@
+#!/bin/bash
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+if [ $# -lt 2 ];
+then
+ echo "Usage: $0 <output-prefix> <input_1.bz2> [<input_2.bz2>, ...]"
+ exit 1
+fi
+
+OUTPUT_PREFIX=$1
+# Remove the prefix parameter from the list of parameters
+shift 1
+
+JAR_ROOT=~/install/jar_root
+METAPHLAN_ROOT=~/install/metaphlan
+BOWTIE_PREFIX=${METAPHLAN_ROOT}/bowtie2db/mpa
+
+OUTPUT_SAM=${OUTPUT_PREFIX}.bowtie2.out.sam
+OUTPUT_SORTED_BAM=${OUTPUT_PREFIX}.sorted.bam
+OUTPUT_RMDUP_BAM=${OUTPUT_PREFIX}.noduplicates.bam
+OUTPUT_METAPHLAN_INPUT=${OUTPUT_PREFIX}.noduplicates
+OUTPUT_METAPHLAN=${OUTPUT_PREFIX}.txt
+
+if [ ! -e "${JAR_ROOT}/SortSam.jar" ];
+then
+ echo "Required JAR file is missing; please install:"
+ echo " - ${JAR_ROOT}/SortSam.jar"
+ exit 1
+elif [ ! -e "${METAPHLAN_ROOT}/metaphlan.py" ];
+then
+ echo "MetaPhlAn does not appear to be installed; please install at:"
+ echo " - ${METAPHLAN_ROOT}/metaphlan.py"
+ exit 1
+fi
+
+for executable in bzcat bowtie2 java bam_rmdup_collapsed samtools;
+do
+ if ! which ${executable} > /dev/null;
+ then
+ echo "Required executable is missing ('${executable}'); please install."
+ exit 1
+ fi
+done
+
+
+echo
+echo "Generating profile from $# files, saving to ${OUTPUT_METAPHLAN}"
+
+if [ ! -e "${OUTPUT_METAPHLAN}" ];
+then
+ if [ ! -e "${OUTPUT_METAPHLAN_INPUT}" ];
+ then
+ if [ ! -e "${OUTPUT_RMDUP_BAM}" ];
+ then
+ if [ ! -e "${OUTPUT_SORTED_BAM}" ];
+ then
+ if [ ! -e "${OUTPUT_SAM}" ];
+ then
+ bzcat $@ | bowtie2 -x "${BOWTIE_PREFIX}" -U - -S "${OUTPUT_SAM}.tmp" --no-unal --quiet
+ mv "${OUTPUT_SAM}.tmp" "${OUTPUT_SAM}"
+ fi
+
+ java -Xmx4g -jar ${JAR_ROOT}/SortSam.jar SO=coordinate VERBOSITY=WARNING QUIET=TRUE \
+ "I=${OUTPUT_SAM}" "O=${OUTPUT_SORTED_BAM}.tmp"
+ mv "${OUTPUT_SORTED_BAM}.tmp" "${OUTPUT_SORTED_BAM}"
+ fi
+
+ bam_rmdup_collapsed --remove-duplicates < "${OUTPUT_SORTED_BAM}" > "${OUTPUT_RMDUP_BAM}.tmp"
+ mv "${OUTPUT_RMDUP_BAM}.tmp" "${OUTPUT_RMDUP_BAM}"
+ fi
+
+ samtools view ${OUTPUT_RMDUP_BAM} | awk '{print $1 "\t" $3}' > "${OUTPUT_METAPHLAN_INPUT}.tmp";
+ mv "${OUTPUT_METAPHLAN_INPUT}.tmp" "${OUTPUT_METAPHLAN_INPUT}"
+ fi
+
+ ${METAPHLAN_ROOT}/metaphlan.py "${OUTPUT_METAPHLAN_INPUT}" > "${OUTPUT_METAPHLAN}.tmp"
+ mv "${OUTPUT_METAPHLAN}.tmp" "${OUTPUT_METAPHLAN}"
+fi
+
+echo "Done: Profile written to ${OUTPUT_METAPHLAN}"
diff --git a/paleomix/resources/examples/nature_protocols/profiling/metagenomic_profile.R b/paleomix/resources/examples/nature_protocols/profiling/metagenomic_profile.R
new file mode 100644
index 0000000..699b24d
--- /dev/null
+++ b/paleomix/resources/examples/nature_protocols/profiling/metagenomic_profile.R
@@ -0,0 +1,118 @@
+############################################################
+## Part 46:
+input <- read.csv("Potato_merged.csv", header=TRUE, row.names=1, sep="\t")
+
+dim(input)
+
+abundances <- input[row.names(input)[grep("g__[\\w_]*$", row.names(input), perl=TRUE)],]
+row.names(abundances) <- sapply(strsplit(row.names(abundances), "g__"), `[`, 2)
+write.csv(abundances, "Relative_abundances_genus.csv")
+
+
+############################################################
+## Part 47:
+
+# Load required packages
+library(permute)
+library(vegan)
+library(MASS)
+# Calculate the number of genera identified for each profile
+taxon_table <- specnumber(t(abundances))
+# Export a summary table
+write.table(taxon_table, "Taxon_count_genus.txt", sep="\t", col.names=FALSE)
+# Calculate the Shannon diversity index for each profile at the genus level
+diversity_table <- diversity(t(abundances), index="shannon")
+# Export a summary table
+write.table(diversity_table, "Diversity_genus.txt", sep="\t", col.names=FALSE)
+# Calculate Bray-Curtis distances among profiles at the genus level
+distances <- vegdist(t(abundances), method="bray")
+# Export a summary table
+write.matrix(as.matrix(distances), "Distances_genus.txt", sep="\t")
+
+
+############################################################
+## Part 48:
+
+# Load required packages
+library(gplots)
+# Save heatmap as .pdf file
+pdf("Heatmap_genus.pdf")
+# Draw the heatmap (see below)
+heatmap.2(as.matrix(abundances), trace="none", col=colorpanel(50, low="gray91", mid="darkblue", high="red"), dendrogram="both", ColSideColors=rep(c("red","green","blue"), times=c(5,1,3)), margins=c(12, 13))
+# Close the .pdf file
+dev.off()
+
+
+############################################################
+## Part 49:
+
+# Perform the Principal Coordinate Analysis
+library(ape)
+pcoa <- pcoa(distances)
+# Save the PCA plot as a pdf file.
+pdf("PCOA_genus.pdf")
+
+
+############################################################
+## Part 50:
+
+# Plot the first two dimensions
+plot(pcoa$vectors[,1], pcoa$vectors[,2], pch=16, cex=1.5, cex.axis=0.9, font.lab=2, font.axis=2, xlab="Dimension 1", ylab="Dimension 2", col=rep(c("red","green","blue"), times=c(5, 1, 3)))
+# Add profile name labels, colored using the same color scheme as above.
+text(x=pcoa$vectors[,1], y=pcoa$vectors[,2], labels=row.names(pcoa$vectors), font=2, cex=0.8, pos=3, col=rep(c("red","green","blue"), times=c(5, 1, 3)))
+# Add a legend showing the correspondence between profiles and samples.
+legend('topright', legend=c("M-0182896", "Pi1845A", "Pi1889"), pch=16, col=c('red', 'green', 'blue'), bty='n', cex=.75)
+dev.off()
+
+# Perform Principal Component Analysis
+pca <- prcomp(t(abundances), scale.=T)
+# Calculate the percentage of the variance accounted by principal components 1 and 2
+PC1 <- round (100 * (summary(pca)$importance[2,1]), 1)
+PC2 <- round (100 * (summary(pca)$importance[2,2]), 1)
+# Plot PCA scores for principal component 1 and 2
+pdf("PCA_genus.pdf")
+
+# Plot the first two principal components
+plot(pca$x[,1], pca$x[,2], pch=16, cex=1.5, cex.axis=0.9, font.lab=2, font.axis=2, xlab=paste("Principal component 1 - ", PC1, "% variance", sep=""), ylab=paste("Principal component 2 - ", PC2, "% variance", sep=""), col=rep(c("red","green","blue"), times=c(5, 1, 3)))
+# Add profile name labels, colored using the same color scheme as above.
+text(pca$x[,1], pca$x[,2], colnames(abundances), font=2, cex=0.8, pos=3, col=rep(c("red", "green", "blue"), times=c(5, 1, 3)))
+# Add a legend showing the correspondence between profiles and samples.
+legend('topright', legend=c("M-0182896", "Pi1845A", "Pi1889"), pch=16, col=c('red', 'green', 'blue'), bty='n', cex=.75)
+
+# Plot PCA loadings and their labels
+vectors.x <- (pca$rotation[,1]) * 8
+vectors.y <- (pca$rotation[,2]) * 8
+points(cbind(vectors.x, vectors.y), col="grey50", type="n")
+text(cbind(vectors.x, vectors.y), rownames(cbind(vectors.x, vectors.y)), cex=0.6, font=2, pos=3, col="grey50")
+for (v in 1:length(vectors.x)) {
+ segments(0,0, vectors.x[v], vectors.y[v], col="grey50", lty=3, lwd=2.5)
+}
+dev.off()
+
+
+############################################################
+## Part 51:
+
+library(pvclust)
+clustering <- pvclust(abundances, method.dist="manhattan", method.hclust="average", nboot=1000)
+pdf("Clustering_genus.pdf")
+plot(clustering)
+dev.off()
+
+clustering
+
+
+############################################################
+## Part 53:
+
+# Import and format class-level abundance data
+input2 <- read.csv("Potato_merged_profiles.csv", header=TRUE, row.names=1, sep="\t")
+abundances2 <- input2[row.names(input2)[grep("c__[\\w_]*$", row.names(input2), perl=TRUE)],]
+row.names(abundances2) <- sapply(strsplit(row.names(abundances2), "c__"), `[`, 2)
+data_table <- data.frame(samples=rep(colnames(abundances2), each=dim(abundances2)[1]), taxa=rep(row.names(abundances2), dim(abundances2)[2]), datavect=unlist(abundances2))
+
+library(ggplot2)
+library(grid)
+# Draw the stacked bar plot
+ggplot(data_table, aes(x=samples)) + geom_bar(aes(weight=datavect, fill=taxa), position='fill') + scale_y_continuous("", breaks=NULL) + scale_fill_manual(values=rainbow(dim(abundances2)[1])) + theme(axis.text.x=element_text(angle=90, hjust = 0, color="black"), legend.text=element_text(size = 8)) + theme(legend.key.size=unit(0.6, "lines"))
+ggsave("Barplot_class.pdf")
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_makefile.yaml b/paleomix/resources/examples/phylo_pipeline/alignment/000_makefile.yaml
new file mode 100644
index 0000000..5ccda55
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_makefile.yaml
@@ -0,0 +1,169 @@
+# -*- mode: Yaml; -*-
+# Timestamp: 2013-09-24T10:53:40.257580
+#
+# Default options.
+# Can also be specified for a set of samples, libraries, and lanes,
+# by including the "Options" hierarchy at the same level as those
+# samples, libraries, or lanes below. This does not include
+# "Features", which may only be specified globally.
+Options:
+ # Sequencing platform, see SAM/BAM reference for valid values
+ Platform: Illumina
+ # Quality offset for Phred scores, either 33 (Sanger/Illumina 1.8+)
+ # or 64 (Illumina 1.3+ / 1.5+). For Bowtie2 it is also possible to
+ # specify 'Solexa', to handle reads on the Solexa scale. This is
+ # used during adapter-trimming and sequence alignment
+ QualityOffset: 33
+ # Split a lane into multiple entries, one for each (pair of) file(s)
+ # found using the search-string specified for a given lane. Each
+ # lane is named by adding a number to the end of the given barcode.
+ SplitLanesByFilenames: no
+ # Compression format for FASTQ reads; 'gz' for GZip, 'bz2' for BZip2
+ CompressionFormat: bz2
+
+ # Settings for trimming of reads, see AdapterRemoval man-page
+ AdapterRemoval:
+ # Adapter sequences, set and uncomment to override defaults
+# --adapter1: AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG
+# --adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+ # Some BAM pipeline defaults differ from AR defaults;
+ # To override, change these value(s):
+ --mm: 3
+ --minlength: 25
+ # Extra features enabled by default; change 'yes' to 'no' to disable
+ --collapse: yes
+ --trimns: yes
+ --trimqualities: yes
+
+ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 25
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # May be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+ # Additional command-line options may be specified for the "aln"
+ # call(s), as described below for Bowtie2 below.
+
+ # Settings for mappings performed using Bowtie2
+ Bowtie2:
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Examples of how to add additional command-line options
+# --trim5: 5
+# --trim3: 5
+ # Note that the colon is required, even if no value is specified
+ --very-sensitive:
+ # Example of how to specify multiple values for an option
+# --rg:
+# - CN:SequencingCenterNameHere
+# - DS:DescriptionOfReadGroup
+
+ # Mark / filter PCR duplicates. If set to 'filter', PCR duplicates are
+ # removed from the output files; if set to 'mark', PCR duplicates are
+ # flagged with bit 0x400, and not removed from the output files; if set to
+ # 'no', the reads are assumed to not have been amplified. Collapsed reads
+ # are filtered using the command 'paleomix rmdup_collapsed', while "normal"
+ # reads are filtered using Picard MarkDuplicates.
+ PCRDuplicates: filter
+
+ # Carry out quality base re-scaling of libraries using mapDamage
+ # This will be done using the options set for mapDamage below
+ RescaleQualities: no
+
+ # Command-line options for mapDamage; note that the long-form
+ # options are expected; --length, not -l, etc. Uncomment the
+ # "mapDamage" line adding command-line options below.
+ mapDamage:
+ # By default, the pipeline will downsample the input to 100k hits
+ # when running mapDamage; remove to use all hits
+ --downsample: 100000
+
+ # Set to 'yes' to exclude a type of trimmed reads from alignment / analysis;
+ # possible read-types reflect the output of AdapterRemoval
+ ExcludeReads:
+ Single: no # Single-ended reads / Orphaned paired-ended reads
+ Paired: no # Paired ended reads
+ Singleton: no # Paired reads for which the mate was discarded
+ Collapsed: no # Overlapping paired-ended reads collapsed into a
+ # single sequence by AdapterRemoval
+ CollapsedTruncated: no # Like 'Collapsed', except that the reads were
+ # truncated due to the presence of ambiguous
+ # bases or low quality bases at read termini.
+
+ # Optional steps to perform during processing
+ Features:
+ RawBAM: no # Generate BAM from the raw libraries (no indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.bam
+ RealignedBAM: yes # Generate indel-realigned BAM using the GATK Indel realigner
+ # Location: {Destination}/{Target}.{Genome}.realigned.bam
+ mapDamage: no # Generate mapDamage plot for each (unrealigned) library
+ # Location: {Destination}/{Target}.{Genome}.mapDamage/{Library}/
+ Coverage: yes # Generate coverage information for the raw BAM (wo/ indel realignment)
+ # Location: {Destination}/{Target}.{Genome}.coverage
+ Depths: yes # Generate histogram of number of sites with a given read-depth
+ # Location: {Destination}/{Target}.{Genome}.depths
+ Summary: yes # Generate summary table for each target
+ # Location: {Destination}/{Target}.summary
+ DuplicateHist: no # Generate histogram of PCR duplicates, for use with PreSeq
+ # Location: {Destination}/{Target}.{Genome}.duphist/{Library}/
+
+
+# Map of prefixes by name, each having a Path key, which specifies the
+# location of the BWA/Bowtie2 index, an optional label, and an optional
+# set of regions for which additional statistics are produced.
+Prefixes:
+ # Name of the prefix; is used as part of the output filenames
+ rCRS:
+
+ # Path to .fasta file containing a set of reference sequences.
+ Path: 000_prefixes/rCRS.fasta
+
+ # Label for prefix: One of nuclear, mitochondrial, chloroplast,
+ # plasmid, bacterial, or viral. Is used in the .summary files.
+ Label: "mitochondrial"
+
+ # Produce additional coverage / depth statistics for a set of
+ # regions defined in a BED file; if no names are specified for the
+ # BED records, results are named after the chromosome / contig.
+# RegionsOfInterest:
+# NAME: PATH_TO_BEDFILE
+
+
+bonobo:
+ synth_bonobo_sample:
+ synth_bonobo_library:
+ synth_bonobo_lane: "000_reads/bonobo/*_R{Pair}_*.fastq.gz"
+
+chimpanzee:
+ synth_chimpanzee_sample:
+ synth_chimpanzee_library:
+ synth_chimpanzee_lane: "000_reads/chimpanzee/*_R{Pair}_*.fastq.gz"
+
+gorilla:
+ synth_gorilla_sample:
+ synth_gorilla_library:
+ synth_gorilla_lane: "000_reads/gorilla/*_R{Pair}_*.fastq.gz"
+
+sumatran_orangutan:
+ synth_sumatran_orangutan_sample:
+ synth_sumatran_orangutan_library:
+ synth_sumatran_orangutan_lane: "000_reads/sumatran_orangutan/*_R{Pair}_*.fastq.gz"
+
+white_handed_gibbon:
+ synth_white_handed_gibbon_sample:
+ synth_white_handed_gibbon_library:
+ synth_white_handed_gibbon_lane: "000_reads/white_handed_gibbon/*_R{Pair}_*.fastq.gz"
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/bonobo.fasta b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/bonobo.fasta
new file mode 100644
index 0000000..af07ceb
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/bonobo.fasta
@@ -0,0 +1,239 @@
+>gi|5835135|ref|NC_001644.1| Pan paniscus mitochondrion, complete genome
+GTTTATGTAGCTTACCCCCTTAAAGCAATACACTGAAAATGTTTCGACGGGTTTATATCACCCCATAAAC
+AAACAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCGTCCCGTGAG
+TCACCCTCTAAATCACCATGATCAAAAGGAACAAGTATCAAGCACACAGCAATGCAGCTCAAGACGCTTA
+GCCTAGCCACACCCCCACGGGAGACAGCAGTGATAAACCTTTAGCAATAAACGAAAGTTTAACTAAGCCA
+TACTAACCTCAGGGTTGGTCAATTTCGTGCTAGCCACCGCGGTCACACGATTAACCCAAGTCAATAGAAA
+CCGGCGTAAAGAGTGTTTTAGATCACCCCCCCCCCAATAAAGCTAAAATTCACCTGAGTTGTAAAAAACT
+CCAGCTGATACAAAATAAACTACGAAAGTGGCTTTAACACATCTGAACACACAATAGCTAAGACCCAAAC
+TGGGATTAGATACCCCACTATGCTTAGCCCTAAACTTCAACAGTTAAATTAACAAAACTGCTCGCCAGAA
+CACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCT
+GTAATCGATAAACCCCGATCAACCTCACCGCCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCC
+TGATGAAGGTTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCTATGAGGCG
+GCAAGAAATGGGCTACATTTTCTACCCCAGAAAATTACGATAACCCTTATGAAACCTAAGGGTCGAAGGT
+GGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGT
+CACCCTCCTCAAGTATACTTCAAAGGATATTTAACTTAAACCCCTACGCATTTATATAGAGGAGATAAGT
+CGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTAGCTTAACATAAAGCACCCAA
+CTTACACTTAGGAGATTTCAACTCAACTTGACCACTCTGAGCCAAACCTAGCCCCAAACCCCCTCCACCC
+TACTACCAAACAACCTTAACCAAACCATTTACCCAAATAAAGTATAGGCGATAGAAATTGTAAATCGGCG
+CAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTACACCCAAGCATAATACAGCAAGGACTAACCCC
+TGTACCTTTTGCATAATGAATTAACTAGAAATAACTTTGCAAAGAGAACTAAAGCCAAGATCCCCGAAAC
+CAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTATA
+GGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTA
+AATTTACCTACAGAACCCTCTAAATCCCCCTGTAAATTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTA
+GACACTAGGAAAAAACCTTATGAAGAGAGTAAAAAATTTAATGCCCATAGTAGGCCTAAAAGCAGCCACC
+AATTAAGAAAGCGTTCAAGCTCAACACCCACAACCTCAAAAAATCCCAAGCATACAAGCGAACTCCTTAC
+GCTCAATTGGACCAATCTATTACCCCATAGAAGAGCTAATGTTAGTATAAGTAACATGAAAACATTCTCC
+TCCGCATAAGCCTACTACAGACCAAAATATTAAACTGACAATTAACAGCCCAATATCTACAATCAACCAA
+CAAGCCGTTATTACCCCCACTGTTAACCCAACACAGGCATGCACACAAGGAAAGGTTAAAAAAAGTAAAA
+GGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATTACCAGTATTAGAGGCA
+CCGCCTGCCCGGTGACATATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGT
+TCCTTAAATAGGGACTTGTATGAATGGCTCCACGAGGGTTTAGCTGTCTCTTACTTTCAACCAGTGAAAT
+TGACCTACCCGTGAAGAGGCGGGCATAACATAGCAAGACGAGAAGACCCTATGGAGCTTTAATTCATTAA
+TGCAAACAATACTTAACAAACCCACAGGTCCTAAACTATTAAACCTGCATTAAAAATTTCGGTTGGGGCG
+ACCTCGGAGCACAACCCAACCTCCGAGCAACACATGCTAAGACCTCACCAGTCAAAGCGAATTATTACAT
+CCAATTGATCCAATTACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTATTCCAGAG
+TCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATTAA
+AGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTC
+TATCTGTTCTAAATTTCTCCCTGTACGAAAGGACAAGAGAAATGGGGCCTACTTCACAAAGCGCCTCCCC
+CAATAAATGATATTATCTCAATTTAACACCACACCTACACCCACTCAAGAACAGAGTTTGTTAAGATGGC
+AGAGCCCGGTAATCGCATAAAACTTAAAACTTTACAATCAGAGGTTCAATTCCTCTTCTTAACAACACAC
+CCATGACTAACCTCCTACTCCTCATTGTACCTGTCCTAATCGCAATAGCATTCCTAATGCTAACCGAACG
+AAAAATCCTAGGCTACATACAACTACGCAAAGGCCCCAACATTGTAGGCCCCTACGGACTATTACAGCCT
+TTCGCTGACGCCATAAAACTCTTCACCAAAGAACCCTTAAAACCCTCCACTTCAACTATTACCCTCTATA
+TTACCGCCCCAACCTTAGCCCTGACCATCGCCCTCTTACTATGAACCCCTCTCCCCATGCCCAACCCCCT
+AGTCAATCTTAACTTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAGCCGTTTACTCAATCCTCTGA
+TCAGGATGAGCATCAAACTCGAACTACGCCTTAATCGGTGCATTACGAGCAGTAGCCCAAACAATCTCAT
+ACGAAGTCACTCTAGCCATTATTCTACTATCAACGCTACTAATAAGTGGCTCCTTCAATCTCTCTACCCT
+TATCACAACACAAGAACACCTCTGACTAATCCTGCCAACATGACCCTTGGCCATAATATGATTTATCTCT
+ACACTAGCAGAAACCAACCGGACTCCCTTCGACCTTACTGAAGGAGAATCTGAACTAGTCTCAGGCTTCA
+ACATCGAATATGCCGCAGGCCCATTTGCCCTATTCTTTATAGCCGAATACATAAACATTATTATAATAAA
+CACCCTCACTGCTACAATCTTCCTAGGAACAACATACAATACGCACTCCCCTGAACTCTACACGACATAC
+TTTGTCACCAAGGCTCTACTTTTAACCTCCCTGTTCCTATGAATTCGAACAACATATCCCCGACTTTGCT
+ACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATCACTCATGTGATATAT
+TTCCATACCCACTACAATCTCCAGCATCCCCCCTCAAACCTAAGAAATATGTCTGATAAAAGAATTACTT
+TGATAGAGTAAATAATAGGAGTTTAAATCCCCTTATTTCTAGGACTATGAGAGTCGAACCCATCCCTGAG
+AATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTCAGCTAAATAAGCTATCGGGC
+CCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCTAGCCCAACCCATCATCTACT
+CTACCATCTTCGCAGGCACATTCATTACAGTGCTAAGCTCACACTGATTTTTCACCTGAGTAGGCCTAGA
+AATAAATATACTAGCTTTTATCCCAGTCCTAACCAAAAAAATAAGCCCCCGCTCCACAGAAGCTGCCATC
+AAATACTTTCTCACACAAGCAACCGCGTCCATGATTCTCCTAATAGCTATCCTCTCCAACAACATACTCT
+CCGGACAATGAACCATAACCAACACTACCAATCAATACTCATCATTAATAATTATAACAGCTATGGCAAT
+AAAACTAGGTATAGCCCCCTTTCACTTCTGAGTTCCAGAAGTTACCCAAGGCACCCCCCTAATATCCGGC
+CTACTCCTCCTCACATGACAAAAACTAGCCCCCATCTCAATTATATACCAAATATCCTCATCACTGAACG
+TAAACCTTCTCCTCACCCTTTCAATCTTGTCCATTATAGCAGGCAGCTGAGGCGGACTAAACCAAACCCA
+ACTACGCAAAATCCTAGCATACTCCTCAATCACCCACATAGGCTGAATAATAGCAGTTCTACCATACAAC
+CCTAACATAACCATTCTTAATTTAACTATTTATATCATCCTAACTACTACCACATTCCTGCTACTCAACT
+TAAACTCCAGCACCACAACCCTACTACTATCTCGCACCTGAAACAAGCTAACATGACTAACTCCTCTAAT
+CCCATCCACCCTCCTCTCCCTAGGAGGCCTACCCCCACTAACCGGCTTCTTGCCCAAATGAGTTATCATC
+GAAGAATTCACAAAAAATAATAGCCTCATTATCCCCACCACCATAGCCATCATCACTCTCCTTAACCTCT
+ATTTCTACCTCCGCCTAATCTACTCCACCTCGATCACACTACTTCCTATATCCAATAATGTAAAAATAAA
+ATGACAATTTGAACATACAAAACCTACCCCCTTCCTCCCTACACTCATCACCCTTACCACACTACTTCTA
+CCCATCTCCCCTTTCATACTAATGATCTTATAGAAATTTAGGTTAAACACAGACCAAGAGCCTTCAAAGC
+TCTCAGTAAGTTACAATACTTAATTTCTGCAACAACTAAGGACTGCAAAACCCCACTCTGCATCAACTGA
+ACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGATTAATGGGACTTAAACCCACAAACATTTA
+GTTAACAGCTAAACACCCTAATCAGCTGGCTTCAATCTACTTCTCCCGCCGCAAGAAAAAAAGGCGGGAG
+AAGCCCCGGCAGGTTTGAAGCTGCTTCTTTGAATTTGCAATTCAATATGAAAATCACCTCAGAGCTGGTA
+AAAAGAGGCTTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCCACCCTACT
+GATGTTCACCGACCGCTGACTATTCTCTACAAACCACAAAGATATTGGAACACTATACCTACTATTCGGC
+ACATGAGCTGGAGTTCTGGGCACAGCCCTAAGTCTCCTTATTCGAGCTGAATTAGGCCAACCAGGCAACC
+TTCTAGGTAACGACCACATCTATAATGTCATTGTCACAGCCCATGCGTTCGTAATAATCTTTTTCATAGT
+AATACCTATCATAATCGGAGGCTTCGGCAACTGGCTAGTTCCCTTGATAATTGGTGCCCCCGACATGGCA
+TTCCCCCGTATAAACAACATAAGCTTCTGACTCCTACCCCCTTCTCTCCTACTTCTACTTGCATCTGCCA
+TAGTAGAAGCCGGCGCCGGAACAGGTTGGACAGTCTACCCTCCCTTAGCAGGAAACTATTCGCATCCTGG
+AGCCTCCGTAGACCTAACCATCTTCTCCTTGCACCTGGCAGGCGTCTCCTCTATCCTAGGAGCCATTAAC
+TTCATCACAACAATCATTAATATAAAACCTCCTGCCATAACCCAATACCAAACACCCCTTTTCGTCTGAT
+CCGTCCTAATTACAGCAGTCTTACTTCTCCTATCCCTCCCAGTCCTAGCTGCTGGCATCACCATACTATT
+AACAGATCGTAACCTCAACACTACCTTCTTCGACCCAGCTGGGGGAGGAGACCCTATTCTATATCAACAC
+TTATTCTGATTTTTTGGCCACCCCGAAGTTTACATTCTTATCCTACCAGGCTTCGGAATAATTTCCCACA
+TTGTAACTTATTACTCAGGAAAAAAAGAACCATTCGGATACATAGGCATGGTTTGAGCTATAATATCAAT
+TGGTTTCCTAGGGTTTATTGTGTGAGCACACCATATATTTACAGTAGGAATAGACGTAGACACACGAGCC
+TATTTCACTTCCGCTACCATAATCATTGCTATTCCTACCGGCGTCAAAGTATTCAGCTGGCTCGCTACAC
+TTCACGGAAGCAATATGAAATGATCTGCCGCAGTACTCTGAGCCCTAGGGTTCATCTTTCTCTTCACCGT
+GGGTGGCCTAACCGGTATTGTACTAGCAAACTCATCATTAGACATCGTACTACACGACACATATTACGTC
+GTAGCCCACTTCCACTACGTCCTATCAATAGGAGCTGTATTCGCCATCATAGGAGGCTTCATTCACTGAT
+TCCCCCTATTTTCAGGCTATACCCTAGACCAAACCTATGCCAAAATCCAATTTGCCATCATATTCATTGG
+CGTAAACCTAACCTTCTTCCCACAACACTTCCTTGGCCTGTCTGGAATGCCCCGACGTTACTCGGACTAC
+CCTGATGCATACACCACATGAAATGTCCTATCATCCGTAGGCTCATTCATCTCCCTAACGGCAGTAATAT
+TAATAATTTTCATAATTTGAGAAGCCTTTGCTTCAAAACGAAAAGTCCTAATAGTAGAAGAGCCCTCCGC
+AAACCTGGAGTGACTGTATGGATGCCCCCCACCCTACCACACGTTCGAAGAACCCGTGTACATAAAATCT
+AGACAAAAAAGGAAGGAATCGAACCCCCTAAAGCTGGTTTCAAGCCAACCCCATGACCCCCATGACTTTT
+TCAAAAAGATATTAGAAAAACTATTTCATAACTTTGTCAAAGTTAAATTACAGGTTAAACCCCGTATATC
+TTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCCACTTCCCCTATCATAGAAGAACTTATTATCT
+TTCATGACCATGCCCTCATAATTATCTTTCTCATCTGCTTCCTAGTCCTGTACGCCCTTTTCCTAACACT
+CACAACAAAACTAACTAATACTAGTATTTCAGACGCCCAGGAAATAGAAACCGTCTGAACTATCCTGCCC
+GCCATCATCCTAGTCCTTATTGCCCTTCCATCTCTACGTATCCTTTATATAACAGACGAGGTCAACGACC
+CCTCCTTTACTATTAAATCAATCGGCCATCAATGATATTGAACCTACGAATACACCGACTACGGCGGACT
+AATCTTCAACTCCTACATACTCCCCCCATTATTTCTAGAACCAGGCGACCTACGACTCCTTGACGTTGAT
+AACCGGGTGGTCCTCCCAGTTGAAGCCCCCGTTCGTATAATAATTACATCACAAGATGTTCTACACTCAT
+GAGCTGTCCCCACATTAGGCCTAAAAACAGACGCAATTCCCGGACGCCTAAACCAAACCACTTTCACCGC
+CACACGACCAGGAGTATACTATGGCCAATGCTCAGAAATCTGTGGAGCAAACCACAGTTTTATACCCATC
+GTCCTAGAGTTAATCCCCCTAAAAATCTTTGAAATAGGACCCGTATTCACTCTATAACACCTTCTCTACC
+CCTCTCCAAAGCTCACTGTAAAGCTAACCTAGCATTAACCTTTTAAGTTAAAGATTAAGAGGACCAACAC
+CTCTTTACAGTGAAATGCCCCAACTAAATACCGCCGTATGACCCACCACAATTACCCCTATACTCCTTAC
+ACTATTTCTTATCACCCAACTAAAAATATTAAACTCAAATTACCATCTACCCCCCTCACCAAAACCCATA
+AAAATAAAAAACTACAATAAACCCTGAGAACCAAAATGAACGAAAATCTGTTCGCTTCATTCGCTGCCCC
+CACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTCCCCCCTCTACTGGTTCCCACCTCTAAA
+CATCTCATCAACAACCGACTAATTACCACCCAACAATGACTAATTCAACTGACCTCAAAACAAATAATAA
+CTATACACAACACTAAAGGACGAACCTGATCTCTCATACTAGTATCCTTAATTATTTTTATTGCCACAAC
+TAATCTTCTCGGACTTCTACCTCACTCATTCACACCAACCACCCAACTATCTATAAACCTAGCCATGGCT
+ATCCCCCTATGAGCAGGCACAGTAGTCATAGGCTTTCGCTTTAAGACTAAAAATGCCCTAGCCCACTTCT
+TACCGCAAGGCACACCTACACCCCTTATCCCCATACTAATTATCATCGAAACTATCAGCCTATTCATTCA
+ACCAATAGCCTTAGCCGTACGTCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGGA
+AGCGCCACACTAGCATTATCAACTATTAGTTTACCCTCCACACTCATTATCTTCACAATTCTAATCCTAC
+TGACTGTTCTAGAAATCGCCGTTGCCTTAATCCAAGCCTACGTTTTTACACTTCTAGTGAGCCTCTACCT
+GCACGACAACACATAATGGCCCACCAATCACATGCTTATCATATAGTAAAACCCAGCCCATGGCCCCTAA
+CAGGGGCCCTCTCAGCCCTCCTAATAACCTCTGGCCTAGCCATATGATTCCACTTCTACTCCACAACACT
+GCTCACACTAGGTTTACTAACTAACACATTAACCATATATCAATGATGACGCGATGTTATACGAGAGAGC
+ACATACCAAGGCCACCACACACCACCCGTCCAAAAAGGTCTCCGATACGGAATAATTCTTTTCATTACCT
+CAGAAGTTTTTTTCTTTGCAGGATTTTTTTGAGCTTTCTACCACTCCAGCCTAGCCCCTACCCCCCAGCT
+AGGAGGACACTGACCCCCAACAGGTATTACCCCACTAAATCCCCTAGAAGTTCCACTCCTAAACACATCT
+GTATTACTCGCATCAGGAGTATCAATTACTTGAGCCCATCACAGCTTAATAGAAAATAACCGAAACCAAA
+TAATTCAAGCACTACTCATTACAATTTTACTAGGTCTCTATTTCACCCTCCTACAAGCCTCAGAATACTT
+CGAATCCCCTTTTACCATTTCCGATGGCATCTACGGCTCAACATTCTTTGTAGCCACAGGCTTCCACGGG
+CTCCACGTCATTATTGGATCAACTTTCCTCACTATCTGCCTCATCCGCCAACTAATATTTCACTTCACAT
+CCAAACATCACTTCGGCTTTGAAGCCGCCGCCTGATACTGGCACTTCGTAGATGTAGTCTGACTATTTCT
+ATATGTCTCCATCTACTGATGAGGATCTTACTCTTTTAGTATAAGCAGTACCGTTAACTTCCAATTAACT
+AGTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGTCCTAATTTTAATAACCAATACCCTTTTAGCCC
+TACTACTAATAATTATTACATTCTGATTACCACAACTCAACAGCTACATAGAAAAATCTAACCCTTACGA
+ATGTGGCTTCGACCCTATATCCCCCGCTCGCGTCCCCTTCTCCATAAAATTTTTCCTAGTAGCCATCACC
+TTCCTATTATTTGACCTAGAAATTGCCCTCCTATTACCCTTACCATGAGCCCTACAAACAGCCAACCTAC
+CACTAATAGTCATATCATCCCTCTTATTAATCACTATCCTAGCCCTAAGCCTCGCCTACGAATGATTACA
+AAAGGGGTTAGACTGAGCCGAATTGGTACATAGTTTAAACAAAACGAATGATTTCGACTCATTAGATTAT
+GATAATCATATTTACCAAATGCCTCTTATTTATATAAATATTATACTAGCATTTACCATCTCACTTCTAG
+GAATACTAGTATACCGCTCACACCTAATATCTTCTCTACTATGCCTAGAAGGAATAATACTATCATTATT
+CATCATAACCACCCTCATAACCCTCAATACCCACTCCCTCTTAGCCAATATTGTACCCATCACCATACTA
+GTCTTTGCTGCCTGCGAGGCAGCAGTAGGTCTAGCACTATTAGTTTCAATCTCTAACACGTATGGCCTAG
+ACTACGTACATAATCTAAACCTACTCCAATGCTAAAACTAATTATCCCAACAATTATATTACTACCACTA
+ACATGATTCTCTAAAAAACGTATAATTTGAATCAACACAACCACTCACAGCCTAATTATCAGCACCATTC
+CCCTACTATTTTTTAACCAAATCAACAACAACCTATTCAGCTGTTCCCTATCTTTCTCCTCCGACCCCCT
+AACAACCCCCCTCCTAATACTAACTGCCTGACTTCTACCCCTCACAATCATAGCAAGTCAGCGCCACCTA
+TCCAACGAACCACCATCACGAAAAAAACTCTACCTCTCCATACTAATCTCCCTCCAAATCTCCTTAATTA
+TAACATTCTCAGCCACAGAGCTAATTATATTTTACATCTTCTTCGAAACCACACTCATCCCCACCCTAGC
+CATCATCACCCGATGGGGTAACCAACCAGAACGCCTGAACGCAGGTACATACTTCCTATTCTATACCCTA
+GTAGGCTCCCTCCCCCTACTCATCGCGCTAATCTACACCCACAACACCCTAGGCTCACTAAATATCCTAT
+TGCTCACCCTCACAGCCCAAGAACTATCAAACACCTGAGCCAACAACTTAATATGACTAGCGTACACGAT
+AGCTTTCATGGTAAAAATACCACTTTACGGACTCCACCTATGACTCCCTAAAGCCCATGTCGAAGCCCCT
+ATTGCCGGATCAATGGTACTTGCTGCAGTACTCTTAAAATTAGGCGGCTATGGCATAATACGCCTCACAC
+TCATTCTCAACCCCCTAACAAAACATATAGCCTATCCCTTCCTCATGTTATCCTTATGAGGCATAATCAT
+AACAAGCTCCATCTGCCTGCGACAAACAGACCTAAAATCGCTCATTGCATACTCTTCAGTCAGCCACATA
+GCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAGTCATTCTTATAATCG
+CCCACGGACTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAATTACGAACGCACCCACAGTCGCAT
+CATAATTCTCTCCCAAGGACTTCAAACTCTACTCCCACTAATAGCCTTTTGATGACTCCTAGCAAGCCTC
+GCCAACCTCGCCCTACCCCCCACCATTAATCTCCTAGGAGAACTCTCCGTGCTAGTAACCTCATTCTCCT
+GATCAAATACTACCCTCCTACTCACAGGATTCAACATACTAATTACAGCCCTGTACTCCCTCTACATGTT
+TACCACAACACAATGAGGCTCACTCACCCACCACATTAATAACATAAAACCCTCATTCACACGAGAAAAC
+ACTCTCATATTTATACACCTATCCCCCATCCTCCTCCTATCCCTCAATCCTGATATTATCACTGGATTCA
+CCTCCTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTCACGACCCCTTAT
+TTACCGAGAAAGCTTATAAGAACTGCTAATTCATATCCCCATGCCTAACAACATGGCTTTCTCAACTTTT
+AAAGGATAACAGCCATCCGTTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAAC
+CATGTATACTACCATAACCACCTTAACCCTAACTCCCTTAATTCTCCCCATCCTCACCACCCTCATTAAC
+CCTAACAAAAAAAACTCATATCCCCATTATGTGAAATCCATTATCGCGTCCACCTTTATCATTAGCCTTT
+TCCCCACAACAATATTCATATGCCTAGACCAAGAAGCTATTATCTCAAACTGGCACTGAGCAACAACCCA
+AACAACCCAGCTCTCCCTAAGCTTTAAACTAGACTACTTTTCCATAACATTTATCCCCGTAGCACTGTTC
+GTTACATGAGCCATCATAGAATTCTCACTATGATACATAAACTCAGACCCAAACATCAACCAATTCTTCA
+AATATTTACTTATCTTCCTAATTACCATACTAATCTTAGTCACCGCTAATAACCTATTCCAACTCTTCAT
+TGGCTGAGAAGGAGTAGGAATTATATCCTTTCTACTCATTAGCTGATGGTACGCCCGAACAGATGCCAAC
+ACAGCAGCCATCCAAGCAATCCTATATAACCGCATCGGTGACATTGGTTTTATCCTAGCCCTAGCATGAT
+TCCTCCTACACTCCAACTCATGAGACCCCCAACAAATAGTCCTCCTAAGTACTAATCCAAGCCTTACTCC
+ACTACTAGGCTTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGCCTTCACCCCTGACTCCCCTCA
+GCCATAGAAGGCCCTACCCCCGTTTCAGCCCTACTCCACTCAAGCACCATAGTCGTAGCAGGAGTCTTCC
+TACTCATCCGCTTCCACCCCCTAGCAGAAAATAACCCACTAATCCAAACTCTCACGCTATGCCTAGGCGC
+TATCACCACCCTATTCGCAGCAATCTGCGCCCTTACACAAAATGACATCAAAAAAATCGTAGCCTTCTCC
+ACTTCAAGTCAATTAGGACTCATAATAGTCACAATCGGTATCAACCAACCACACCTAGCATTCCTGCACA
+TCTGCACCCATGCTTTTTTCAAAGCCATACTATTTATATGCTCCGGATCCATTATCCACAACCTCAACAA
+TGAACAAGACATTCGAAAAATAGGAGGACTACTCAAAACCATACCCCTCACCTCAACCTCCCTCATCATT
+GGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCAAAGACCTCATCATCGAAACTG
+CAAACATATCATACATAAACGCCTGAGCCCTATCTATTACTCTCATCGCCACCTCCCTAACAAGCGCCTA
+CAGCACCCGAATAATTCTCCTCACCTTAATAGGTCAACCTCGCTTCCCAACCCTCACTAACATTAACGAA
+AACAACCCCACTCTATTAAATCCCATTAAACGCCTAACGATCGGAAGCTTATTTGCAGGATTTTTCATTA
+CCAACAACATTCTCCCCATATCTACTTCCCAAATGACAATTCCCCTTTACTTGAAACTCACAGCCCTAAG
+CGTTACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAATTACCTAACCAACAAACTCAAAATAAAATCC
+CCACCATATACATTTTACTTCTCTAATATGCTCGGATTCTATCCCAACATTATACACCGCTCAATCCCCT
+ATCTAGGCCTTCTCACAAGCCAAAACCTGCCCCTACTCCTTCTAGACCTGACCTGACTAGAAAAACTACT
+ACCCAAAACAATTTCACAGTACCAAGTCTCCGCTTCCATTACCACCTCAACCCAAAAAGGTATAATCAAA
+CTTTATTTCCTCTCTTTTCTCTTCCCTCTCATCTTAACCTTACTCCTAATCATATAACCTATTCCCCCGA
+GCAATCTCAATTACAATGTATACACCAACAAATAATGTCCAACCAGTAACTACTACCAACCAACGCCCAT
+AGTCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATTAGCCCCGGCCCCTCCCCTTCATAAATTAT
+TCAACTTCCCACGCTATTAAAATTTACTACAACCACTACCCCATCATACTCTTTTACCCACAACACTAAT
+CCTACCTCCATTGCCAGTCCCACTAAAACACTAACCAAAACCTCAACCCCTGACCCCCATGCCTCAGGGT
+ACTCCTCAATAGCCATCGCCGTAGTATACCCAAAAACAACCATTATTCCCCCCAAATAAATTAAAAAAAC
+CATTAAACCTATATAACCTCCCCCATAATTCAAAATGATAGTACACCCAACTACACCACTAACAATCAAT
+ACTAAGCCCCCATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACCCTATTACTAAACTCACACTCA
+ATAAAAATAAAACATATGTCATTATTCTCGCACGGACTACAACCACGACCAATGATATGAAAAACCATCG
+TTGTATTTCAACTACAAGAACACCAATGACCCCAACACGCAAAATCAACCCACTAATAAAATTAATTAAT
+CACTCATTTATCGACCTCCCCACCCCATCCAATATTTCCACATGATGAAACTTCGGCTCACTTCTCGGCG
+CCTGCCTAATCCTTCAAATCACCACAGGACTATTCCTAGCTATACACTACTCACCAGACGCCTCAACCGC
+CTTCTCATCGATCGCCCACATTACCCGAGACGTAAACTATGGTTGAATCATCCGCTACCTTCACGCTAAC
+GGCGCCTCAATACTTTTCATCTGCCTCTTCCTACACGTCGGTCGAGGCCTATATTACGGCTCATTTCTCT
+ACCTAGAAACCTGAAACATTGGCATCATCCTCTTGCTCACAACCATAGCAACAGCCTTTATGGGCTATGT
+CCTCCCATGAGGCCAAATATCCTTCTGAGGGGCCACAGTAATTACAAACCTACTGTCCGCCATCCCATAC
+ATCGGAACAGACCTCGTCCAATGAGTCTGAGGAGGCTACTCAGTAGACAGCCCTACCCTTACACGATTCT
+TCACCCTCCACTTTATCCTACCCTTCATTATCACAGCCCTAACAACACTTCATCTCCTATTCTTACACGA
+AACAGGATCAAATAACCCCCTAGGAATCACCTCCCACTCCGACAAAATTACCTTCCACCCCTACTACACA
+ACCAAAGATATCCTTGGTTTATTCCTTTTCCTCCTCGCCCTAATAGTATTAACACTATTTTCACCAGACC
+TCCTAGGCGATCCAGACAATTACACCCTAGCCAACCCCCTAATCACCCCACCCCACATTAAGCCCGAGTG
+GTATTTTCTATTTGCCTACACAATTCTCCGATCCGTCCCCAACAAACTAGGAGGTGTTCTCGCCCTACTA
+CTATCTATCCTCATCCTAGCAGTAATCCCTATCCTCCACACATCCAAACAACAAAGCATAATATTTCGCC
+CACTAAGCCAACTGCTTTACTGACTCCTAGCCACAGACCTCCTTATCCTAACCTGAATCGGAGGACAACC
+AGTAAGCTACCCCTTCATCACCATCGGGCAAGTAGCATCCGTATTGTACTTCACAACAATCCTAATCCTA
+ATACCAATTATCTCCCTAATCGAAAACAAAATACTCGAATGAGCCTGCCCTTGTAGTATAAGCTAATACA
+CCGGTCTTGTAAACCGGAAACGAAAACTTTATTCCAAGGACAAATCAGAGAAAAAGTCTTTAACTCCACC
+ATCAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAAATTTAAGTG
+CCACCCAAGTATTGGCTCATTCACTATAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATA
+TTACATAGTACTATAATCATTTAACCACCTATAACACATAAAAACCTACATCCACATTAAAACCCCCCCC
+CCATGCATATAAGCACGAACAATAATCGACCTCCAACTGTCGAACATAAACACCCCCCCAAAGACACTCC
+CCCCCCACCCCGATACCAACAAACCTGACAGTCCTTAACAGTACATAGCACATACAATTATATACCGTAC
+ATAGCACATTACAGTCAAATCCATCCTCGCCCCCACGGATGCCCCCCCTCAGATAGGAATCCCTTGGCCA
+CCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGTACTCTCCTCGCTCCGGGCCCATAACACTTGGGG
+GTAGCTAAACTGAACTGTATCCGACATCTGGTTCCTACCTCAGGGCCATGAAGTTCAAAGGACTCCCACA
+CGTTCCCCTTAAATAAGGCATTCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCGC
+TCCATGCATTTGGTATTTTCGTCTGGGGGGTATGCACGCGATAGCATTGCGAAACGCTGGCCCCGGAGCA
+CCCTATGTCGCAGTATCTGTCTTTGGTTCCTGCCCCATTACGTTATTTATCGCACCTACGTTCAATATTA
+TTACCTAGCATGATTTACTAAAGCGTGTTAATTAATTAATGCTTGTAGGACATAACAATAACAGCAAAAT
+ACCACGTAACTGCTTTCCACACCAACATCATAACAAAAAATTTCCGCCAAACCCCCCCTCCCCCACTCCT
+GGCTACAGCACTCAAATTCATCTCTGCCAAACCCCAAAAACAAAGAACCCAGATACCAGCCTAACCAGAC
+CTCAAATTTCATCTTTTGGCGGTATGCATTTTTAACAGTCACCCCTCAACTAACATGTCCTCCCCCCTCA
+ACTCCCATTCCACTAGCCCCAACAACATAACCCCCTGCCCACCCCACTCAGCACATATACCGCTGCTAAC
+CCTATACCCTAAGCCAACCAAACCCCAAAGATATCCCCACACA
+
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/chimpanzee.fasta b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/chimpanzee.fasta
new file mode 100644
index 0000000..459be34
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/chimpanzee.fasta
@@ -0,0 +1,239 @@
+>gi|5835121|ref|NC_001643.1| Pan troglodytes mitochondrion, complete genome
+GTTTATGTAGCTTACCCCCTCAAAGCAATACACTGAAAATGTTTCGACGGGTTTACATCACCCCATAAAC
+AAACAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAAGCATCCCCGCCCCGTG
+AGTCACCCTCTAAATCGCCATGATCAAAAGGAACAAGTATCAAGCACGCAGCAATGCAGCTCAAAACGCT
+TAGCCTAGCCACACCCCCACGGGAGACAGCAGTGATAAACCTTTAGCAATAAACGAAAGTTTAACTAAGC
+CATACTAACCTCAGGGTTGGTCAATTTCGTGCTAGCCACCGCGGTCATACGATTAACCCAAGTCAATAGA
+AACCGGCGTAAAGAGTGTTTTAGATCACCCCCCCATAAAGCTAAAATTCACCTGAGTTGTAAAAAACTCC
+AGCTGATACAAAATAAACTACGAAAGTGGCTTTAACACATCTGAATACACAATAGCTAAGACCCAAACTG
+GGATTAGATACCCCACTATGCTTAGCCCTAAACTTCAACAGTTAAATTAACAAAACTGCTCGCCAGAACA
+CTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTGTTCTGT
+AATCGATAAACCCCGATCAACCTCACCGCCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTG
+ATGAAGGTTACAAAGTAAGCACAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCTATGAGGTGGC
+AAGAAATGGGCTACATTTTCTACCCCAGAAAATTACGATAACCCTTATGAAACCTAAGGGTCAAAGGTGG
+ATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGTCA
+CCCTCCTCAAGTATACTTCAAAGGATACTTAACTTAAACCCCCTACGTATTTATATAGAGGAGATAAGTC
+GTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTAGCTTAACATAAAGCACCCAAC
+TTACACTTAGGAGATTTCAACTCAACTTGACCACTCTGAGCCAAACCTAGCCCCAAACCCCCTCCACCCT
+ACTACCAAACAACCTTAACCAAACCATTTACCCAAATAAAGTATAGGCGATAGAAATTGTAAACCGGCGC
+AATAGACATAGTACCGCAAGGGAAAGATGAAAAATTATACCCAAGCATAATACAGCAAGGACTAACCCCT
+GTACCTTTTGCATAATGAATTAACTAGAAATAACTTTGCAAAGAGAACCAAAGCTAAGACCCCCGAAACC
+AGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTTATAG
+GTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGATAGAATCTTAGTTCAACTTTAA
+ATTTACCTACAGAACCCTCTAAATCCCCTTGTAAACTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTAG
+ACACTAGGAAAAAACCTTGTAAAGAGAGTAAAAAATTTAACACCCATAGTAGGCCTAAAAGCAGCCACCA
+ATTAAGAAAGCGTTCAAGCTCAACACCCACAACCTTAAAGATCCCAAACATACAACCGAACTCCTTACAC
+CCAATTGGACCAATCTATTACCCCATAGAAGAACTAATGTTAGTATAAGTAACATGAAAACATTCTCCTC
+CGCATAAGCCTACATCAGACCAAAATATTAAACTGACAATTAACAGCCTAATATCTACAATCAACCAACA
+AGCCATTATTACCCCCGCTGTTAACCCAACACAGGCATGCCCACAAGGAAAGGTTAAAAAAAGTAAAAGG
+AACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATTACCAGTATTAGAGGCACC
+GCCTGCCCGGTGACATATGTTTAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTTC
+CTTAAATAGGGACTTGTATGAATGGCTCCACGAGGGTTTAGCTGTCTCTTACTTTCAACCAGTGAAATTG
+ACCTACCCGTGAAGAGGCGGGCATAACATAACAAGACGAGAAGACCCTATGGAGCTTTAATTCATTAATG
+CAAACAATACTTAACAAACCTACAGGTCCTAAACTATTAAACCTGCATTAAAAATTTCGGTTGGGGCGAC
+CTCGGAGCACAACCCAACCTCCGAGCAATACATGCTAAGACCTCACCAGTCAAAGCGAATTACTACATCC
+AATTGATCCAATGACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTATTCCAGAGTC
+CATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATTAAAG
+GTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCTA
+TCTGTTCTAAATTTCTCCCTGTACGAAAGGACAAGAGAAATGAGGCCTACTTCACAAAGCGCCTTCCCCA
+ATAAATGATATTATCTCAATTTAGCGCCATGCCAACACCCACTCAAGAACAGAGTTTGTTAAGATGGCAG
+AGCCCGGTAATTGCATAAAACTTAAAACTTTACAATCAGAGGTTCAATTCCTCTTCTTGACAACACACCC
+ATGACCAACCTCCTACTCCTCATTGTACCCATCCTAATCGCAATAGCATTCCTAATGCTAACCGAACGAA
+AAATTCTAGGCTACATACAACTACGCAAAGGTCCCAACATTGTAGGTCCTTACGGGCTATTACAGCCCTT
+CGCTGACGCCATAAAACTCTTCACTAAAGAACCCTTAAAACCCTCCACTTCAACCATTACCCTCTACATC
+ACCGCCCCAACCCTAGCCCTCACCATTGCCCTCTTACTATGAACCCCCCTCCCCATACCCAACCCCCTAG
+TCAATCTTAACTTAGGCCTCCTATTTATTCTAGCCACCTCCAGCCTAGCCGTTTACTCAATCCTCTGATC
+AGGGTGAGCATCAAACTCGAACTACGCCTTAATCGGTGCACTACGAGCAGTAGCCCAAACAATCTCATAC
+GAAGTCACTCTAGCCATTATCCTACTGTCAACGCTACTAATAAGTGGCTCCTTCAATCTCTCTACCCTTG
+TCACAACACAAGAGCACCTCTGACTAATCCTGCCAACATGACCCCTGGCCATAATATGATTTATCTCTAC
+ACTAGCAGAGACCAACCGAACTCCCTTCGACCTTACTGAAGGAGAATCTGAACTAGTCTCAGGCTTTAAT
+ATCGAGTATGCCGCAGGCCCCTTTGCCCTATTTTTCATAGCCGAATACATAAACATTATTATAATAAACA
+CCCTCACTGCTACAATCTTCCTAGGAGCAACATACAATACTCACTCCCCTGAACTCTACACGACATATTT
+TGTCACCAAAGCTCTACTTCTAACCTCCCTGTTCCTATGAATTCGAACAGCATATCCCCGATTTCGCTAC
+GACCAGCTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTAGCATCACTCATGTGATATATCT
+CCATACCCACTACAATCTCCAGCATCCCCCCTCAAACCTAAGAAATATGTCTGATAAAAGAATTACTTTG
+ATAGAGTAAATAATAGGAGTTCAAATCCCCTTATTTCTAGGACTATAAGAATCGAACTCATCCCTGAGAA
+TCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTCAGCTAAATAAGCTATCGGGCCC
+ATACCCCGAAAATGTTGGTTACACCCTTCCCGTACTAATTAATCCCCTAGCCCAACCCATCATCTACTCT
+ACCATCCTTACAGGCACGCTCATTACAGCGCTAAGCTCACACTGATTTTTCACCTGAGTAGGCCTAGAAA
+TAAATATACTAGCTTTTATCCCAATCCTAACCAAAAAAATAAGCCCCCGCTCCACAGAAGCCGCCATCAA
+ATACTTTCTCACACAAGCAACTGCGTCCATAATTCTCCTGATAGCTATCCTCTCCAACAGCATACTCTCC
+GGACAATGAACCATAACCAATACTACCAATCAATACTCATCATTAATAATTATAATAGCAATGGCAATAA
+AACTAGGAATAGCCCCCTTTCACTTTTGAGTTCCAGAAGTTACCCAAGGCACCCCCCTAATATCCGGCCT
+ACTCCTCCTCACATGACAAAAATTAGCCCCTATTTCAATTATATACCAAATCTCCTCATCACTGAACGTA
+AACCTTCTCCTCACCCTTTCAATCTTGTCCATTATAGCAGGCAGCTGAGGCGGACTAAACCAAACCCAAC
+TACGCAAAATCCTAGCATACTCCTCAATCACCCACATAGGCTGAATAATAGCAGTCCTACCATATAACCC
+TAACATAACCATTCTTAATTTAACCATTTACATCATCCTAACTACTACCGCATTTCTGCTACTCAACTTA
+AACTCCAGCACCACAACCCTACTACTATCTCGCACCTGAAACAAGCTAACATGATTAACTCCCCTAATTC
+CATCCACCCTCCTCTCCCTAGGAGGCCTACCCCCACTAACTGGCTTCTTACCCAAATGAGTTATCATCGA
+AGAATTCACAAAAAATAATAGCCTCATCATCCCCACCATCATAGCCATCATCACTCTCCTTAACCTCTAT
+TTCTACCTACGCCTAATCTACTCCACCTCAATTACACTACTTCCCATATCTAATAACGTAAAAATAAAAT
+GACAATTCGAACATACAAAACCCACCCCCTTCCTCCCTACACTCATCACCCTTACCACACTGCTTCTACC
+CATCTCCCCCTTCATACTAATAATCTTATAGAAATTTAGGTTAAGCACAGACCAAGAGCCTTCAAAGCCC
+TCAGCAAGTTACAATACTTAATTTCTGCAACAACTAAGGACTGCAAAACCCCACTCTGCATCAACTGAAC
+GCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGATTAATGGGACTTAAACCCACAAACATTTAGT
+TAACAGCTAAACACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCCGCAAGAAAAAAAGGCGGGAGAA
+GCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAAATCACCTCAGAGCTGGTAAA
+AAGAGGCTTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCAGCCATTTTACCCCACCCTACTG
+ATGTTCACCGACCGCTGACTATTCTCTACAAACCACAAAGATATTGGAACACTATACCTACTATTCGGTG
+CATGAGCTGGAGTCCTGGGCACAGCCCTAAGTCTCCTTATTCGGGCTGAACTAGGCCAACCAGGCAACCT
+CCTAGGTAATGACCACATCTACAATGTCATCGTCACAGCCCATGCATTCGTAATAATCTTCTTCATAGTA
+ATGCCTATTATAATCGGAGGCTTTGGCAACTGGCTAGTTCCCTTGATAATTGGTGCCCCCGACATGGCAT
+TCCCCCGCATAAACAACATAAGCTTCTGGCTCCTGCCCCCTTCTCTCCTACTTCTACTTGCATCTGCCAT
+AGTAGAAGCCGGCGCGGGAACAGGTTGAACAGTCTACCCTCCCTTAGCGGGAAACTACTCGCATCCTGGA
+GCCTCCGTAGACCTAACCATCTTCTCCTTACATCTGGCAGGCATCTCCTCTATCCTAGGAGCCATTAACT
+TCATCACAACAATTATTAATATAAAACCTCCTGCCATGACCCAATACCAAACACCCCTCTTCGTCTGATC
+CGTCCTAATCACAGCAGTCTTACTTCTCCTATCCCTCCCAGTCCTAGCTGCTGGCATCACCATACTATTG
+ACAGATCGTAACCTCAACACTACCTTCTTCGACCCAGCCGGGGGAGGAGACCCTATTCTATATCAACACT
+TATTCTGATTTTTTGGCCACCCCGAAGTTTATATTCTTATCCTACCAGGCTTCGGAATAATTTCCCACAT
+TGTAACTTATTACTCCGGAAAAAAAGAACCATTTGGATATATAGGCATGGTTTGAGCTATAATATCAATT
+GGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGGGATAGACGTAGACACCCGAGCCT
+ATTTCACCTCCGCTACCATAATCATTGCTATTCCTACCGGCGTCAAAGTATTCAGCTGACTCGCTACACT
+TCACGGAAGCAATATGAAATGATCTGCCGCAGTACTCTGAGCCCTAGGGTTTATCTTTCTCTTCACCGTA
+GGTGGCCTAACCGGCATTGTACTAGCAAACTCATCATTAGACATCGTGCTACACGACACATACTACGTCG
+TAGCCCACTTCCACTACGTTCTATCAATAGGAGCTGTATTCGCCATCATAGGAGGCTTCATTCACTGATT
+CCCCCTATTCTCAGGCTATACCCTAGACCAAACCTATGCCAAAATCCAATTTGCCATCATGTTCATTGGC
+GTAAACCTAACCTTCTTCCCACAGCACTTCCTTGGCCTATCTGGGATGCCCCGACGTTACTCGGACTACC
+CCGATGCATACACCACATGAAATGTCCTATCATCCGTAGGCTCATTTATCTCCCTGACAGCAGTAATATT
+AATAATTTTCATGATTTGAGAAGCCTTTGCTTCAAAACGAAAAGTCCTAATAGTAGAAGAGCCCTCCGCA
+AACCTGGAATGACTATATGGATGCCCCCCACCCTACCACACATTCGAAGAACCCGTATACATAAAATCTA
+GACAAAAAAGGAAGGAATCGAACCCCCTAAAGCTGGTTTCAAGCCAACCCCATGACCTCCATGACTTTTT
+CAAAAAGATATTAGAAAAACTATTTCATAACTTTGTCAAAGTTAAATTACAGGTTAACCCCCGTATATCT
+TAATGGCACATGCAGCGCAAGTAGGTCTACAAGATGCTACTTCCCCTATCATAGAAGAACTTATTATCTT
+TCACGACCATGCCCTCATAATTATCTTTCTCATCTGCTTTCTAGTCCTATACGCCCTTTTCCTAACACTC
+ACAACAAAACTAACTAATACTAGTATTTCAGACGCCCAGGAAATAGAAACCGTCTGAACTATCCTGCCCG
+CCATCATCCTAGTCCTTATTGCCCTACCATCCCTGCGTATCCTTTACATAACAGACGAGGTCAACGACCC
+CTCCTTTACTATTAAATCAATCGGCCATCAATGATATTGAACCTACGAATACACCGACTACGGCGGGCTA
+ATCTTCAACTCCTACATACTCCCCCCATTATTTCTAGAACCAGGTGATCTACGACTCCTTGACGTTGATA
+ACCGAGTGGTCCTCCCAGTTGAAGCCCCCGTTCGTATAATAATTACATCACAAGATGTTCTACACTCATG
+AGCTGTTCCCACATTAGGCCTAAAAACAGACGCAATTCCCGGACGCCTAAACCAAACCACTTTCACCGCC
+ACACGACCAGGAGTATACTACGGCCAATGCTCAGAAATCTGTGGAGCAAACCACAGTTTTATACCCATCG
+TCCTAGAATTAATCCCTCTAAAAATCTTTGAAATAGGACCCGTATTCACTCTATAGCACCTTCTCTACCC
+CTCTCCAGAGCTCACTGTAAAGCTAACCTAGCATTAACCTTTTAAGTTAAAGATTAAGAGGACCGACACC
+TCTTTACAGTGAAATGCCCCAACTAAATACCGCCGTATGACCCACCATAATTACCCCCATACTCCTGACA
+CTATTTCTCGTCACCCAACTAAAAATATTAAATTCAAATTACCATCTACCCCCCTCACCAAAACCCATAA
+AAATAAAAAACTACAATAAACCCTGAGAACCAAAATGAACGAAAATCTATTCGCTTCATTCGCTGCCCCC
+ACAATCCTAGGCTTACCCGCCGCAGTACTAATCATTCTATTCCCCCCTCTACTGGTCCCCACTTCTAAAC
+ATCTCATCAACAACCGACTAATTACCACCCAACAATGACTAATTCAACTGACCTCAAAACAAATAATAAC
+TATACACAGCACTAAAGGACGAACCTGATCTCTCATACTAGTATCCTTAATCATTTTTATTACCACAACC
+AATCTTCTTGGGCTTCTACCCCACTCATTCACACCAACCACCCAACTATCTATAAACCTAGCCATGGCTA
+TCCCCCTATGAGCAGGCGCAGTAGTCATAGGCTTTCGCTTTAAGACTAAAAATGCCCTAGCCCACTTCTT
+ACCGCAAGGCACACCTACACCCCTTATCCCCATACTAGTTATCATCGAAACTATTAGCCTACTCATTCAA
+CCAATAGCCTTAGCCGTACGTCTAACCGCTAACATTACTGCAGGCCACCTACTCATGCACCTAATTGGAA
+GCGCCACACTAGCATTATCAACTATCAATCTACCCTATGCACTCATTATCTTCACAATTCTAATCCTACT
+GACTATTCTAGAGATCGCCGTCGCCTTAATCCAAGCCTACGTTTTTACACTTCTAGTGAGCCTCTACCTG
+CACGACAACACATAATGACCCACCAATCACATGCCTACCACATAGTAAAACCCAGCCCATGACCCCTAAC
+AGGGGCCCTCTCGGCCCTCCTAATAACCTCCGGCCTGGCCATATGATTCCACTTCTACTCCACAACACTA
+CTCACACTAGGCTTACTAACTAACACATTGACCATATATCAATGATGACGCGATGTTATACGAGAAGGCA
+CATACCAAGGCCACCACACACCACCCGTCCAAAAAGGTCTCCGATATGGGATAATTCTTTTTATTACCTC
+AGAAGTTTTTTTCTTTGCAGGATTTTTTTGAGCTTTCTACCACTCCAGCCTAGCCCCTACCCCCCAGCTA
+GGAGGACACTGGCCCCCAACAGGTATTACCCCACTAAATCCCCTAGAAGTCCCACTCCTAAACACATCTG
+TATTACTCGCATCAGGAGTATCAATTACTTGAGCCCATCACAGCTTAATAGAAAATAACCGAAACCAAAT
+AATTCAAGCACTGCTTATTACGATTCTACTAGGTCTTTATTTTACCCTCCTACAAGCCTCAGAATATTTC
+GAATCCCCTTTTACCATTTCCGATGGCATCTACGGCTCAACATTCTTTGTAGCCACAGGCTTCCACGGAC
+TCCACGTCATTATTGGATCAACTTTCCTCACTATCTGCCTCATCCGCCAACTAATATTTCACTTCACATC
+CAAACATCACTTCGGCTTTCAAGCCGCCGCCTGATACTGACACTTCGTAGATGTAGTCTGACTATTTCTA
+TATGTCTCTATTTACTGATGAGGATCTTACTCTTTTAGTATAAGTAGTACCGTTAACTTCCAATTAACTA
+GTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGTCCTAATTTTAATAACCAATACCCTTCTAGCCCT
+ACTACTGATAATTATCACATTCTGACTACCACAACTCAACAGCTACATAGAAAAATCTACCCCTTACGAA
+TGTGGCTTCGACCCTATATCCCCCGCCCGCGTCCCCTTCTCCATAAAATTTTTCCTAGTAGCCATCACCT
+TCCTATTATTTGACCTAGAAATTGCCCTCCTATTGCCCTTACCTTGAGCCCTACAAACGGCCAACCTACC
+ACTAATAGTCACATCATCCCTCTTATTAATTACTATCCTAGCCCTAAGCCTCGCCTACGAATGATTACAA
+AAAGGGTTAGACTGAACCGAATTGGTATATAGTTTAAATAAAACGAATGATTTCGACTCATTAAATTATG
+ATAATCATATTTACCAAATGCCCCTTATTTATATAAATATTATACTAGCATTTACCATCTCACTTCTAGG
+AATACTAGTATATCGCTCACACCTAATATCTTCCCTACTATGCCTAGAAGGAATAATACTATCACTGTTC
+ATCATAGCCACCCTCATAACCCTCAATACTCACTCCCTCTTAGCCAATATTGTACCCATCACCATACTAG
+TCTTTGCTGCCTGCGAAGCAGCAGTAGGTCTAGCACTACTAGTTTCAATCTCTAACACATATGGCTTAGA
+CTACGTACATAACCTAAACCTACTCCAATGCTAAAACTAATCATCCCGACAATTATATTACTACCACTAA
+CATGATTCTCTAAAAAACGTATAATTTGAATCAACACAACCACTCACAGCCTAATTATCAGCACCATTCC
+CTTACTATTTTTTAACCAAATTAACAACAACCTATTCAGCTGTTCCCTGCCCTTCTCCTCCGACCCCTTA
+ACAACTCCCCTCCTAATATTAACTGCTTGACTTCTACCCCTCACAATCATAGCAAGCCAGCGCCACCTAT
+CCAACGAACCACTATCACGAAAAAAACTCTACCTCTCCATGCTAATTTCCCTCCAAATCTCCTTAATTAT
+AACATTCTCGGCCACAGAGCTAATTATATTTTATATCTTCTTCGAAACCACACTTATCCCCACCCTGGCT
+ATCATCACCCGATGGGGTAACCAACCAGAACGCCTGAACGCAGGTACATACTTCCTATTCTATACCCTAG
+TAGGCTCCCTCCCCCTACTCATCGCACTAATCTATACCCACAACACCCTAGGCTCACTAAATATCCTATT
+ACTCACTCTTACAACCCAAGAACTATCAAACACCTGAGCCAACAACTTAATATGACTAGCGTACACGATG
+GCTTTCATGGTAAAAATACCCCTTTACGGACTCCACCTATGACTCCCTAAAGCCCATGTCGAAGCCCCTA
+TTGCCGGGTCAATGGTACTTGCTGCAGTACTCTTAAAATTAGGTGGCTATGGCATAATACGCCTCACACT
+CATCCTCAACCCCCTAACAAAACATATAGCCTATCCCTTCCTCATGTTGTCCTTATGAGGTATAATCATA
+ACAAGCTCCATCTGCCTGCGACAAACAGACCTAAAATCGCTCATTGCATACCCTTCAGTCAGCCACATAG
+CCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCGGCGCAATTATCCTCATAATCGC
+CCACGGACTTACATCCTCATTATTATCCTGCCTAGCAAACTCAAATTATGAACGCACCCACAGTCGCATC
+ATAATTCTCTCCCAAGGACTTCAAACTCTACTCCCACTAATAGCCTTTTGATGACTCCTGGCAAGCCTCG
+CTAACCTCGCCCTACCCCCTACCATTAATCTCCTAGGGGAACTCTCCGTGCTAGTAACCTCATTCTCCTG
+ATCAAATACCACTCTCCTACTCACAGGATTCAACATACTAATCACAGCCCTGTACTCCCTCTACATGTTT
+ACCACAACACAATGAGGCTCACTCACCCACCACATTAATAGCATAAAGCCCTCATTCACACGAGAAAACA
+CTCTCATATTTTTACACCTATCCCCCATCCTCCTTCTATCCCTCAATCCTGATATCATCACTGGATTCAC
+CTCCTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAACAGAGGCTCACGACCCCTTATT
+TACCGAGAAAGCTTATAAGAACTGCTAACTCGTATTCCCATGCCTAACAACATGGCTTTCTCAACTTTTA
+AAGGATAACAGTTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAACC
+ATGTATGCTACCATAACCACCTTAGCCCTAACTTCCTTAATTCCCCCCATCCTCGGCGCCCTCATTAACC
+CTAACAAAAAAAACTCATACCCCCATTACGTGAAATCCATTATCGCATCCACCTTTATCATTAGCCTTTT
+CCCCACAACAATATTCATATGCCTAGACCAAGAAACTATTATCTCGAACTGACACTGAGCAACAACCCAA
+ACAACCCAACTCTCCCTGAGCTTTAAACTAGACTATTTCTCCATAACATTTATCCCCGTAGCACTGTTCG
+TTACATGATCCATCATAGAATTCTCACTATGATATATAGACTCAGACCCCAACATCAACCAATTCTTCAA
+ATACTTACTTATCTTCCTAATTACTATACTAATCCTAGTCACCGCTAACAACCTATTCCAACTCTTCATC
+GGCTGAGAAGGCGTAGGAATTATATCCTTTCTACTCATTAGCTGATGGTACGCCCGAACAGATGCCAACA
+CAGCAGCCATCCAAGCAATCCTATATAACCGTATCGGTGATATTGGTTTTGTCCTAGCCCTAGCATGATT
+TCTCCTACACTCCAACTCATGAGATCCACAACAAATAATCCTCCTAAGTACTAATACAGACCTTACTCCA
+CTACTAGGCTTCCTCCTAGCAGCAGCAGGCAAATCAGCTCAACTAGGCCTTCACCCCTGACTCCCCTCAG
+CCATAGAAGGCCCTACCCCTGTTTCAGCCCTACTCCACTCAAGCACCATAGTCGTAGCAGGAATCTTCCT
+ACTCATCCGCTTCTACCCCCTAGCAGAGAATAACCCACTAATCCAAACTCTCACGCTATGCCTAGGCGCT
+ATCACCACCCTATTCGCAGCAGTCTGCGCCCTCACACAAAATGACATCAAAAAAATCGTGGCCTTCTCCA
+CTTCAAGCCAACTAGGACTCATAATAGTTACAATCGGTATCAACCAACCACACCTAGCATTCCTTCACAT
+CTGCACCCACGCTTTCTTCAAAGCCATACTATTCATATGCTCCGGATCCATTATTCACAACCTCAATAAT
+GAGCAAGACATTCGAAAAATAGGAGGATTACTCAAAACCATACCCCTCACTTCAACCTCCCTCACCATTG
+GGAGCCTAGCATTAGCAGGAATACCCTTCCTCACAGGTTTCTACTCCAAAGACCTCATCATCGAAACCGC
+TAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGCCACCTCTCTGACAAGCGCCTAC
+AGCACCCGAATAATCCTCCTCACCCTAACAGGTCAACCTCGCTTCCCAACCCTCACCAACATTAACGAAA
+ACAACCCCACTCTGTTAAATCCCATTAAACGCCTAACCATTGGAAGCTTATTTGCAGGATTTCTCATTAC
+CAACAACATTCTCCCCATATCTACTCCCCAAGTGACAATTCCCCTTTACTTAAAACTTACAGCCCTAGGC
+GTTACTTCCCTAGGACTTCTAACAGCCCTAGACCTCAATTACCTAACCAGCAAGCTCAAAATAAAATCCC
+CACTATATACATTTCACTTCTCTAATATACTCGGATTCTACCCTAACATTATACACCGCTCGATCCCCTA
+TCTAGGCCTTCTTACAAGCCAAAACCTACCCCTACTTCTTCTAGACCTGACCTGACTAGAGAAACTATTA
+CCTAAAACAATTTCACAGTACCAAATCTCCGCTTCCATTACCACCTCAACCCAAAAAGGCATGATCAAAC
+TTTATTTCCTCTCTTTTTTCTTCCCTCTCATCTTAACCTTACTCCTAATCACATAACCTATTCCCCCGAG
+CAATCTCAATCACAATGTATACACCAACAAACAATGTCCAACCAGTAACTACTACTAACCAACGCCCATA
+ATCATATAAGGCCCCCGCACCAATAGGATCCTCCCGAATCAGCCCTGGCCCCTCCCCTTCATAAATTATT
+CAACTTCCCACGCTATTAAAATTTACCACAACCACCATCCCATCATACCCTTTTACCCATAACACTAATC
+CTACCTCCATCGCCAGTCCTACTAAAACACTAACCAAAACCTCAACCCCTGACCCCCATGCCTCAGGATA
+CTCCTCAATAGCCATAGCCGTAGTATACCCAAAAACAACCATTATTCCCCCCAAATAAATTAAAAAAACC
+ATTAAACCTATATAACCTCCCCCATAATTCAAAATGATGGCACACCCAACTACACCACTAACAATCAATA
+CTAAACCCCCATAAATGGGAGAAGGCTTAGAAGAAAACCCCACAAACCCTATCACTAAACTCACACTCAA
+TAAAAATAAAGCATATGTCATTATTCTCGCACGGACTACAACCACGACCAATGATATGAAAAACCATCGT
+TGTATTTCAACTACAAGAACACCAATGACCCCGACACGCAAAATTAACCCACTAATAAAATTAATTAATC
+ACTCATTTATCGACCTCCCCACCCCATCCAACATTTCCGCATGATGGAACTTCGGCTCACTTCTCGGCGC
+CTGCCTAATCCTTCAAATTACCACAGGATTATTCCTAGCTATACACTACTCACCAGACGCCTCAACCGCC
+TTCTCGTCGATCGCCCACATCACCCGAGACGTAAACTATGGTTGGATCATCCGCTACCTCCACGCTAACG
+GCGCCTCAATATTTTTTATCTGCCTCTTCCTACACATCGGCCGAGGTCTATATTACGGCTCATTTCTCTA
+CCTAGAAACCTGAAACATTGGCATTATCCTCTTGCTCACAACCATAGCAACAGCCTTTATGGGCTATGTC
+CTCCCATGAGGCCAAATATCCTTCTGAGGAGCCACAGTAATTACAAACCTACTGTCCGCTATCCCATACA
+TCGGAACAGACCTGGTCCAGTGAGTCTGAGGAGGCTACTCAGTAGACAGCCCTACCCTTACACGATTCTT
+CACCTTCCACTTTATCTTACCCTTCATCATCACAGCCCTAACAACACTTCATCTCCTATTCTTACACGAA
+ACAGGATCAAATAACCCCCTAGGAATCACCTCCCACTCCGACAAAATTACCTTCCACCCCTACTACACAA
+TCAAAGATATCCTTGGCTTATTCCTTTTCCTCCTTATCCTAATGACATTAACACTATTCTCACCAGGCCT
+CCTAGGCGATCCAGACAACTATACCCTAGCTAACCCCCTAAACACCCCACCCCACATTAAACCCGAGTGA
+TACTTTCTATTTGCCTACACAATCCTCCGATCCATCCCCAACAAACTAGGAGGCGTCCTCGCCCTACTAC
+TATCTATCCTAATCCTAACAGCAATCCCTGTCCTCCACACATCCAAACAACAAAGCATAATATTTCGCCC
+ACTAAGCCAACTGCTTTACTGACTCCTAGCCACAGACCTCCTCATCCTAACCTGAATCGGAGGACAACCA
+GTAAGCTACCCCTTCATCACCATCGGACAAATAGCATCCGTATTATACTTCACAACAATCCTAATCCTAA
+TACCAATCGCCTCTCTAATCGAAAACAAAATACTTGAATGAACCTGCCCTTGTAGTATAAACTAATACAC
+CGGTCTTGTAAACCGGAAACGAAAACTTTCTTCCAAGGACAAATCAGAGAAAAAGTAATTAACTTCACCA
+TCAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAAGCAAATTTAGGTAC
+CACCTAAGTACTGGCTCATTCATTACAACCGCTATGTATTTCGTACATTACTGCCAGCCACCATGAATAT
+CGTACAGTACCATATCACCCAACTACCTATAGTACATAAAATCCACTCCCACATCAAAACCTTCACTCCA
+TGCTTACAAGCACGCACAACAATCAACTCCCAACTGTCGAACATAAAACACAATTCCAACGACACCCCTC
+CCCCACCCCGATACCAACAGACCTATCTCCCCTTGACAGAACATAGTACATACAACCATACACCGTACAT
+AGCACATTACAGTCAAACCCCTCCTCGCCCCCACGGATGCTCCCCCTCAGATAGGAATCCCTTGGTCACC
+ATCCTCCGTGAAATCAATATCCCGCACAAGAGTGACTCTCCTCGCTCCGGGCCCATAACATCTGGGGGTA
+GCTAAAGTGAACTGTATCCGACATCTGGTTCCTACCTCAGGGCCATGAAGTTCAAAAGACTCCCACACGT
+TCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCAGTCACGGGAGCCTTCCA
+TGCATTTGGTATTTTCGTCTGGGGGGTGTGCACGCGATAGCATTGCGAAACGCTGGCCCCGGAGCACCCT
+ATGTCGCAGTATCTGTCTTTGATTCCTGCCCCATTGTATTATTTATCGCACCTACGTTCAATATTACGAC
+CTAGCATACCTACTAAAGTGTGTTGATTAATTAATGCTTGCAGGACATAACAACAGCAGCAAAATGCTCA
+CATAACTGCTTTCCACACCAACATCATAACAAAAAATTCCCACAAACCCCCCCTTCCCCCCGGCCACAGC
+ACTCAAACAAATCTCTGCCAAACCCCAAAAACAAAGAACCCAGACGCCAGCCTAGCCAGACTTCAAATTT
+CATCTTTAGGCGGTATGCACTTTTAACAGTCACCCCTCAATTAACATGCCCTCCCCCCTCAACTCCCATT
+CTACTAGCCCCAGCAACGTAACCCCCTACTCACCCTACTCAACACATATACCGCTGCTAACCCCATACCC
+TGAACCAACCAAACCCCAAAGACACCCCTACACA
+
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/gorilla.fasta b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/gorilla.fasta
new file mode 100644
index 0000000..2984f3d
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/gorilla.fasta
@@ -0,0 +1,237 @@
+>gi|195952353|ref|NC_011120.1| Gorilla gorilla gorilla mitochondrion, complete genome
+GTTTATGTAGCTTACCTCCCCAAAGCAATACACTGAAAATGTTTCGACGGGCTCACATCACCCCATAAAC
+AAATAGGTTTGGTCCTAGCCTTTCTATTAACTCTTAGTAGGATTACACATGCAAGCATCCCCGCCCCAGT
+GAGTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGTACGCAGAAATGCAGCTCAAAACGC
+TTAGCCTAGCCACACCCCCACGGGAGACAGCAGTGATAAACCTTTAGCAATAAACGAAAGTTTAACTAAG
+CCATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGATTAACCCAAGCCAATAG
+AAATCGGCGTAAAGAGTGTTTTAGATCAATCCCCCAATAAAGCTAAAATTCACCTGAGTTGTAAAAAACT
+CCAGCTGATATAAAATAAACTACGAAAGTGGCTTTAATATATCTGAACACACAATAGCTAGGACCCAAAC
+TGGGATTAGATACCCCACTATGCCTAGCCCTAAACTTCAACAGTTAAATTAACAAGACTGCTCGCCAGAA
+CACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCACATCCTTCTAGAGGAGCCTGTTCT
+GTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCC
+TGACGAAGGCCACAAAGTAAGCACAAGTACCCACGTAAAGACGTTAGGTCAAGGTGTAGCCCATGAGGTG
+GCAAGAAATGGGCTACATTTTCTACTTCAGAAAACTACGATAACCCTTATGAAACCTAAGGGTAGAAGGT
+GGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGCCCGT
+CACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCATCTATATAGAGGAGATAAGT
+CGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTAGCTTAACACAAAGCACCCAA
+CTTACACTTAGGAGATTTCAACTCAACTTGACCGCTCTGAGCAAAACCTAGCCCCAAACCCACCCCACAT
+TACTACCAAACAACTTTAATCAAACCATTTACCCAAATAAAGTATAGGCGATAGAAATTGTAAATCGGCG
+CAATAGATATAGTACCGCAAGGGAAAGATGAAAAAATATAACCAAGCACGACACAGCAAGGACTAACCCC
+TGTACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAAGAGAACCGAAGCTAAGACCCCCGAAAC
+CAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATTCCTA
+GGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGACAGAATCTTAGTTCAACTTTA
+AATTTACCCACAGAACCCTCCAAATCCTCCCGTAAATTTAACTGTTAGTCCAAAGAGGAACAGCTCTTTG
+GACACTAGGAAAAAACCTTGTAAAGTGAGTAAAAAATTTAATACCCATAGTAGGCCTAAAAGCAGCCACC
+AATTAAGAAAGCGTTCAAGCTCAACACTATCACCCAAAATAATCCCAAACACACAACTGAACTCCTTACA
+CCCAATTGGACCAATCTATTACCCTATAGAAGAGCTAATGTTAGTATAAGTAACATGAAACCATTCTCCT
+CCGCATAAGCCTACATCAGACCAAAATATTAAACTGACAATTAACAACTCAATATCTACAACTAACCAAC
+AAGCCATTATTACCCCCACTGTTAACCCAACACAGGCATGCCCACAAGGAAAGGTTGAAAAAAGTAAAAG
+GAACTCGGCAAATTTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATTACCAGTATTAGAGGCAC
+CGCCTGCCCAGTGACATGCGTTCAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTT
+CCTTAAATAGGGACTCGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCTTACTTTTGACCAGTGAAATT
+GACCTGCCCGTGAAGAGGCGGACATAACATAACAAGACGAGAAGACCCTATGGAGCTTTAATTCATTAAT
+GCAAGCAACACTTAATAAACCCACAGGTCCTAAACTATTAAACCTGCATTAAAAATTTCGGTTGGGGCGA
+CCTCGGAGTATAATCTAACCTCCGAACAACACATGCCAAGACTTCACCAGTCAAAGCGAGCTACCACATT
+TAATTGATCCAATGACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTATTCTAGAGT
+CCATATCAACAGTAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATCAAA
+GGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCT
+ATCTATTCCGTGTTTCTCCCTGTACGAAAGGACAAGAGAAACGAGGCCTACTTCACAAAGCGCCTCCCCC
+CGTAAATGATACTATCTCAATTTAATATAGCGCCCACATCTACTCAAGAATAGGGTTTGTTAAGATGGCA
+GAGCCCGGTAATCGCATAAAACTTAAAACTTTATAGTCAGAGGTTCAATTCCTCTTCTTAACAATATATC
+CATGGCTAACCTTCTACTCCTCATTGTACCTATCCTAATCGCCATAGCATTCCTAATGCTAACCGAACGA
+AAAATTTTAGGCTATATACAACTACGTAAAGGCCCCAATGTCGTAGGCCCCTACGGGCTACTACAACCCT
+TCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCTCAACATCAACCATCACCCTCTACAT
+CACCGCCCCAACCTTAGCTCTCACTATTGCCCTACTATTATGAACCCCCCTCCCCATACCCAACCCCCTA
+GTCAATCTCAATCTAGGCCTCCTATTCATCCTAGCCACCTCTAGCCTAGCCGTTTACTCCATCCTCTGAT
+CAGGATGGGCATCAAACTCAAACTACGCTTTGATCGGCGCACTACGAGCAGTAGCCCAAACAATCTCATA
+TGAAGTCACCCTGGCCATTATCCTACTATCAACACTACTGATGAATGGTTCCTTTAACCTCTCCACCCTC
+ATCATAACACAAGAACACCTCTGACTGCTCCTACCAACATGACCCTTAGCTATAATATGATTTATCTCCA
+CATTAGCAGAAACCAACCGGACCCCCTTCGACCTCGCTGAAGGAGAATCTGAACTAGTCTCAGGCTTTAA
+TATCGAATATGCCGCAGGCCCCTTCGCCCTATTTTTCATAGCCGAGTACATAAATATTATCATAATAAAT
+ACTCTCACCACTATAATTTTCCTAGGAACAACATACAATGCCCACTCCCCTGAACTCTACACAGTATGCT
+TCATCACCAAAACCCTGCTTCTAACCTCTCTATTCCTATGAATTCGAACTGCATATCCTCGATTTCGCTA
+CGACCAACTCATGCACCTCCTGTGAAAAAACTTCTTGCCACTCACCCTGGCACTACTCATATGATATATT
+TCCATACCTACCACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATATGTCTGATAAAAGAGTTACTTT
+GATAGAGTAAATAATAGAGGTTTAAACCCCCTTATTTCTAGGACTATGAGAATTGAACCCATCCCTGAGA
+ATCCAAAATTCTCCGTGCCACCTGTCACACCCCATCCTAAAGTAAGGTCAGCTAAATAAGCTATCGGGCC
+CATACCCCGAAAATGTTGGTCACATCCTTCCCGTACTAATTAACCCCCTGGCCCAACCCATCATCTACTC
+TACCATCTTCGCAGGCACTCTTATTACAGCACTAAGCTCCCACTGATTTTTTGCCTGAGTAGGCCTAGAA
+ATAAACATACTAGCTTTTATCCCAGTCCTAACCAAAAAAATAAATCCCCGCTCCACAGAAGCCGCCATCA
+AATATTTCCTCACACAAGCAACTGCATCCATAATCCTCCTAATAGCCATCCTCTCCAACAACATACTCTC
+CGGACAATGAACCACAACCAATGCCACTAATCAATACTCATCATTAATGATCGTAGTAGCTATAGCTATA
+AAACTAGGAATAGCCCCCTTTCACTTCTGAGTGCCAGAAGTTACCCAAGGCACCCCCCTAATGTCTGGCC
+TACTCCTCCTCACATGACAAAAACTAGCCCCTATGTCAATCATATACCAAATTTCCTCGTCAACAAATGT
+AAGCCTTCTCCTCACTCTTTCAATCCTATCCATCCTAGCAGGCAGCTGAGGCGGACTAAACCAAACTCAA
+CTACGCAAGATTCTAGCATACTCCTCAATCACCCATGTAGGATGAATAATAGCAGTTCTACCATATAACC
+CTAATATAACTATTCTTAATCTGACCATTTATATTATCCTCACTACTACCACATTCCTATTACTCAACCT
+AAACTCCAGCACCACAACCTTACTACTATCTCGTACTTGAAATAAACTGACATGATTAACACCTCTAATC
+CCCTCCACCCTCCTCTCCCTAGGAGGTCTACCCCCACTAACCGGCTTCCTACCCAAATGGCTTATTATCG
+AAGAATTCACAAAAAATAACGACCTCATTACCCCCACCATTATGGCCATCATCACCCTCCTCAACCTCTA
+TTTCTATCTACGCCTAATTTACTCCACCTCTATCACACTACTACCCATATCCAACAACGTAAAGATAAAA
+TGACAGCTCGAATATACTAAACCCACTCCCTTCCTCCCTACACTTATCACACTCACCACCCTACTTCTAC
+CCATCTCCCCCTTCATACTAATAGTTCTATAGAAATTTAGGTTAAACATAGACCAAGAGCCTTCAAAGCC
+CTTAGTAAGTTACAACACTTAATTTCTGCAACAACTAAGGACTGCAAAACCCTACTCTGCATCAACTGAA
+CGCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGATCAATGGGACTCAAACCCACAAACATTTAG
+TTAACAGCTAAACACCCTAGTCAACTGGCTTCAATCTACTTCTCCCGCCGCAAGAAAAAAAGGCGGGAGA
+AGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGAAATTCACCTCGGAGCTGGTAA
+AAAGAGGCCCAGCCTCTGTCTTTAGATTTACAGTCCAATGCCTTACTCAGCCATTTTACCTCCTTTTTTC
+CACTGATGTTCACCGACCGCTGATTATTCTCTACAAACCATAAAGATATTGGAACACTATATCTACTATT
+CGGCGCATGAGCTGGAGTCCTAGGCACAGCCCTAAGTCTCCTTATTCGAGCAGAACTTGGTCAACCAGGC
+AACCTTCTAGGTAACGATCACATCTATAATGTTATCGTCACAGCCCATGCGTTCGTAATAATTTTCTTCA
+TAGTAATGCCTATCATAATCGGAGGCTTTGGCAACTGGCTAGTACCCTTAATAATTGGTGCCCCCGACAT
+GGCATTCCCCCGCATAAACAACATAAGCTTCTGACTCCTTCCCCCTTCTTTCCTACTTCTGCTCGCATCC
+GCTATAGTAGAAGCCGGCGCAGGGACTGGTTGGACAGTCTACCCTCCCTTAGCAGGAAATTATTCCCACC
+CCGGAGCTTCTGTAGACCTAACCATTTTTTCCCTACACCTAGCAGGCATCTCCTCTATTCTAGGGGCCAT
+CAACTTCATTACAACAATCATCAATATAAAACCCCCCGCCATAACCCAATACCAAACACCCCTTTTCGTC
+TGATCCGTCCTAATCACAGCAGTCTTACTTCTTCTATCTCTCCCAGTACTAGCTGCTGGAATTACCATAT
+TATTAACAGACCGTAACCTCAACACCACCTTTTTCGACCCAGCCGGAGGAGGAGATCCTATCCTATACCA
+ACACTTATTCTGATTTTTTGGACACCCCGAAGTTTACATTCTAATCCTACCAGGCTTCGGAATAATCTCC
+CACATTGTAACTTATTACTCCGGAAAAAAAGAACCATTCGGATATATAGGTATAGTCTGAGCTATAATAT
+CAATTGGTTTCCTGGGATTTATTGTGTGAGCCCACCACATATTTACAGTAGGAATAGACGTAGATACACG
+AGCCTACTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAAGTATTCAGCTGACTCGCT
+ACACTCCATGGAAGTAATACCAAATGATCTGCCGCAATGCTCTGAGCCCTAGGGTTCATTTTTCTCTTCA
+CTGTAGGCGGCCTAACCGGCATCGTACTAGCAAACTCGTCATTAGATATCGTGCTGCACGACACATATTA
+CGTCGTAGCTCACTTCCACTATGTCCTATCTATAGGAGCTGTGTTCGCCATCATAGGGGGCTTTATTCAC
+TGATTTCCCCTATTCTCAGGCTACACTCTAGATCAAACCTACGCCAAAATCCACTTTGCCATCATATTCA
+TTGGCGTTAATCTAACCTTCTTCCCACAACACTTTCTTGGCCTATCTGGAATACCCCGACGTTACTCGGA
+CTACCCCGATGCATATACTACATGAAATATCCTGTCATCCGTGGGCTCATTCATTTCCCTAACAGCAGTA
+ATATTAATAATTTTTATAATCTGAGAAGCCTTCGCCTCAAAACGAAAAGTCCTAATAATCGAAGAACCCT
+CCACAAATCTGGAGTGACTGTATGGATGCCCTCCACCCTATCATACATTTGAAGAGCCTGTATATATAAA
+GTCTAAACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAACCCCATGACCTTCATGAC
+TTTTTCAAAAAAGATATTAGAAAAACTATTTCATAACTTTGTCAAGGTTAAATTACGGGTTAAACCCCGT
+ATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCTCCTATCATAGAAGAACTAAT
+CATCTTTCATGATCATGCCCTCATAATCATTTTTCTCATCTGCTTCTTAGTCCTGTACGCCCTTTTCCTA
+ACACTCACAACAAAGCTAACTAACACCAACATCTCAGACGCCCAAGAAATAGAGACCATCTGAACCATCC
+TGCCCGCTATTATCTTAGTCCTGATCGCCCTCCCATCTCTACGAATCCTCTATATGACAGATGAAATCAA
+TGACCCCTCCTTCACTATCAAATCAATCGGTCACCAATGATACTGAACCTACGAATACACCGACTATGGT
+GGATTGATCTTTAACTCCTATATACTCCCCCCATTATTCCTAGAACCAGGTGACCTACGACTCCTTGACG
+TCGACAACCGAGTAGTCCTTCCAGTTGAAGCCCCCGTCCGTATAATAATTACATCCCAAGATGTCTTACA
+CTCATGAGCTGTTCCCACATTAGGCCTAAAAACAGACGCAATCCCCGGACGCCTGAACCAAACCACATTC
+ACCGCCACACGACCAGGAGTATACTACGGCCAGTGCTCAGAAATCTGTGGAGCCAACCACAGTTTTATGC
+CCATTGTTCTAGAGCTAATCCCCCTAAAAATCTTTGAAATAGGACCCGTATTCGCCCTATAATACCCCTC
+TCATCCCTCTCCAGAGCTCACTGTAAAGCTAACCTAGCGTTAACCTTTTAAGTTAAAGATTAAGAGTATC
+GGCACCTCTTTGCAGTGAAATGCCCCAGCTAAATACCACCGTATGGCCCACCATAATTGCCCCAATACTC
+CTCACACTATTTCTCATTACCCAACTAAAAGTTTTAAACACAAATTACCACCTACCCCCCTTACCAAAAA
+CTATAAAAATAAAAAACTTCTGTAAACCCTGAGAATCAAAATGAACGAAAATTTATTCGCTTCATTCATT
+GCCCCCACAATCCTAGGCTTACCCGCCGCAGTATTAATTATCCTACTTCCCCCTCTACTGATCCCCACCT
+CCAAATATCTCATCAACAACCGACTGATTGCCACCCAACAGTGACTAATCCAACTAACCTCAAAACAAAT
+AATAACTATACATAACGCCAAGGGACGAACCTGATCCCTTATGTTAATATCATTAATTATTTTTATTGCC
+ACAACCAACCTCCTCGGACTCTTGCCCCACTCATTCACACCAACTACCCAGCTATCTATAAACCTGGCCA
+TAGCCATCCCCCTGTGAGCAGGCGCAGTAACTACAGGCTTTCGCTCCAAGACTAAAAATGCCCTAGCCCA
+CCTACTACCACAAGGCACCCCTACACCCCTTATCCCTATACTAGTCATCATTGAAACCATCAGCCTATTC
+ATCCAACCAATAGCCCTAGCTGTACGCCTAACCGCTAACATCACTGCAGGTCACCTACTTATGCACCTAA
+TCGGAAGCGCCACACTAGCAATATCAACTACCAATCTTCCCTCAACACTCATTATCTTTACAGTCCTAAT
+TTTATTAACTATACTAGAAATCGCTGTCGCCCTCATCCAAGCCTACGTTTTCACACTTTTAGTGAGCCTC
+TACCTGCACGAGAACACATAATGATCCACCAATCACATGCCTATCACATAGTAAAACCCAGCCCATGACC
+CCTAACAGGGGCCCTCTCAGCCCTCCTAATAACCTCAGGCTTAGCCATATGATTCCACTTCCACTCTACA
+ACCCTACTCATACTAGGCCTACTAACCAACATACTAACTATATACCAATGATGACGCGATGTAATGCGAG
+AAAGCACGTACCAAGGCCACCATACACTACCCGTCCAAAAAGGCTTACGATATGGAATGATCCTATTTAT
+TACCTCAGAAGTCTTTTTCTTCGCAGGATTCTTCTGAGCTTTCTACCACTCCAGCCTAGCCCCTACCCCT
+CAACTAGGAGCACACTGACCCCCAACAGGCATCACCCCACTAAACCCCCTAGAAGTTCCACTTTTAAATA
+CATCTGTACTGCTCGCATCAGGTGTCTCAATTACCTGAGCCCACCATAGCCTAATAGAAAATAACCGTAA
+CCAAATAATTCAAGCACTACTTATCACAATTTTACTGGGCCTCTACTTCACCCTCCTACAAGCCTCAGAG
+TACTTTGAAGCCCCCTTTACCATTTCCGATGGTATCTATGGCTCAACATTTTTTGTAGCCACAGGCTTTC
+ACGGGCTCCACGTCATCATTGGGTCAACTTTCCTCACTATCTGTCTTATTCGCCAACTAATATTTCACTT
+TACATCCAAACACCACTTCGGCTTCGAAGCCGCCGCTTGGTATTGGCACTTCGTAGATGTAGTCTGACTA
+TTCCTATACGTCTCCATCTACTGATGAGGGTCCTACTCTTTTAGTATAATTAGTACCGTTAACTTCCAAT
+TAACCAGTTTTGGTAGTACCCAAAAAAGAGTAATAAACTTCGCCCTGATCTTAATAACCAACACCCTTCT
+AGCCCTACTACTAATAATTATTACATTTTGACTACCGCAACTTAACAGCTACATAGAAAAAACTAACCCC
+TACGAATGTGGTTTCGACCCCGTATCCCCCGCCCGCATTCCTTTCTCCATAAAATTTTTCCTAGTAGCCA
+TCACTTTCTTACTATTTGACCTAGAAATTGCCCTCCTATTGCCCCTGCCATGAGCCCTACAAACAACCAA
+CTTACCACTAATAGTCATGTCATCCCTCTTATTAATCATTATCCTAACCCTAAGTCTAGCCTACGAATGA
+CTGCAAAAAGGACTAGACTGGGCCGAATTGGTACATAGTTTAAACAAAACGAATGATTTCGACTCATTAA
+ATTATGATAATCATATTTACCAAATGCCCCTTATTTACATAAACATTATACTAGCATTTACCATCTCACT
+TCTAGGAATACTAGTGTACCGTTCACACCTAATATCCTCCCTACTATGCCTAGAAGGAATAATATTATCA
+CTATTCATTATAGCTACTCTCATAACCCTCAATACCCACTCCCTCTTAGCTAACATTGTACCCATCACCA
+TACTAGTCTTTGCTGCCTGTGAAGCAGCAGTAGGCCTAGCCCTACTAGTCTCAATCTCTAACACATATGG
+TCTTGACTATGTACAAAACCTAAACCTGCTCCAATGCTAAAACTAATCGCCCCAACAATTATACTACTAC
+CGCTAACATGACTCTCCAAAAAACATATAATTTGAATCAACACAACCACCCACAGCCTAATCATCAGCAT
+CATTCCCCTACTATTTTTTAACCAAATTAATAACAATCTATTTAGTTACTCCCTATCTTTTTCCTCCGAC
+CCCCTAACGACCCCCCTCCTAATATTAACTACCTGACTCCTACCCCTTACAATTATAGCAAGCCAGCGCC
+ACCTATCCAACGAACCACTATCACGAAAAAAACTCTATCTCTCCATATTAATCTCCCTCCAAATCTCCTT
+AATCATAACATTCACAGCCACAGAACTAATCATATTCTATATCTTCTTCGAAGCCACACTTATCCCTACC
+CTAGTTATCATCACCCGATGAGGCAACCAACCCGAACGTCTAAATGTGGGTACATACTTCCTATTCTACA
+CCCTAGTAGGATCCCTCCCCCTACTTATCGCATTAATCCACACTCACAACACCTTAGGCTCACTAAATAT
+TCTATTACTCACCCTTACTGCCCAAGAACTACCTAACTCCTGAGCCAACAACTTAATATGACTGGCATAC
+ACAATAGCCTTTATAGTGAAAATACCACTTTACGGGCTCCACCTATGACTCCCTAAGGCCCATGTTGAGG
+CCCCCATTGCTGGCTCAATGATGCTCGCTGCAGTACTCTTAAAATTAGGTGGCTATGGCATAATACGCCT
+CATACTCATTCTTAACCCCCTAACAAAACACATAGCTTATCCTTTCCTCGCCCTATCTTTATGGGGTATA
+ATCATAACAAGCTCCATCTGCCTACGGCAAACAGATTTAAAATCGCTCATTGCATACTCCTCAATTAGCC
+ACATAGCCCTTGTAGTAGCAGCTATCCTTATCCAAACCCCCTGAAGCTTCACCGGCGCAGTTGTTCTTAT
+AATTGCCCACGGACTTACATCATCATTATTATTCTGCCTAGCAAACTCAAACTACGAACGAACCCACAGC
+CGCATCATAATTCTCTCTCAAGGACTCCAAACCCTACTCCCACTAATAGCCCTTTGATGACTTCTGGCAA
+GCCTCGCCAACCTCGCCTTACCCCCCACCATTAACCTACTAGGAGAGCTCTCCGTACTAGTAACCACATT
+CTCCTGATCAAACACCACCCTTTTACTTACAGGATCTAACATACTAATTACAGCCCTGTACTCCCTTTAT
+ATATTTACCACAACACAATGAGGCCCACTCACACACCACATCACCAACATAAAACCCTCATTTACACGAG
+AAAACATCCTCATATTCATGCACCTATCCCCCATCCTCCTCCTATCCCTCAACCCCGATATTATCACCGG
+GTTTACCTCCTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGATAACAGAGGCTCACAACCC
+CTTATTTACCGAGAAAGCTCGTAAGAGCTGCTAACTCATACCCCCGTGCTTAACAACATGGCTTTCTCAA
+CTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGACCCAAAAATTTTGGTGCAACTCCAAATAAAAGTA
+ATAACTATGTACGCTACCATAACCACCTTAGCCCTAACTTCCTTAATTCCCCCTATCCTTACCACCTTCA
+TCAATCCTAACAAAAAAAGCTCATACCCCCATTACGTAAAATCTATCGTCGCATCCACCTTTATCATCAG
+CCTCTTCCCCACAACAATATTTCTATGCCTAGACCAAGAAGCTATTATCTCAAGCTGACACTGAGCAACA
+ACCCAAACAATTCAACTCTCCCTAAGCTTTAAACTAGATTATTTCTCCATAATATTTATCCCTGTAGCAC
+TGTTCGTCACATGATCCATCATAGAATTCTCACTATGATATATAAACTCAGACCCCAACATCAACCAATT
+CTTCAAATACCTGCTCATCTTTCTAATCACCATACTAATCCTAGTCACCGCTAATAACCTATTTCAACTC
+TTTATCGGCTGAGAAGGTGTGGGAATTATATCTTTTCTACTCATTGGCTGATGATACGCTCGAACAGACG
+CCAACACAGCAGCCGTCCAAGCAATTCTATACAACCGTATTGGTGACATTGGTTTTATCCTAGCCCTAGC
+ATGATTTCTCCTACACTCCAACTCATGAGACCCACAACAAATATCTCTCTTAAACACCAACCCTAACCTT
+ATCCCACTTCTAGGTTTCCTCCTAGCAGCAGCAGGCAAATCAGCTCAACTAGGCCTCCACCCCTGACTCC
+CCTCGGCCATAGAAGGCCCCACCCCCGTCTCAGCCCTACTCCACTCAAGCACTATAGTCGTGGCAGGGGT
+CTTCCTACTCATCCGCTTCCACCATTTAGCAGAAAACAACTCACTAGCTCAAACTCTTACACTATGCCTA
+GGCGCTATTACTACTCTATTCGCGGCAGTCTGCGCTCTCACACAAAATGATATCAAAAAAATCGTAGCCT
+TCTCCACTTCAAGCCAATTAGGGCTCATAGTAGCCACAATTGGCATCGGCCAGCCACACCTAGCATTTCT
+ACACATCTGCACCCACGCTTTCTTTAAAGCCATATTATTTATATGCTCCGGATCCATTATCCATAACCTC
+AATAATGAGCAAGACATCCGAAAAATAGGAGGTCTGCTCAAAGCCATACCCCTTACTTCAACCTCCCTAG
+CCATTGGCAGCCTAGCACTAATAGGAATACCTTTTCTCACAGGCTTCTACTCCAAAGACCTCATCATCGA
+AACCGCAAACATATCACACACAAACGCCTGGGCCCTATCAATCATTCTCATCGCCACCTCTCTTACGAGT
+GCCTACAGCACTCGAATAATCCTTCTCACCCTGACAGGTCAACCTCGCTTCCCAACCTTTGCCAACATCA
+ACGAAAACTACTCCACCCTATTAAATCCTATTAAACGCCTAACAATTGGAAGCCTGTTTGCAGGATTCTT
+CATCACCAACAACATCCTCCCTACATCCGTACCCCAAATGACAATCCCACTTTACTTAAAACTTACAGCT
+CTAAGCATCACTCTCCTAGGACTCCTGACAGCCCTAGACCTCAACTACCTCACCAACAAACTCAAAATAA
+AACACCCACCACACACATTTTACTTCTCCAACATACTCGGATTTTACCCCAACATCACACACCGCACAAT
+TCCCTATCTAGGCCTTCTCATAAGTCAAAATCTACCCCTACTCCTCCTAGACCTAATTTGACTAGAAAAA
+CTACTACCCAAAACAATCTCACAGCATCAAATCTCAGCCTCCATCACTACCTCTACCCAAAAAGGTTTGA
+TCAAACTGTACTTCCTCTCTTTCTTCTTCCCTCTTCTTATAATCCTTCTTCTAATCACATAACCTATTAC
+CCCGAGCAATCTCAATCACAATATATACACCAACAAATAACGTCCAACCAGTAACCACTACCAATCAACG
+CCCATAATCATACAAAGCCCCCGCACCAATAGGATCTTCCCGAATTAACCCTGAACCTTCCCCCTCATAA
+ATCATTCAACTCCCCACATTATTAAAATTCACCACAACCACCACCCCATCATACTCTTTCACCCACAACA
+CCAGCCCCACCTCCATTGCTAACCCCACTAAGACACTCACCAAGACCTCAACCCCAGACCCCCACGCCTC
+AGGATACTCCTCAATGGCCATCGCTGTAGTATACCCAAAAACAACCATCATCCCCCCTAAATAAATTAAA
+AAAACCATTAAACCCATATAACCTCCCCCACAGTTTAAAATAATAGCACACCCAACCACACCACTAACAA
+TCAACACTAAACCCCCATAAATAGGAGAAGGCTTAGAGGAAAACCCCACAAACCCCATTACTAAACCCAC
+ACTCAATAAAAATAAAACATATGTCATCATTCTCGCACGGACCACGACCGCGACTAATGATATGAAAAAC
+CATCGTTGTACTTCAACTACAAGAACATCAATGACCCCTATACGCAAAACTAACCCACTAGCAAAACTAA
+TTAACCACTCATTCATTGACCTCCCTACCCCGTCCAACATCTCCACATGATGAAACTTCGGCTCACTCCT
+TGGTGCCTGCTTAATCCTTCAAATCACCACAGGGCTATTCCTAGCCATACACTACTCACCTGATGCCTCA
+ACCGCCTTCTCATCAATCGCCCACATCACCCGAGATGTAAACTATGGCTGAACCATCCGCTACCTCCACG
+CTAACGGCGCCTCAATATTCTTCATTTGCCTCTTTCTACACATCGGCCGGGGCCTATACTACGGCTCATT
+TCTCCACCAAGAAACCTGAAACATCGGCATCATCCTCCTACTCACAACCATAGCAACAGCCTTCATAGGC
+TATGTCCTCCCATGAGGCCAAATATCCTTCTGAGGGGCCACAGTAATCACAAACTTGCTATCCGCCATCC
+CGTACATCGGAACAGATCTAGTCCAATGAGTTTGAGGTGGTTACTCAGTAGATAGCCCTACCCTTACACG
+ATTCTTTACCTTCCACTTTATCCTACCCTTCATTATCACAGCCCTAACAACCCTCCATCTCCTATTTCTA
+CACGAAACAGGATCAAACAACCCTCTAGGCATCCCCTCCCACTCTGACAAAATCACCTTCCACCCCTACT
+ACACAATCAAAGACATCCTAGGCCTATTCTTCTTTCTCCTGACCTTGATAACATTAACACTATTCTCACC
+AGACCTCCTAGGAGACCCAGACAACTACACTTTAGCCAACCCCCTAAACACCCCACCCCACATCAAACCC
+GAATGATATTTCCTATTTGCCTACGCAATTCTCCGATCTGTCCCCAATAAACTAGGAGGCGTCTTAGCTC
+TATTACTATCCATTCTCATCCTAACAATAATTCCTATTCTCCACATATCCAAACAACAAAGCATAATATT
+CCGCCCATTAAGCCAACTACTCTACTGATTCCTAATCGCAAACCTCTTCACCCTAACCTGAATCGGAGGA
+CAACCAGTAAGCTACCCCTTCATTACCATTGGGCAAGTAGCATCCGTACTATACTTCACGACAATCCTAT
+TCCTGATACCAATCACATCCCTGATCGAAAACAAAATACTCAAATGAACCTGCCCTTGTAGTACAGACCA
+ATACACCAGTCTTGTAAACCGGAAACGAAGACCTCCTTCCAAGGGCATATTCAGAGAAAAAGTCCTCGAC
+TCCACCATCAGCACCCAAAGCTAATATTCTAATTTAAACTATTCTCTGTTCTTTCATGGGGAGACAAATT
+TGGGTACCACCCAAGTATTGGCTAACCCATCAATAATTATCATGTATGTCGTGCATTCCTGCCAGACACC
+ATGAATAATGCACAGCACCACAAATGTCCGATCACCTGTAACACATACAACCCCCCCCTTCCCCCCCCCC
+TCCTCCACCCAATGGAATATCAACTAATCCATTCCTCATAAAAAGTACATAGCACATAAAGTCATTTATC
+GTACATAGCACATTCTAGTTAAATCATCCTTGCCCCCACGGATGCCCCCCCTCAGATAGGAGTCCCTTGA
+ACACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCGCTCCGGGCCCATAACACTT
+GGGGGTAGCTAAAAATGAACTGTATCCGGCATCTGGTTCCTACTTCAGGGTCATAACACCTAAAGCGCTT
+CACACGTTCCCCTTAAATAAGACATCACGATGGATCACAGGTCTATCACCCTATTAACCACTCACGGGAG
+CTCTCCATGCATTTGGTATTTTTCGTCGGGGGGTGTGCACGCGATAGCATTGCGAAACGCTGGAACCGGA
+GCACCACATGTCGCAGTATCTGTCTTTGATTCCTGCCCCATACCATTATTTATCGCACCTACGTTCAATA
+TTACAGCCGAGCGCAGCATGCTTTATGATGCTAATTAATTCATGCTTGTTGGACATAAAACAACCAGGTG
+GACGTGAACACAACCACCCTTCACACCAAAAACACAACAAAAAATCCAACCCCCCCCCCCCCGCACTAGA
+AAACAGCCCCACCAAACTCCAAATTTCATCTTTTGGCGGTATGCATTTTTAACAGTTACCCCTCAACTAA
+CATAGCACCAACCCCACCAGTACAACCCCACCCGCCCTAGCAACACACACTGCTGCTGATCCTATACCCC
+GAATTAACCAAACCCCAAAGACACCCCACACA
+
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/rCRS.fasta b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/rCRS.fasta
new file mode 100644
index 0000000..a2d2531
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/rCRS.fasta
@@ -0,0 +1,239 @@
+>NC_012920_1
+GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG
+GTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTC
+CTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTA
+ATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATC
+ATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA
+AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCGGTATGCAC
+TTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATA
+CAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCC
+AAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTC
+ACATCACCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA
+GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGC
+AATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAA
+ACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGA
+TTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACT
+CACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACAC
+ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATC
+AACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATC
+CCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATA
+CCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTC
+AAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT
+GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGA
+AGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCA
+TTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTA
+GCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGAGCTAAACCTA
+GCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG
+ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATA
+ATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCC
+AAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCA
+AAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGAT
+AGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC
+CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAG
+TAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAAC
+ATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAG
+TAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCC
+AATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA
+AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGC
+ATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAA
+AGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCT
+TACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTA
+TGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT
+AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAG
+TCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACA
+GCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCG
+ATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGG
+AGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT
+ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGA
+ACAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAAT
+TCCTCTTCTTAACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCA
+TTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATACAACTACGCAAAGGCCCCAACGTTGTAGGCC
+CCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC
+ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCC
+CTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAG
+CCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGC
+AGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGC
+TCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG
+CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTC
+CGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATAC
+ACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGACGCACTCTCCC
+CTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGAAC
+AGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA
+GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATAT
+GTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGCTTAAACCCCCTTATTTCTAGGACTATGA
+GAATCGAACCCATCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTC
+AGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCT
+GGCCCAACCCGTCATCTACTCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT
+TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCTC
+GTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTAT
+CCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTCATCATTAATA
+ATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCTGAGTCCCAGAGGTTACCCAAG
+GCACCCCTCTGACATCCGGCCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA
+AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGA
+GGTGGATTAAACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAA
+TAGCAGTTCTACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCCTAACTACTAC
+CGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTA
+ACATGACTAACACCCTTAATTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT
+TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATCATCCCCACCATCATAGCCAC
+CATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATA
+TCTAACAACGTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCCACACTCATCG
+CCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATAATCTTATAGAAATTTAGGTTAAATAC
+AGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA
+CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGACCAATGGGA
+CTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCC
+GCCGGGAAAAAAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGA
+AAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCA
+GCCATTTTACCTCACCCCCACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG
+AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTCTAAGCCTCCTTATTCGAGCC
+GAGCTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCACATCTACAACGTTATCGTCACAGCCCATGCAT
+TTGTAATAATCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAAT
+AATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTC
+CTACTCCTGCTCGCATCTGCTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG
+CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCCTTACACCTAGCAGGTGTCTC
+CTCTATCTTAGGGGCCATCAATTTCATCACAACAATTATCAATATAAAACCCCCTGCCATAACCCAATAC
+CAAACGCCCCTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAG
+CTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGG
+AGACCCCATTCTATACCAACACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA
+GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTA
+TGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGG
+AATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAA
+GTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGAAATGATCTGCTGCAGTGCTCTGAGCCCTAG
+GATTCATCTTTCTTTTCACCGTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT
+ACTACACGACACGTACTACGTTGTAGCCCACTTCCACTATGTCCTATCAATAGGAGCTGTATTTGCCATC
+ATAGGAGGCTTCATTCACTGATTTCCCCTATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCC
+ATTTCACTATCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCCTATCCGGAAT
+GCCCCGACGTTACTCGGACTACCCCGATGCATACACCACATGAAACATCCTATCATCTGTAGGCTCATTC
+ATTTCTCTAACAGCAGTAATATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC
+TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGA
+AGAACCCGTATACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAA
+CCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAAT
+TATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCT
+ATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC
+TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGA
+AACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTAC
+ATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACG
+AGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGA
+CCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA
+TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTC
+TAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGC
+AAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTT
+ACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAACCTTTTAAGTT
+AAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT
+AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTA
+CCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCT
+GTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCT
+CTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAAC
+TAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT
+AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTA
+TCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTA
+AAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGA
+AACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCAC
+CTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA
+TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCAC
+ACTTCTAGTAAGCCTCTACCTGCACGACAACACATAATGACCCACCAATCACATGCCTATCATATAGTAA
+AACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATT
+TCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACACTAACCATATACCAATGATGG
+CGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG
+GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAG
+CCTAGCCCCTACCCCCCAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAA
+GTCCCACTCCTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAA
+TAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACCCT
+CCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT
+GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTATCTGCTTCATCCGCC
+AACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGT
+AGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAGTATAAATAGT
+ACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGCCTTAATTTTAA
+TAATCAACACCCTCCTAGCCTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT
+AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCCGCGTCCCTTTCTCCATAAAA
+TTCTTCTTAGTAGCTATTACCTTCTTATTATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAG
+CCCTACAAACAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAG
+TCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAGTTTAAACAAAACGAAT
+GATTTCGACTCATTAAATTATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG
+CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGA
+AGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAAT
+ATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAA
+TCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAAAACTAATCGTCCCA
+ACAATTATATTACTACCACTGACATGACTTTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA
+GCCTAATTATTAGCATCATCCCTCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCC
+AACCTTTTCCTCCGACCCCCTAACAACCCCCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATC
+ATGGCAAGCCAACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCTATACTAATCT
+CCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAAC
+CACACTTATCCCCACCTTGGCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA
+TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACAACACCC
+TAGGCTCACTAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTT
+AATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCT
+AAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCT
+ATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT
+ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAGACCTAAAATCGCTCATTGCA
+TACTCTTCAATCAGCCACATAGCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCG
+GCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTA
+CGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTT
+TGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG
+TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGC
+CCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAA
+CCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACC
+CCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAA
+CAGAGGCTTACGACCCCTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC
+AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCA
+ACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCC
+ATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCAT
+CCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAA
+CTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA
+TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACC
+CAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAA
+CAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGA
+TACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTT
+TCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA
+CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGT
+CTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTA
+TAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAAC
+TCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATC
+AAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAAC
+CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTC
+CATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTC
+ACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCA
+AAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGC
+TACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCC
+ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCC
+TATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTA
+CCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACC
+AACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCA
+TCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT
+AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCA
+ACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAA
+TCACATAACCTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTTCAACCAGTAA
+CTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATCAACCCTGA
+CCCCTCTCCTTCATAAATTATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC
+TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAACACTCACCAAGACCTCAACCC
+CTGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCC
+CCCTAAATAAATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAATAACACACCCG
+ACCACACCGCTAACAATCAATACTAAACCCCCATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACC
+CCATTACTAAACCCACACTCAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC
+CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGACCCCAATACGCAAAACTAAC
+CCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAA
+ACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTA
+CTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATC
+ATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC
+TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGC
+AACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAAC
+TTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTACTCAGTAGACA
+GTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTTGCCCTTCATTATTGCAGCCCTAGCAACACT
+CCACCTCCTATTCTTGCACGAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC
+ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACAT
+TAACACTATTCTCACCAGACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCC
+TCCCCACATCAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCTAACAAACTA
+GGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCCCCATCCTCCATATATCCAAAC
+AACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT
+AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGACAAGTAGCATCCGTACTATAC
+TTCACAACAATCCTAATCCTAATACCAACTATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTC
+CTTGTAGTATAAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGGACAAATCAGA
+GAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTC
+ATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA
+TTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCA
+ATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCA
+ACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAG
+TACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCC
+TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG
+CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTC
+ATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATG
+
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/rCRS.fasta.fai b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/rCRS.fasta.fai
new file mode 100644
index 0000000..85aa680
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/rCRS.fasta.fai
@@ -0,0 +1 @@
+NC_012920_1 16569 13 70 71
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/sumatran_orangutan.fasta b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/sumatran_orangutan.fasta
new file mode 100644
index 0000000..b4ccf08
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/sumatran_orangutan.fasta
@@ -0,0 +1,238 @@
+>gi|5835834|ref|NC_002083.1| Pongo abelii mitochondrion, complete genome
+GTTTATGTAGCTTATTCTATCCAAAGCAATGCACTGAAAATGTCTCGACGGGCCCACACGCCCCATAAAC
+AAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTGAGGTTACACATGCAAGCATCCCCGCCCCAGT
+GAGTCGCCCTCCAAGTCACTCTGACTAAGAGGAGCAAGCATCAAGCACGCAACAGCGCAGCTCAAGACGC
+TCAGCCTAGCCACACCCCCACGGGAGACAGCAGTGATAAGTCTTTAGCAATAAACGAAAGTTCAACTAAG
+CTACACTAACCCCAGGGTTGGTCAACTTCGTGCCAGCCACCGCGGTCACACGATTAGCCCAAGTTAATAG
+AGATCGGCGTAGAGAGTGTTTTAGATTCTTTTTCTCCCCAATAAAGCTAAAATTTACCTGAGTTGTAGAA
+AACTTAAGCTAATACAAAATAAACTACGAAAGTGGCTTTAATATATCTGAACACACAATAGCTAAGGCCC
+AAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACTTTAACAGTTAAATCAACAAAACTGCTCGCC
+AGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATCCCTCTAGAGGAGCCTG
+TTCTGTAATCGATAAACCCCGATCAACCTCACCACCCCTTGCTCAGCCTATATACCGCCATCTTCAGCAA
+ACCCTGATGAAGGCCACGAAGTAAGCGCAAGCATCCACATAAAGACGTTAGGTCAAGGTGTAGCCCATGG
+AGTGGCAAGAAATGGGCTACATTTTCTACTTCAGAAAACTACGATAGCCCTCATGAAACCTGAGGGTCGA
+AGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGAAGCGCGTACACACCGC
+CCGTCACCCTCTTCAAGTATATTTCAGGGACTACCTAACTAAAACCCCCACGCATCTATATAGAGGAGGC
+AAGTCGTAACATGGTAAGCGTACTGGAAAGTGCGCTTGGACGAACCAGAGGGTAGCTTAACACAAAGCAC
+CCGGCTTACACCTGGGAGATTTCAATTCAACCTGGCCCCTCTGAGCTAACCCTAGCCCCAAACCCAACCC
+ACCCTACTACCAACCAACCCTAACCAAACCATTCACCCAAACAAAGTATAGGCGATAGAAATTACAATCC
+GGCGCAATAGACACAGTACCGTAAGGGAAAGATGAAAAAACACAACCAAGCACAACATAGCAAGGACTAA
+CCCCTGTACCTTTTGCATAATGAATTAACTAGAAACAACTTTGCAAGGAGAGCCAAAGCCAAGACCCCCG
+AAACCAGACGAGCTACCCATAAACAGCTAAAAGAGCACACCCGTCTATGTAGCAAAATAGTGGGAAGATT
+TATGGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGACAGAATCTTAGTTCAAC
+TTTAAATTTACTTACAGAACCCCTAATCCCCTCGTAAATTTAATTGCTAGTCTAAAGAGGAACAGCTCTT
+TAGACACTAGGAAAAAACCTTAAAAAGAGAGTAAAAAACACAACACCCATAGTGGGCCCAAAAGCAGCCA
+TCAATTAAGAAAGCGTTCAAGCTCGACACCTAAACACCAAAAAATACCAAACACAAAACTGAACTCCTTA
+CTCCCCATTGGACTAATCTATTGCCCCATAGAAGAAACAATGTTAGTATAAGTAACATGAAGATATTCTC
+CCCCGCATAAGTCTACGTCAGACCGAAACATCACACTGACAATTAACGGTCCAATATGCATAGTTAACAA
+ATAAACTATTATTTTTTCCCCCCGTTAATCCAACACAGGCATGCCTATAAGGAAAGGTTAAAAAAAGTAA
+AAGGAACTCGGCAAATCTCACCCCGCCTGTTTACCAAAAACATCACCTCTAGCATTACCAGTATTAGAGG
+CACCGCCTGCCCGGTGACATACGTTTAACGGCCGCGGTACCCTGACCGTGCAAAGGTAGCATAATCACTT
+GTTCCTTAAATGGGGACTTGTATGAATGGCTTCACGAGGGTTCGACTGTCTCTTACTTTTAACCAGTGAA
+ATTGACCTGCCCGTGAAGAGGCGGGCATAACATAACAAGACGAGAAGACCCTATGGAGCTTCAATTTACC
+AGTGCAAATAACATACAACAAGCCCACAGGCCCTAAATCACCAAACCTGCACTGAAGATTTCGGTTGGGG
+CGACCTCGGAGCACAACCCAACCTCCGAGAAACACATGTTAAGACCTCACAAGTCAAAACGAACTTCCAC
+ACACAATTGATCCAACAACTTGACCAACGGAACAAGTTACCCTAGGGATAACAGCGCAATCCTGTTCTAG
+AGTCCATATCAACAACAGGGTTTACGACCTCGATGTTGGATCAGGACATCCTAATGGTGCAGCCGCTATT
+AAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGCAATCCAGGTCGGTT
+TCTATCTATTTCACATTTCTCCCTGTACGAAAGGACAAGAGAAATGGGGCCTACTTCACATAAGCGCCTT
+TCCCAAACAAATGATATCATCTCAATTTAACACCACACCAACACCCACCCAAGAAAAGGGCTATGTTAAG
+ATGGCAGAGCCCGGTAACTGCATAAAATTTAAAGCTTTACAGTCAGAGGTTCAACTCCTCTTCTTAACAA
+TATGCCCATAATCAACCTCCTACTCCTCATTATATCCATCCTAATCGCCATAGCATTTCTAATGCTAACC
+GAACGAAAAATCCTAGGCCACACACAACTACGCAAAGGGCCCAACATTGTGGGCCCCTACGGCTTACTAC
+AACCCTTTGCCGACGCCCTAAAACTATTCACCAAAGAACCCCTAAAACCCTCCACATCAACCATCACCCT
+TTACATTATTTCCCCCGCCCTAGCCCTTACCATTGCCCTCCTACTATGAACCCCCCTCCCTATGCCCATC
+CCCCTAATCAACCTCAACTTAGGCCTCCTATTTATCCTAGCCGCGTCAAGCCTAACCGTCTACTCCATCC
+TCTGATCAGGATGAGCATCTAACTCAAACTACGCCCTAATCGGCGCATTGCGGGCGGTAGCCCAAACGAT
+CTCATACGAAATTACCCTAGCCCTTATCCTGTTATCAGTACTACTAATAAGCGGCTCTTTTAACCTCTCC
+GCCCTCATCACAACACAAGAACACTCATGACTACTTCTACCATCATGACCTCTAGCCCTAATATGATTTA
+TTTCAACACTAGCAGAAACCAACCGAGCCCCCTTCGACCTCACCGAAGGAGAATCCGAACTAGTTTCGGG
+CTTTAACACTGAATACGCCGCAGGTCCATTCGCCCTATTCTTCATAGCCGAATATACAAACATTATCTTA
+ATAAACGCCCTCACCACTATAATTTTCCTAGGAACAACATTCAACATCCACTCCCCAGAACTCTACACAA
+CCCTCTTCACCATCAAAACCCTACTCCTAACCTCCCTATTCCTATGAATTCGATCAACATACCCCCGATT
+CCGCTACGACCAACTCATGCACCTTCTATGAAAAAATTTCCTGCCACTCACCCTAGCACTACTAATATGA
+CACATCTCCGTACCCATTGCAACCTCCGGCATTCCCCCACAAACCTAAGAAATATGTCTGACAAAAGAGT
+TACTTTGATAGAGTAAAAAATAGAGGTCTAAATCCCCTTATTTCTAGGATTATGGGAGTTGAACCCACCC
+CTGAGAATCCAAAATTCTCCGTGCCACCCATCACACCCTATCCTAAAGTAAGGTCAGCTAAATAAGCTAT
+CGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAACCCCTTGGCCCAACCCATCAT
+TTACCCCACCATCTTCACAGGCACGCTCATTACAGCACTGAGCTCCCACTGATTCTTTGCCTGACTGGGA
+CTAGAAATAAATATACTCGCTTTCATCCCAGTCCTAACCAAAAAAACAAGCCCCCGCTCCACAGAAGCCG
+CCATTAAATATTTCCTCACACAGGCAACCGCATCCATAATCCTCCTGATAGCCATCCTCTACAACAACAT
+ACTTTCCGGACAGTGAACCACAACCAACACCACCAACCCATATTCATCTCTAATAATCGTAACCGCCCTA
+GCAATGAAGCTAGGAATAGCCCCCTTCCACTTTTGAGTCCCAGAAGTCACCCAAGGAGTCCCCCTGACAT
+CCGGCTTACTCCTCCTTACATGACAAAAATTAGCCCCCATTTCAATTATATACCAAATATCTTCATCGGT
+AGACACAAACATCCTCCTCACCCTCTCAATTCTATCTATCCTAGTAGGCGGCTGAGGCGGACTAAACCAA
+ACCCAACTACGCAAAATCCTGGCATACTCCTCAATCACCCATATAGGATGAATAATAGCAGTACTACCAT
+ATAACCCAGACATCACTATCCTCAACCTAATCATCTACATCATCCTGACAACTACCGCATTCCTAATCCT
+CGACTTAAACTCTAGTGTCACAATCCTAATATTAACCCGCACCTGGAACAAGCTGACATGACTAATACCC
+TTAATCCCATCAACCTTATTATCCCTAGGGGGCCTGCCACCACTAACCGGCTTCCTGCCCAAATGAGCCA
+TCATTGAAGAATTTGCAAAAAATGGCAATCTCATTACCCCCACAATCATGGCTATTATCACCCTCCTCAA
+CCTCTACTTCTACGTACGCCTAATCTACGCCACCTCAATCACACTACTCCCCATATCTAACAACGCAAAA
+ATGAAATGACAGTTCGAAAACACAAAACCCACCCCTCTTCTCCCCACACTCACCATTCTTACCACCCTAC
+TCCTACCTATCTCCCCTCTCATCCTATCTATCTCATAGAAATTTAGGTTAACACAGACCAAGAGCCTTCA
+AAGCCCTCAGCAAGTCACAGCACTTAATTTCTGTAACACTAAGGACTGCAAAGCCCCGCTCTGCATCAAC
+TGAACGCAAACCAGCCACTTTAATTAAGCTAAGCCCTCCCTAGACCGATGGGACTTAAACCCACAAACAT
+TTAGTTAACAGCTAAACACCCTAATCAATTGGCTTCAGTCCACTTCTCCCGCCGCGGGGAAAAAGGCGGG
+AGAAGCCCCGGCAGGCCTTAAAGCTGCTCCTTCGAATTTGCAATTCAACATGACAATCACCTCGGGGCTG
+GTAAAAAGAGGTCTAACCCCTGTTCTTAGATTTACAGCCTAATGCCTTAACTCGGCCATTTTACCCCCCC
+CCCCCCTTTTTTTCTCCACTAATGTTCGCCGACCGCTGGCTATTCTCCACGAACCACAAAGACATCGGGA
+CACTATACCTGTTATTCGGCGCATGGGCTGGAGTCCTAGGCACTGCCCTAAGCCTCCTCATTCGAGCTGA
+ACTGGGCCAACCCGGCAACCTTCTAGGCAATGACCATATCTACAATGTCATCGTCACAGCTCATGCATTC
+GTAATAATTTTCTTTATAGTCATACCCATTATAATTGGAGGCTTTGGCAACTGACTAGTGCCCCTAATAA
+TCGGCGCCCCCGATATAGCATTCCCGCGCATAAATAATATAAGCTTCTGACTCCTCCCCCCCTCCTTTCT
+CCTACTGCTCGCTTCTGCTACAGTAGAGGCTGGCGCAGGAACAGGCTGAACAGTCTATCCGCCCCTAGCA
+GGAAACTACTCTCACCCAGGAGCCTCTGTAGACTTAACAATCTTCTCTTTACACCTAGCAGGCATTTCCT
+CTATCCTAGGAGCTATCAATTTCATCACAACAATTATTAATATAAAACCCCCTGCAATATCCCAATACCA
+AACCCCCCTCTTCGTCTGATCAGTCTTGATCACAGCAGTCCTACTTCTCCTTTCCCTCCCAGTCCTAGCC
+GCTGGCATCACCATACTACTAACAGATCGCAACCTAAACACCACATTCTTTGACCCAGCCGGAGGTGGAG
+ATCCCATCCTATATCAGCACCTATTCTGATTTTTTGGCCACCCTGAAGTCTACATTCTCATCCTGCCGGG
+TTTCGGCATAATCTCCCACATCGTAACACACTATTCCGGAAAAGAAGAGCCATTTGGGTACATAGGCATA
+GTCTGAGCCATAGTCTCAATTGGCTTCCTGGGCTTTATCGTATGGGCCCACCACATATTCACAGTAGGAA
+TAGACGTGGACACACGAGCCTACTTCACCTCCGCTACCATAATCATTGCCATCCCCACCGGCGTCAAAGT
+ATTTAGCTGACTCGCTACACTCCACGGAAGCAACACTAAATGATCTGCCGCAATCCTCTGAGCCTTAGGA
+TTCATTTTCCTCTTCACCGTAGGCGGCCTAACAGGCATCGTACTAGCAAACTCATCACTAGACATTGTAT
+TACACGATACATACTACGTTGTAGCCCACTTTCATTACGTCCTATCAATAGGAGCTGTATTCGCCATCAT
+GGGAGGCTTCATCCACTGGTTCCCACTATTCTCAGGCTACACCTTAGACCAGACCTATGCTAAAATTCAC
+TTCATCACCATATTTATCGGCGTAAATTTAACTTTCTTCCCACAACATTTCCTCGGCCTGTCAGGCATAC
+CCCGACGCTACTCCGACTACCCCGACGCGTACACCACCTGAAATATTTTATCATCCGCAGGCTCATTTAT
+CTCCCTAACAGCAGTCATACTAATAATTTTCATAATTTGAGAAGCCTTCGCCTCAAAACGAAAAGTCCCA
+ATAGTTGAACAACCCTCCACAAGCCTAGAGTGATTGTACGGATGCCCCCCACCCTACCACACATTTGAAG
+AACCCGTCTATATAAAACCAGAACAAAAAAGGAAGGAATCGAACCTCCTAAAGCTGGTTTCAAGCCAACC
+CCACAACCTCCATGACTTTTTCAAGAGATACTAGAAAAACCATTTCATGACTTTGTCAAAGTTAAGTTAC
+AGGCCAAACCCTGTGTATCTTAATGGCGCACGCAGCACAGGTAGGTTTACAAGACGCTACCTCTCCTATC
+ATAGAAGAATTGGTCATCTTTCACGACCACGCCCTCATAATCATTTTCCTAATCTGCTTCCTAGTCCTGT
+ACGCCCTATTCCTAACACTCACAACAAAACTCACCAACACCAGCATCTCAGACGCCCAAGAGATAGAGAC
+TATTTGAACTATCCTACCGGCCATCATCCTAATTCTAATCGCCCTCCCATCCCTACGCATCCTCTACTTA
+ACAGACGAGATCAACGACCCTTCCTTCACCATCAAATCAATCGGTCATCAATGATACTGAACCTACGAGT
+ACACTGACTACGGTGGATTGATCTTCAACTCTTACATGCTCCCACCACTATTCCTAGAACCAGGCGACCT
+TCGACTCCTCGACGTCGACAACCGAGTAGTCCTCCCAGTCGAAGCTCCCGTTCGCATAATAATCACATCC
+CAAGACGTCTTACACTCATGAACTGTACCCTCACTAGGCCTGAAAACGGACGCAATCCCCGGACGCCTAA
+ACCAAACCACATTCACTGCCACGCGACCAGGAGTGTACTATGGCCAATGCTCAGAAATCTGTGGAGCTAA
+CCACAGCTTTATGCCTATCGTCCTAGAACTAATCCCCCTAAAAATCTTCGAAATAGGGCCCGTATTCACT
+TTATAACTTCCCCCACCCCCACAACCCATCCTACCCCCTTTCCTGAGGCCCACTGCAAAGCTAATCTAGC
+ATTAACCTTTTAAGTTAAAGACTAAGAGAATCAACCCCTCTTTGCAGTGAAATGCCCCAACTAAATACCA
+CCACATGGCCCACCATCATCACCCCAATACTCCTTGCACTATTCCTCATCACTCAACTAAAACTACTAAA
+CTCACACCTCCACCCACCCACCCCACCAAAATTCACTAAACCAAAACTCCACGCCAAACCCTGAGGACCA
+AAATGAACGAAAGTCTATTTACCCCATTCATTACCCCCACAGTACTAGGCCTCCCCGCCGCAGTACTAGT
+CATCTTATTTCCCCCCTTACTGATCCCCACCTCCAAACATCTCATCAACAACCGACTAATTATTATCCAA
+CAATGACTAATCCGACTCATCCTAAAACAAATAATAACCACCCATAACGCTAAAGGACGAACTTGATCCC
+TCATACTAACGTCCCTAATCATTTTCATCGCCTCAACCAACCTCCTAGGACTCCTCCCCTACTCATTTAC
+ACCAACCACCCAACTATCCATAAATTTAGCTATAGCAATTCCCTTATGAGCAAGCACGGTAGCTATGGGC
+CTTCGCTTCAAAGCCAAAATTACCCTAACCCACCTCTTACCACAAGGTACCCCCACACCTCTCATCCCTA
+TACTAATTATTATTGAAACCGTCAGCCTTTTCATTCAACCACTAGCCTTAGCCGTACGCCTAACTGCTAA
+CATCACTGCAGGCCACCTACTCATGCACCTAATCGGAAGCTCTGCACTAGCTATACTAGCCATCAACCTC
+CCCCTAACCCTCATCACCCTTACAATCTTAACCCTGCTAACAATCCTGGAGACTGCCATCGCCCTAATTC
+AAGCCTACGTCTTCACACTTCTAGTAAGCCTCTACCTGCACGACAACTCATAATGGCCCATCAATCACAC
+GCCTACCACATAGTAAAACCTAGCCCATGACCCCTAACAGGAGCTCTCTCAGCCCTCCTAACAACATCTG
+GCCTAACCATGTGATTCCACTTCCACTCCACAACCCTACTATTAACAGGCCTACTAACCAATGCACTAAC
+CATATACCAATGGTGACGAGATGTAGTGCGAGAAAGCACATACCAAGGCCACCACACACTACCCGTCCAA
+AAAGGCCTCCGATATGGAATAATCCTATTCATCACTTCAGAAGTCTTTTTCTTCGCCGGATTCTTCTGAG
+CATTCTACCACTCCAGCCTAGCCCCCACCCCTCAACTTGGAGGACACTGACCCCCAACAGGCATTATCCC
+CCTCAACCCCCTAGAAGTCCCACTCCTAAACACATCCGTACTACTCGCATCAGGAGTCTCAATTACCTGA
+GCCCATCACAGCCTGATGGAAAATAATCGAACCCAAATAATTCAAGCACTACTCATCACAATCTTACTAG
+GCATCTACTTCACTCTCCTTCAGGCTTCAGAATACATTGAAGCTCCTTTCACCATCTCTGACGGCATCTA
+CGGCTCAACATTCTTCATAGCCACGGGATTCCACGGCCTCCACGTCATTATCGGATCAACTTTCCTCACT
+GTATGCCTAGCCCGCCAGCTATTATTCCACTTCACATCCAAACATCACTTTGGCTTTGAGGCCGCCGCCT
+GATACTGGCACTTTGTAGACGTAGTCTGACTGTTTCTGTACGTCTCCATCTACTGATGAGGTTCCTACTC
+TTTTAGTATAAACAGTACCGTTAACTTCCAATTAACTAGTTTTGACAACGCCCAAAAAAGAGTAATTAAC
+TTCGTCCTAGCTCTAACAGTCAACACCCTCCTAGCCCTGCTACTAATAACCATCACATTCTGACTACCAC
+AACTCTACCCCTACATAGAAAAATCCGACCCATACGAATGTGGATTTGACCCCGCATACCCCGCTCGCAT
+TCCTTTCTCCATAAAATTTTTCTTAGTAGCCATCACCTTCCTACTATTCGACCTAGAAATCGCCCTGCTA
+CTACCCCTGCCATGGGCCCTACAAACAACCAACTTACCACTAATAACTACATCATCACTTATATTAATTA
+TCATCCTAGCCCTAGGCCTAACTTACGAATGATCACAAAAAGGATTAGACTGAGCCGAATTGGTAAATAG
+TTTAAACAAAACAAATGATTTCGACTCATTAAATTATGACAGCCATATTTACCAAATGCCCCTTATCTAC
+ATAAATATCACACTAGCATTCACCATATCACTCCTAGGCATACTAGTCTACCGCTCACACCTAATATCTT
+CTCTACTATGTCTAGAAGGAATAATATTATCATTGTTCATTATAATTACTCTCATAACCCTCAACACCCA
+CTCTCTCCTAGCTAACATCATACCCATCACCATGCTAGTCTTCGCTGCCTGCGAAGCAGCAGTAGGCCTC
+GCCCTACTAGCCTCAATCTCCAATACATACGGCCTAGACTACGTCAACAACCTAAACCTACTTCAATGCT
+AAAACTAATTATCCCAACAATCATACTGCTGCCCCTAACATGACTCTCCAAAACGCACATAATCTGAATC
+AACACCACCACCCACAGCCTAATCATCAGCTCCATCCCCCTACTATTCCTCAATCAAACCAACAGCAACC
+TGTACAGCTACTCCCTTCTTTTCTCCTCCGACCCCTTATCAACCCCCCTTCTAATACTAACAACCTGACT
+CCTACCCCTCATAATTATAGCAAGCCAACACCATCTATCCAACGAACCCCCATCACGAAAAAAATTATAC
+CTCACCATACTAATCTCTCTTCAAATCTCCCTAATCATAACATTCACAGCCACAGAGCTAATTATATTTT
+ATATCCTCTTCGAAACCACTCTCATCCCCACCCTAGTCATTATCACCCGCTGAGGCAACCAGCCAGAGCG
+CTTAAATGCAGGCACATACTTTCTATTCTACACACTAGTAGGCTCCCTCCCCCTACTCATTGCCCTAATC
+CACACCTACAACACCCTAGGCTCGCTTAACATTGTATTACTAACTCTCACCGCCCGGGAGCTAACAGACT
+CCTGATCCAACAGCCTAATATGACTAGCGTACACAATAGCTTTCATAGTAAAAATACCCCTCTACGGACT
+ACACCTATGACTCCCTAAAGCCCATGTAGAAGCCCCCATTGCCGGCTCAATAGTACTCGCCGCAGTGCTC
+TTAAAACTAGGTGGTTACGGTATAATACGCCTTATCCCCATTCTCAATCCCCTAACTAAACACATAGCCT
+ACCCCTTTATCATACTATCCCTATGAGGCATAATCATAACAAGCTCCATCTGCTTACGACAAACCGACCT
+AAAATCACTCATCGCATACTCCTCAGTCAGCCACATAGCGCTTGTTGTAGCAGCTATCCTCATTCAAACC
+CCCTGAAGCTTCACCGGCGCAACCACCCTCATAATTGCCCATGGACTCACATCCTCCCTACTGTTCTGCC
+TAGCAAACTCAAACTACGAACGAACCCACAGCCGCATCATAATCCTCTCTCAAGGCCTTCAAACTCTACT
+CCCCCTAATAGCCCTCTGATGACTTCTAGCAAGCCTCACTAACCTTGCCCTACCACCCACCATCAACCTA
+CTAGGAGAACTCTCCGTACTAATAGCCATATTCTCTTGATCTAACATCACCATCCTACTAACAGGACTCA
+ACATACTAATCACAACCCTATACTCTCTCTATATATTCACCACAACACAACGAGGTACACCCACACATCA
+CACCAACAACATAAAACCTTCTTTCACACGTGAAAACACCCTCATGCTCATACACCTATCCCCCATTCTC
+CTCTTGTCCCTCAACCCCAGCATCATCGCTGGATTCGCCTACTGTAAATATAGTTTAACCAAAACATCAG
+ATTGTGAATCTAATAATAGGGCCCACAACCCCTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCTC
+ACCCCATGTGTAACAACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCCTTGGTCTTAGGACCCA
+AAAATTTTGGTGCAACTCCAAATAAAAGTAACAGCCATGTTTACCACCATAACTGCCCTCACCTTGACTT
+CCCTAATCCCCCCCATTACCGCTACCCTCATTAACCCCAACAAAAAAAACTCATACCCCCACTATGTAAA
+AACTGCCATCGCATCCGCCTTTACTATCAGCCTTATCCCAACAACAATATTTATCTGCCTAGGACAAGAA
+ACCATCGTCACAAACTGATGCTGAACAACCACCCAGACACTACAACTCTCACTAAGCTTCAAACTTGACT
+ACTTCTCCATAACATTCCTCCCCGTAGCACTACTCATCACTTGATCCATTATAGAATTTTCACTATGGTA
+TATAGCCTCAGACCCAAACATCAACCAATTTCTCAAATTCCTCCTTATTTTCCTAATCACCATAATTATC
+CTAGTCACTGCCAATAACCTACTCCAACTCTTCATCGGCTGAGAGGGCGTAGGGATCATATCCTTCCTGC
+TCATTAGTTGATGATACGCCCGAACAGACGCCAACACGGCAGCTATTCAAGCAATCCTATACAATCGTAT
+CGGCGATATTGGCTTCATCCTGGCTCTAGCATGATTCCTCCTACACTCCAACTCATGGGAACTACAACAA
+GTATTCCTCCTAAACAATAACCCTAACCTCCTCCCACTACTAGGACTCCTCCTAGCCGCAGCTGGCAAAT
+CAGCCCAACTAGGCCTTCACCCCTGACTACCCTCAGCCATAGAAGGCCCAACCCCCGTCTCAGCCCTACT
+TCACTCAAGCACCATGGTCGTGGCTGGGGTCTTCCTACTCATCCGCTTTCACCCATTAACAGAAAACAGC
+CCACATATCCAAACCCTTACACTATGCTTAGGGGCCATCACCACCCTGTTCGCAGCAATCTGCGCCCTCA
+CACAAAACGACATTAAGAAAATCGTAGCTTTCTCCACCTCAAGTCAACTAGGACTTATAATGGTCACAAT
+TGGCATTAACCAGCCACACCTGGCACTCCTCCACATCTGCACCCACGCCTTCTTCAAAGCCCTTTTATTC
+ATATGTTCTGGGTCCATCATCCACAACCTCAACAATGAGCAAGACATCCGAAAAATAGGAGGACTACTCA
+AAACCATACCCCTAACCTCAACCTCCCTCACTATCAGCAGCCTAGCCCTCGCAGGAATACCCTTCCTCTC
+AGGCTTCTACTCCAAAGACCTCATTATCGAGACCGCAAACATATCCTATACCAACACCTGAGCCCTGTCT
+ATCACTCTCATCGCCACCTCCTTAACAGGCGCCTACAGCACTCGAATAATCCTCCACACCCTTACAAGCA
+AACCCCACTTCCCAACCCCAATCTCTATCAATGAAAACAACCCCACTCTACTTAAACCCATCAAGCGCCT
+TATGCTAGGAAGCCTATTCGCAGGATTCCTAATCACCAACAACATCCCCCCTATATCCCTGCCCCAAGTA
+ACAACCCCCCCTTACCTAAAACTCGCAGCTCTAGCTGCCACCCTCCTAGGTCTCCTAGTAGCCCTAGACT
+TAAACTACCTAGCCAACAAACTCAAGACAAAAACCCCTCCACCCACATTCTATTTCTCCATCATACTCGG
+ATTCTACCCTAGCATCATCCACCGCATAATCCCCCACCTAAGCCTTCTCATAAGCCAAAACTTATCCCTA
+CTCCTACTAGACCTAACCTGACTAAAAAAACTAATACCCAAAACAATCTCACAACACCAAACCTCAGCCT
+CCATCACTATTTCAACCCAAAAAGGTTTAATCAAACTCTACTTCCTCTCTTTCCTCATCCCACTCCTCCT
+AATCCTCCTTATAATCTCATAACCTATTACCCCGAGCAATCTCAATTACAACATAAACACCAACAAATAA
+CGTTCAACCAGTAACCACCACCAACCAACGCCCATAATCATATAAAGCCCCCGCACCAATAGGATCCTCC
+CGAATCAACCCCGACCCTTCCCCTTCATAAATTATCCAGCTCCCCACGCTATTAAAATTCACCACTACCA
+CCACTCCATCATACTCTTTTACCCACAACACCAGCCCCACTTCCATCACTAATCCCACCAGAACACTCAC
+CAATACCTCAACCCCTGACCCCCATGCCTCAGGATATTCCTCAATAGCTATTGCCGTAGTATACCCAAAA
+ACAACCATCATACCCCCTAAATAAATTAAAAAAACCATTAAACCCATATAACCTCCCCCACAATTTAAAA
+TAACTGCACACCCAACCGCACCACTAATAATCAACACTAAACCCCCATAAATAGGAGAGGGCTTAGAAGA
+AAACCCCACGAACCCTATCACTAAAATTACACTCAACAGAAACAAAGCATATGTCATTGTTCTCGCATAG
+ACTGTGACTATGACCAATGGTATGAAAAAACATCGTTGTACCTCAACTACAAGAACACTAATGACCTCAA
+CACGTAAAACCAACCCACTAATAAAATTAATCAACCACTCACTTATCGACCTCCCCACCCCATCAAACAT
+CTCCGCATGATGGAACTTCGGCTCACTCCTAGGCGCCTGCTTAATCATCCAAATCACCACTGGACTATTC
+CTAGCTATACATTATTCACCAGACGCCTCCACTGCCTTTTCATCAATCGCCCACATCACTCGAGATGTAA
+ACTACGGCTGAATAATTCGCCACCTCCACGCTAACGGCGCCTCAATATTCTTTATCTGCCTCTTCTTACA
+TATCGGCCGAGGCCTATACTATGGCTCATTCACCCACCTAGAAACCTGAAACATCGGCATCATCCTACTA
+TTTACAACTATAATAACAGCCTTCATAGGTTACGTCCTCCCATGAGGCCAAATATCCTTCTGAGGAGCCA
+CAGTAATCACAAATCTACTGTCCGCCATCCCATACATTGGAACAGACCTGGTCCAATGAGTCTGAGGTGG
+CTACTCAGTAAATAGCCCCACTCTAACACGATTCTTCACCCTACACTTCATACTACCCTTCATTATTACA
+GCCCTAACAACTCTACACCTCTTATTCCTACACGAAACAGGATCAAATAACCCCCTGGGAATCCCCTCCC
+ATTCCGACAAAATCACCTTCCACCCCTACTACACAATCAAAGACATCCTAGGCCTACTCCTTTTTCTCCT
+CGCCCTAATAACACTAACACTACTCTCACCAGACCTCCTAAGCGACCCAGACAACTACACCTTAGCTAAC
+CCCCTAAGCACCCCACCCCACATTAAACCCGAATGATATTTCCTATTCGCCTACGCAATCCTACGATCCG
+TCCCCAACAAACTAGGAGGTGTAATAGCCCTCATACTATCCATCCTAATCCTAACAACAATCCCTGCCCT
+TCACATGTCCAAGCAACAGAGCATAACATTTCGCCCATTGAGCCAATTCCTATATTGACTTTTAATCGCC
+GACCTTCTAATTCTCACCTGAATTGGAGGGCAACCAGTAAGCTACCCCTTCATCACCATTAGCCAAGTAG
+CATCCACATTGTACTTCACTACTATCCTTCTACTTATACCAGCCTCTTCCCTGATCGAAAACCACATACT
+CAAATGAACCTGCCCCTGTAGTACAAATAAGTACACCAGCCTTGTAACCTGAAAATGAAGACCCTCTTCC
+ATGGGCAAAAAAAATCAGAGAAAAAGCACTTAACTTCACCGTCAGCCCCCAAAGCCAACATTCTAATTTT
+AAACTACTCTCTGTTCTTTCATGGGGGACCAGATTTGGGTGCCACCCCAGTACTGACCCATTTCTAACGG
+CCTATGTATTTCGTACATTCCTGCTAGCCAACATGAATATCACCCAACACAACAATCGCTTAACCAACTA
+TAATGCATACAAAACTCCAACCACACTCGACCTCCACACCCCGCTTACAAGCAAGTACCCCCCCATGCCC
+CCCCACCCAAACACATACACCGATCTCTCCACATAACCCCTCAACCCCCAGCATATCAACAGACCAAACA
+AACCTTAAAGTACATAGCACATACTATCCTAACCGCACATAGCACATCCCGTTAAAACCCTGCTCATCCC
+CACGGATGCCCCCCCTCAGTTAGTAATCCCTTACTCACCATCCTCCGTGAAATCAATATCCCGCACAAGA
+GTGCTACTCCCCTCGCTCCGGGCCCATAAAACCTGGGGGTAGCTAAAGTGAGCTGTATCCGGCATCTGGT
+TCTTACTTCAGGGCCATAAAACCCAAGATCGCCCACACGTTCCCCTTAAATAAGACATCACGATGGATCA
+CAGGCCTATCACCCTATTAATCACTCACGGGAGCTCTCCATGCATCTGGTATTTTTTCGGGGGGGGATGC
+ACGCGATAGCATCGCGGGCCGCTGGAACCGGAGCACCCTATGTCGCAGGATCTGTCTTTGATTCCTACCT
+CATGCCATTATTAATCGCGCCTAATATCCAATATCCTAGCCCCACCCTCAGTGTTTGAAGCTGCTATTTA
+ATTTATGCTAGAGGACATAAAATTACCAAAAAAAAATAAACGAACTCTCAACAACCCTACCCCATCAACC
+CAACAAAATCCAATTTTTATCTTTAGGCTATGTGCACTTTCAACAGGCACCCCTCAACTAACACAATCTC
+CTTCTTATCCCACCCACCAACCCCCCCCCCCCCTTCCTCCCTCTTTCTCCATTTTCCCCACAAACACCGC
+TACTACCCCCACACCCCAGACCAACCCAACCCAAAAGACACCCCGCACG
+
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/white_handed_gibbon.fasta b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/white_handed_gibbon.fasta
new file mode 100644
index 0000000..6f29939
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/000_prefixes/white_handed_gibbon.fasta
@@ -0,0 +1,276 @@
+>ENA|X99256|X99256.1 Hylobates lar complete mitochondrial DNA sequence
+GTTTATGTAGCTTAACTACCCAAAGCAAAACACTGAAAATGTCGAGACGGCTCACCCGCC
+CCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCCCCTGGCAAGATTACACATGC
+AAGCATCCCCGCCCCGGTGAAATCGCCCTTCAAATCACCCGTGATCAAAAGGAGCAGGTA
+TCAAGCACGCAGTAATGCAGCTCAAAACACCCTGCCTAGCCACACCCCCACGGGAGACAG
+CAGTGATAAACCTTTAGCAATAAACGAAAGTTTAACTAAGCTATGCCAACCCAGGGTTGG
+TCAACTTCGTGCCAGCCACCGCGGTCATACGATTAACCCCAGTTAATAGAGCCCGGCGTA
+AAGAGTGTTTTAGATTGCTCCCTAATAAAGCTAAGCTCCATCCAAGTCGTAAAAAACTCT
+GGCTGCTATAAAATAAACTACGAAAGTGGCTTTAACACCTCTGAATACACAATAGCTGAG
+ACCCAAACTGGGATTAGATACCCCACTATGCTCAGCCCTAAACTTCAACAGTCAAATCAA
+CAAGACTGCTCGCCAGAACACTACGAGCAACAGCTTAAAAATCAAAGGACCTGGCGGTGC
+TTCACACCCCCCTAGAGGAGCCTGTCCTATAATCGATAAACCCCGTTCAACCTCACCATC
+TCTTGCTCAGCCTATATACCGCCATCTTCAGCAAACCCTGACAAAGGCTATAAAGTAAGC
+ACAAACACCCACATAAAGACGTTAGGTCAAGGTGTAGCCCATGAGATGGGAAGAGATGGG
+CTACATTTTCTATGCCAGAAAACCACGATAACCCTCATGAAACTTGAGCGGTCGAAGGAG
+GATTTAGCAGTAAATTAAGAATAGAGTGCTTAGTTGAACAAGGCCCTGAAGCGCGTACAC
+ACCGCCCGTCACCCTCCTCAAGCATATTTCAAGGAACCCCTAACTAAATACTCCACGCAT
+CTATGTAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACAAAC
+CAAGGTGTAGCTTAACACAAAGCACCCGGCTTACACCCGGGAGATTTCAATTAACTTGAC
+CACCCTGAGCCAACCCTAGCCCCAAATCACCCCAACCCTACTATCAAGTAACATCAACCA
+AACCATTTACCCGTACAAAGTATAGGCGATAGAAATTACCAACCTGGCGCAATAGATACA
+GTACCGCAAGGGAAAGATGAAAAACACGACCAAGCACAAAACAGCAAAGATAAACCCCTG
+TACCTTCTGCATAATGAATTAACTAGACACAACTTAGCAAGGAGGCCCAAAGCTAAGAGC
+CCCGAAACCAGACGAGCTACCTAAGAACCGCTGAAAGAGCACACCCGTCTATGTAGCAAA
+ATAGTGGGAAGATTCATAGGTAGAGGTGACAAGCCTACCGAGCCTGGCGATAGCTGGTTG
+TCCAAGACAGAATCTTAGTTCAACTTTAAATCTACCCGTAGAACCCCTAAATCTTCTTGT
+AAATTTAACTGTTAGTCTAAAGAGGAACAGCTCTTTAGACTCTAGGAAAAAACCTTATAA
+AGAGAGTAAAAAGTGTAAACCCCATAGTTGGCCTAAAAGCAGCCACCAATTAAGAAAGCG
+TTCAAGCTCAACACCACCTATCCAACAAATCCCAAACACACAACTGAACTCCTTCCACCA
+CATTGGACCAATCTATCATTTTATAGAAGAAATAATGTTAGTATAAGTAACATGAATAAC
+ATTCTCCCCCGCATAAACCTATATCAGACCAAAAAACTTCGCTGACAGTTAACAGCCCAA
+TATCTAAAACCAACTGATAAACCATTATTGCCCACACTGTCAACCCAACATAGGCATGCC
+CACAAGGAAAGGTTAAAAAAAGTAAAAGGAACTCGGCAAACACTACCCCGCCTGTTTACC
+AAAAACATCACCTCTAGCATTACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTT
+CAACGGCCGCGGTACCCTAACCGTGCAAAGGTAGCATAATCACTTGTTCCTTAAATGGGG
+ACTTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCTTACTTTCAACCAGTGAAATTGA
+CCTGTCCGTGAAGAGGCGGACATAACCTAACAAGACGAGAAGACCCTATGGAGCTTTAGT
+CTATCAATGCAAACAACATTCAATAAACCAACAGGTCATAAATTACCAAACCTGCATCGA
+AGACTTCGGTTGGGGCGACCTCGGAGCATAGACTAACCTCCGAGCAGTATATGCTAAGAC
+CACACCAGTCAAAACGAAACTCCATGTGCAATTGACCCAATAACTTGATCAACGGAACAA
+GTTACCCTAGGGATAACAGCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTAC
+GACCTCGATGTTGGATCAGGACATCCCGATGGTGCAGCCGCTATCAAAGGTTCGTTTGTT
+CAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGGAGTAATCCAGGTCGGTTTCTAT
+CTGTTCTATATTTCTCCCTGTACGAAAGGACAAGAGAAATAGGGCCCACTTCGCAAAGCG
+CCCTCCCTCCTTGGATGATATTATCTCAATCCACAACATGCCCAAACCCGCTCAAGAACA
+GAGCCTGTTAAGATGGCAGAGCCCGGCAATTGCATAAAACTTAAGACTTTATAATCAGAG
+GTTCAATCCCTCTTCTTAACAGCATGCCCATAATCAACCTCCTGCTCCTCATCCTACCCA
+CCCTAATCGCCATAGCATTCCTAATGTTAACCGAACGAAAAATCCTAGGCTACACACAAC
+TACGCAAAGGCCCCAACATCGTAGGCCCCTATGGCCTACTACAGCCTTTCGCCGACGCAA
+TAAAGCTCTTCACCAAAGAACCCCTAAAACCATCTACATCAACCACCGCCCTCTATATCA
+TCGCCCCAACCTTAGCCCTTACCATTGCCCTCCTACTATGAACCCCCCTCCCCATACCCA
+ACCCCCTAATCAACCTCAACTTGGGCCTCCTATTTATCCTGGCTACATCTAGCCTAACCG
+TCTACTCCATCCTATGATCAGGATGAGCATCAAACTCTAACTACGCCCTAATCGGCGCAC
+TGCGAGCAGTAGCCCAAACAATTTCATATGAAGTCACTTCGGCCATTATCTTACTATCAG
+TACTACTTATGAGCGGCTCATTTAACCTCTCCACCCTCATTACAACACAAGAGCACATCT
+GACTGCTCCTACCAACATGGCCCCTAGCTATAATATGATTTATCTCTACATTAGCAGAAA
+CCAATCGAACCCCCTTCGACCTCACCGAAGGAGAGTCAGAATTAGTCTCAGGATTCAATA
+CCGAATATGCTGCCGGCCCATTCGCCCTATTCTTCATAGCCGAATATGTAAACATCATTA
+TGATAAACGCCCTAACCACCATAATCTTCCTAGGTACCACACACAACGCCCACCGCCCAG
+AACTTTACACCACATGCTTCACCATCAAAACCCTACTCTTAACCTCCCTATTTCTATGAA
+TCCGAACAACATACCCCCGATTCCGCTACGACCAACTCATGTACCTCCTATGAAAAAACT
+TCTTACCACTCACCCTAACACTACTAATATGATATATTTCTCTATCCACCATAATTGCCA
+GCATTCCCCCACAGACCTAAGAAATACGTCTGACGAAAGAGTTACTTTGATAGAGTAAAT
+AATAGGGGTTTAAATCCCCTTATTTCTAGAACCATAGGAGTCGAACCCATCCCTGAGAAC
+CCAAAACTCTCCGTGCCACCCGTCGCACCCTGTTCTAAGTAAGGTCAGCTAAATAAGCTA
+TCGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCTAGCT
+CAACCCATCATCTACTCCACCATTTTCGCAGGTACACTCATTACCGCATCAAGCTCACAC
+TGATTCCTCACCTGGGTGGGATTAGAAATAAACATACTAGCCTTCATCCCAGTTCTGACA
+AAAAAAATAAATCCCCGCTCCACAGAAGCTGCTATCAAATATTTCCTCGTACAAGCAACC
+GCATCCATAATTCTCATAATAGCCATTCTCTCCAACAACCTACTTTCCGGGCAGTGAACT
+ATAGCCAACATCACCAACCAATATTCATCAACAATAATATTAATAGCCCTGGCTATAAAA
+CTGGGAATAGCCCCCTTTCACTTCTGGGTCCCAGAAGTCACCCAGGGAACTACCCTTATA
+TCCGGCCTACTCCTCCTCACATGACAAAAACTGGCCCCTATCTCAATCATATACCAAATC
+TTCCCAGTGGTAAACGTAAACATCCTCCTCGCCTTTTCAATCTTATCTATCATGGTAGGC
+AGCTGAGGCGGACTGAACCAAACCCAACTACGCAAAATTCTAGCATACTCCTCAATCACC
+CACGTAGGCTGAATAATGGCCGTACTACCATACAACCCAGACATCACCATCTTCAACCTA
+ATCATCTACATCGTGCTAACAACCACCGCATTCCTAGCACTCAACCTGAATTCCAGCACC
+ACAACCCTACTATTATCTCGCTCTTGAAACAAACTAACCTGATTACTGCCCCTAATCCCA
+TCCACCCTATTATCACTAGGAGGCCTACCCCCACTAACCGGATTCCTACCCAAATGACTC
+GTAATTGAAGAACTCACAAAGAACGGAACACTCATTATCCCAACTGCCATAGCCATCATC
+ACCCTTATCAACCTATACTTCTACATACGCCTAATCTACTCCACCTCAATCACGCTGCTT
+CCCACATCCAACAACGTAAAAATAAAGTGACAGTTTGAAAACACAAAACCCACATTTCTC
+CTCCCCACACTTATGACCCTCACCACTCTCCTCCTACCAATCGCCCCACTTACATTCCCC
+ACCCCATAGAAATTTAGGTTAAACACAGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTAA
+CAAAACTTAATTTCTGCAACACGCCCAAGGACTGCAAAGCCCACTTTGCATCAACCGAAC
+GCAAATCAGTCACTTTAATTAAGCTAAGCCCTTACTAGATCGATGGGACTTAAACCCACA
+AAAATTTAGTTAACAGCTAAACACCCTAGACAACCTGGCTTCAATCTACTTCTCCCGCCG
+CGGGAAAAAAAGGCGGGAGAAGCCCCGGCAGGATTGAAGCTGCTCCTTTGAATTTGCAAT
+TCAACGTGAAAACCACTTCGGGACTGGCAAAAAGAGGTTCCACCTCTGTCCTTAGATTTA
+CAGTCTAATGCTTTACTCAGCCATTTTACCCCATTCTACTAATGTTCGCCGACCGCTGGT
+TATTCTCCACAAACCATAAAGATATTGGAACACTATACTTACTATTTGGCGCATGGGCCG
+GGGTTCTGGGCACGGCCTTAAGCCTCCTCATTCGAGCCGAACTGGGTCAACCCGGCAATC
+TCCTAGGCAATGACCATATCTATAACGTCATTGTAACGGCCCACGCATTCGTCATAATCT
+TCTTCATAGTAATACCCATCATAATTGGGGGCTTTGGCAACTGGCTCGTCCCTCTGATAA
+TCGGCGCTCCCGATATGGCATTCCCTCGTATAAATAACATAAGCTTCTGACTTCTTCCCC
+CCTCATTCCTACTGCTGCTTGCCTCCGCTATAGTAGAAGCCGGCGCCGGAACAGGATGAA
+CGGTCTACCCTCCGCTGGCAGGAAACTACTCCCACCCAGGAGCCTCTGTCGACCTAACCA
+TTTTTTCTCTACACCTGGCCGGAGTATCATCTATCCTAGGGGCTATTAACTTCATTACCA
+CAATCATCAACATAAAACCCCCAGCCATATCCCAATACCAAACACCCCTCTTTGTCTGAT
+CCGTCCTAATTACAGCCGTCCTACTCCTCCTCTCCCTACCAGTCCTAGCCGCCGGCATTA
+CTATACTACTAACGGACCGCAACCTCAACACTACTTTCTTTGACCCCGCTGGAGGAGGAG
+ACCCTATCCTATATCAACACCTATTCTGATTCTTCGGTCACCCCGAAGTTTATATTCTCA
+TCCTACCAGGCTTCGGAATGATCTCACATATCGTAACACACTACTCAGGAAAAAAAGAAC
+CGTTCGGATATATAGGCATAGTCTGAGCCATAATATCAATTGGCTTCCTAGGTTTCATTG
+TCTGAGCCCACCATATATTCACAGTAGGTATGGACGTAGACACACGAGCCTATTTCACCT
+CTGCCACCATAATTATCGCCATCCCCACCGGCGTCAAGGTATTTAGCTGACTCGCCACAC
+TCCATGGAAGCGACACCAAATGGTCCGCCGCAGTGCTCTGAGCCCTAGGCTTCATCTTTC
+TCTTCACGGTAGGAGGCTTGACTGGCATCGTACTGGCAAACTCATCACTGGATATTGTAC
+TTCACGATACATATTATGTCGTAGCCCACTTCCACTACGTCTTATCCATAGGAGCCGTAT
+TCGCCATCATAGGAGGCTTCGTCCACTGATTCCCCTTATTCTCGGGCTACACTTTAGATC
+AAACCTACGCCAAAATTCACTTTGCCATTATATTTGTTGGGGTAAACTTAACCTTCTTCC
+CACAACACTTCCTTGGCCTCTCCGGAATACCACGACGTTACTCTGACTACCCCGATGCAT
+ACACTACCTGAAATATCCTATCCTCTGTAGGCTCATTTATTTCCCTAACAGCAGTAATAC
+TGATAATTTTTATAATCTGAGAAGCCTTCGCTTCAAAACGAAAAATTCTAATAATCGAAC
+AACCCTCCACCAACCTAGAATGGCTGTACGGATGCCCGCCGCCCTATCACACATTCGAAG
+AGCCCGTCTATATAAAGCCTAGACAAAAAAGGAAGGAATCGAACCCCCTAAAACTGGTTT
+CAAGCCAGCCCCATAACCTCTATGACTTTTTCAAAAAGATATTAGAAAAACTATTTCATA
+ACTTTGTCAAAGTTAAGTTACAGGTTCAAACCCCGTATATCTTAATGGCACATGCAACTC
+AAGTAGGCCTACAAGACGCTACATCCCCTATCATAGAAGAACTAATCTCTTTCCACGACC
+ACGCCCTTATAATCATCTTCCTCATCAGCTTCCTAGTCCTATATGCCCTCTTCCTAACAC
+TCACAACAAAACTAACCAACACTAACATTACGGATGCCCAAGAAATAGAAACCGTCTGAA
+CAATCCTGCCTGCTATTATTCTAGTCCTAATCGCCCTCCCGTCCCTCCGCATCCTTTACC
+TGACAGACGAGATCAACGACCCCTCCTTTACTATCAAAGCAATCGGCCATCAATGATACT
+GGGCCTACGAATATACAGACTACGGTGGGCTGATCTTTAATTCTTACATGCTTCCACCAT
+TATTTCTAGAACCAGGGGATCTCCGACTCCTTGAAGTCGATAACCGAGTGGTTCTTCCAA
+TTGAAGCCCCTGTCCGTATAATAATTACATCACAAGACGTCCTACACTCATGAACTGTCC
+CCTCCCTGGGTCTAAAAACAGACGCCATCCCAGGGCGCCTAAACCAAACCACATTCACCG
+CTACACGCCCAGGGGTATATTACGGCCAATGCTCAGAGATCTGTGGGGCCAACCATAGCT
+TTATACCAATTGTCCTAGAACTAATTCCCTTAAAAATCTTTGAAATAGGGCCTGTATTCA
+CTCTATAGCCCCTCCCCGCCTCCCCGTAAATTTCACTGTAGAGCTAAATTAGCATTAACC
+TTTTAAGTTAAAGACTAAGAGGACCACTACCTCTTTACAGTGAAATGCCCCAATTAAACA
+CCACCGTGTGACCTACAATCATCATATCAATACTCCTCGCACTATTCCTCCTTATACAGC
+TGAAAACACTAAATACACACTACCACCCACCCGCCTCCCCAAAACTCACGAACATTAAAC
+CTCATAATAATCCCTGAGAACACAAATGAACGAAAATCTATTCACTTCATTCGCTACCCC
+CACAATTCTAGGCTTACCCGCCGCAGTACCAATTATTCTATTTCCCTCCCTATTAATCCC
+TACTTCCAAATACCTCATCAACAACCGACTAATTACCACCCAACAGTGACTAATTCAACT
+GACCTTAAAGCAAATAATAACGATACATAATACTAAAGGACGAACCTGATCCCTCATGCT
+AATCTCCCTAATTACCTTTATTGCCACAACCAACCTCCTTGGCCTCCTACCCCACTCATT
+CACACCAACTACCCAACTATCCATAAACCTGGCCATAGCAATCCCCCTATGAGCAGGCAC
+AGTAGCCACAGGCTTCCGCCTTAAGGCCAAAAATACCCTTGCCCACCTTCTACCCCAGGG
+CACACCCACCCCTCTCATTCCAATATTAATTATCATCGAAACCATTAGCCTATTTATCCA
+ACCCGTAGCCCTTGCTGTACGACTAACTGCAAATATTACAGCAGGTCACCTACTAATACA
+CCTGATCGGAGCAGCCACAATAGCCTTATCAACTATTAGCCTACCCGCAACCCCCATCAT
+CTTCACAGTCTTAACTCTACTAACAACCCTCGAAATTGCCGTAGCTCTAATCCAAGCATA
+CGTCTTCACACTCCTGGTGAGCCTCTACCTGCACGACAACACGTAATGACCCACCAATCC
+CACGCCTACCATATAGTAAAACCCAGCCCTTGGCCTCTGACAGGGGCTCTCTCAGCCCTC
+CTACTAACATCCGGCCTAGCCATATGATTCCACTTCCACTCCACCACTCTACTAACACTA
+AGCATACTGACTAACGCGCTAACCATATTCCAATGGTGGCGCGACGTAGTGCGAGAAGGC
+ACATACCAAGGCCACCACACAATACCTGTCCAAAAAGGGCTTCGCTACGGAATAGTCTTA
+TTTATTACCTCAGAAATCTTCTTCTTTGCTGGATTTTTTTGAGCATTCTACCATTCCAGC
+CTAGCCCCCACCCCCCAACTAGGAGGACACTGACCCCCAACGGGCATCACCCCACTCAAC
+CCTCTAGAAGTCCCACTCCTAAACACCTCAGTACTGCTCGCATCAGGAGTTTCAATCACC
+TGAGCCCACCACAGCCTAATAGAAAACAATCGAAATCAAATAATCCAAGCGCTACTTATC
+ACAATCCTGCTAGGCATCTACTTTACCCTCCTGCAAATCTCAGAATATTTTGAAGCCCCT
+TTCACCATCTCCGACGGCATTTATGGCTCCACATTCTTTGTAGCCACGGGCTTCCACGGG
+CTCCATGTCATTATCGGATCAACATTCCTCACCATCTGCCTTATCCGCCAACTACTATTC
+CACTTCACATCTAAACACCACTTCGGCTTCGAAGCCGCCGCTTGATATTGACATTTCGTA
+GATGTGGTTTGACTGTTCCTATATGTCTCCATCTACTGATGAGGATCCTACTCTTTTAGT
+ATAAACAGTACTGTTAACTTCCAATTAACCAGCTTCGATAACGCTCGAAAAAGAGTAATG
+AATCTGGCATTAGCCCTAATAATTAATACACTCCTAGCCCTACTACTAATGACTATTACA
+TTCTGACTACCACAGCTCAACACCTACATAGAAAAAACCAACCCCTACGAATGCGGATTT
+GACCCACTATCCCCCGCCCGCATTCCATTTTCCATAAAATTCTTCCTGGTCGCAATCACT
+TTTCTACTATTTGATCTAGAAATCGCTCTACTACTACCCCTACCGTGAGCCCTACAAACA
+ACAAACCCCTCACTGACAATCGCATCATCACTCACATTAATCACCATTTTAATCCTAAGC
+TTGGCTTACGAATGATCACAAAAAGGGCTAGACTGGGTCGAATTGGTAAGTAGTTTAAGC
+TAAAACAAATGATTTCGACTCATTAAATTATGGCAACCATACTTATCAAATGCCCCTCAT
+CTACATAAATATCACACTGGCATTCGCTATCTCACTCCTAGGCATACTAATCTACCGTTC
+ACACCTCATATCCTCCCTACTATGCCTGGAGGGAATAATATTATCATTATTCATCATGAG
+TACCCTTATAGCCTTAAACACACACTCCCTCCTAATCAACATTATGCCTGTCGTTCTGTT
+AGTCTTTGCTGCCTGCGAAGCGGCAGTAGGCCTAGCCTTACTAGTCTCAATCTCTAACAC
+ATACGGCCTAGACCACATCCACAACCTAAACCTACTCCAATGCTAAAGCTAATTATCCCC
+ACCACTATACTATTACCCCTGACATGACTATCTAAAAAACACATAATCTGAATTAACACA
+ACCACCCACAGTCTAATTATCAGCCCCATCCCGCTATTATTCTTTAACCAGGCCAACAAC
+AACCTATTTACCTACTCCCTATCCTTTTCCTCCGACCCCTTAACCACACCCCTCCTAATG
+TTGACAACCTGACTGCTCCCCCTAATAATCATAGCAAGCCAACACCACCTATCCAACGAA
+CCCCCTCTACGAAAAAAACTCTACCTATCTATATTAATTATCCTCCAAGTCTCACTAATT
+ATAACATTCACCGCCACTGAACTAATAATATTCTACGTCCTCTTTGAAACTACACTCATC
+CCCACTCTAGTTATCATTACTCGATGGGGTAACCAACCGGAACGCCTAAACGCAGGCTCA
+TACTTTCTATTCTACACCCTAGTAGGCTCCCTCCCCCTACTCATTGCACTTATCCACACC
+CACAACACCCTAGGCTCACTAAACATTATATTACTAACCCTCTCCTCCCAAAACCTAACA
+GATTCTTGATCCAATAATCTCATATGACTAGCATACATAATAGCTTTCATAGTAAAAATA
+CCCCTTTACGGACTTCACCTCTGACTCCCCAAAGCCCATGTTGAAGCCCCCATCGCTGGC
+TCAATAGTACTCGCCGCAGTACTCCTAAAACTAGGCGGCTACGGCATAATACGGCTCACC
+CTCATTCTTAGCCCACTGACAAAACACATAGCCTACCCCTTCTTAATACTATCCCTGTGA
+GGCATGATCATAACAAGCTCCATCTGCCTACGACAAACAGACCTAAAATCCCTCATCGCA
+TACTCCTCCGTAAGCCACATAGCCCTTGTAATTACAGCTATCCTTATTCAAACCCCCTGA
+AGCTTTACAGGTGCAACCGTCCTCATAATCGCCCACGGACTAACCTCTTCCCTGCTATTC
+TGCCTTGCAAACTCAAACTACGAACGAACTCACAGCCGCATCATAATCCTATCTCGAGGG
+CTCCAAGCCTTACTCCCACTGATAGCCTTCTGATGACTCGCAGCAAGCCTCGCTAACCTC
+GCCCTACCCCCCACTATTAACCTCCTAGGTGAACTCTTCGTACTAATGGCCTCCTTCTCC
+TGGGCAAACACTACTATTACACTCACCGGGCTCAACGTACTAATCACGGCCCTATACTCT
+CTTTACATATTTATCATAACACAACGAGGCACACTTACACACCACATTAAAAACATAAAA
+CCCTCACTCACACGAGAAAACATATTAATACTTATGCACCTCTTCCCCCTCCTCCTCCTA
+ACCCTCAACCCTAACATCATTACTGGCTTTACTCCCTGTAAACATAGTTTAATCAAAACA
+TTAGATTGTGAATCTAACAATAGAGGCTCGAAACCTCTTGCTTACCGAGAAAGCCCACAA
+GAACTGCTAACTCACTATCCCATGTATAACAACATGGCTTTCTCAACTTTTAAAGGATAA
+CAGCTATCCATTGGTCTTAGGACCCAAAAATTTTGGTGCAACTCCAAATAAAAGTAATAG
+CAATGTACACCACCATAGCCATTCTAACGCTAACCTCCCTAATTCCCCCCATTACAGCCA
+CCCTTATTAACCCCAATAAAAAGAACTTATACCCGCACTACGTAAAAATGACCATTGCCT
+CTACCTTTATAATCAGCCTATTTCCCACAATAATATTCATGTGCACAGACCAAGAAACCA
+TTATTTCAAACTGACACTGAACTGCAACCCAAACGCTAGAACTCTCCCTAAGCTTCAAAC
+TAGACTACTTCTCCATAATATTCATCCCCATCGCACTATTCGTCACCTGATCCATTATAG
+AATTCTCACTATGATACATACACTCAGACCCAAACATCAACCAGTTCTTCAAATATTTAC
+TCATCTTCCTCACCACAATACTAATCCTAGTTACCGCCAACAACCTGTTTCAACTCTTCA
+TCGGCTGAGAAGGCGTAGGAATCATATCCTTTCTCCTAATTGGCTGATGACACGCCCGAG
+AAGAGGCTAACACCGCAGCCATCCAGGCAATCCTATACAACCGCATCGGCGACATCGGCT
+TTATTCTAGCCCTAGCATGATTTCTCCTCCACACCAACTCGTGAGAACCACAACAAATAA
+TCCTCCTAAACTCCAACCCCAACTTTCTTCCACTGGCAGGCCTTCTTCTAGCGGCACGGG
+GAAAATCAGCCCAACTTGGGCTACACCCCTGACTTCCTTCAGCCATAGAAGGCCCGACCC
+CTGTCTCAGCCCTACTCCATTCAAGCACCATAGTCGTAGCCGGAGTTTTCCTACTCATCC
+GCTTCCACCCTCTAACAGAAAACAACCAACTAATCCAAACCCTTACACTATGCTTAGGTG
+CTATCACCACCCTATTCACAGCAATCTGTGCCCTAACACAAAACGATATCAAAAAAATCG
+TAGCATTCTCTACCTCCAGCCAACTAGGCCTAATGGTGGTCACAATTGGCATCAACCAAC
+CATACCTAGCATTCTTACATATCTGCACCCATGCCTTCTTCAAAGCCATGCTATTCATAT
+GTTCTGGATCCATTATTCATAACCTTAACAATGAACAGGACATCCGAAAAATAGGAGGCC
+TGTTTAAAACGCTGCCCCTCACCTCAACCTCCTTAACCATCGGTAGCCTCGCACTTACAG
+GAATGCCCTTCCTTACAGGCTTCTACTCCAAAGATCTTATTATCGAAACTGCAAACATAT
+CATACACCAACGCCTGAGCCCTATCCACAACTCTCATTGCCACTTCCCTAACAAGCGCCT
+ACAGCACCCGAATAATTCTCCTCACCCTAACAAACCGACCCCGCTTCCCAACCCTAACCA
+ACATCAACGAGAACAACCCCACCCTACTAAACCCCATCAAACGCCTAACAATCGGAAGCC
+TCCTAGCAGGCTTTCTCATCATTAACAGCATTCCCCCTACCTCCCCTTCCCAAACGACAA
+TCCCACTCTACCTAAAACTAACAGCCTTAAGCATCACCCTCCTAGGCTTCCTAACAGCTT
+TTGACCTTCATCTCTTGACCAACAAACTTAAAATAAAAAACCCCTCACACACATTCCATT
+TCTCCAACATACTAGGATTCTACCCCAACACCATCCACCGCACCATCCCCTACGCAAGTC
+TTACCATAAGCCAAAACCTAGCATCACTCCTACTAGACCTAGCCTGACTAGAAAAACTAA
+TACCCAAAACCATCTCACACCACCAAATCTCTGCCTCCGTCACTATTTCTTCTCAAAAAG
+GCATAATCAAGCTCTACTCCCTCTCCCTTCTAATCCCACTTTCCCTAACCCTCCTTCTAA
+TCATATAACCTATTACCTCGGGCAATCTCGATTACAATATATACGCCAACAAGCAATGTC
+CACCCAGTAACCACTACCAATCACCGCCCATAATCATACAAGGCACCCGCACCAATAGAA
+TCCTCCCGAATTAAACCCGACCCCTCCCCCTCATAAATCACCCAGCTCCCCATGTTATCA
+AAATTCAACACCATCACCAACCCGTCATATTCCTTCGCCCATAGAACCAACGCTACCTCC
+ATTACAAACCCCACTAAAACACCCACCAGGACCTCAACCCCTGACCCCCATGCCTCAGGA
+TACTCCTCAATAGCCATCGCAGTAGTATACCCAAAAACAACCATCATACCCCCCAGATAA
+ATTAAAAAAACTATCAAACCCAAATACCCCCCTCCACAATTCAAAATAACAGCACACCCC
+ACCACACCACTAACAACCAACACCAAGCCCCCATAAATAGGGGAAGGTTTGGACGAAAAT
+CCAACAAACCCCACTACTAAAATTACACTTAACAAAAGCAAAGTATATGTCATCATTCTC
+GCATGGACTACAACCACGACCAATGATACGAAAAACCATCGTTGTATTTCAACTACAAGA
+ACACCAATGACCCCCCTGCGCAAAACTAACCCACTAATAAAACTAATCAACCACTCACTT
+ATCGACCTTCCAGCCCCATCCAACATTTCTATATGATGAAACTTTGGTTCACTCCTAGGC
+GCCTGCTTGATCCTCCAGATCATCACAGGATTATTTTTAGCCATACACTACACACCAGAT
+GCCTCCACAGCTTTCTCATCAGTAGCTCACATCACCCGAGACGTAAACTACGGCTGAATC
+ATCCGCTACCTTCACGCCAACGGTGCCTCAATATTTTTTATCTGCCTATTCCTACACATC
+GGCCGAGGCCTATACTACGGTTCATTCCTTTACCTAGAAACCTGAAATATTGGCATTATC
+CTCCTACTCGCAACCATAGCAACAGCCTTCATGGGCTATGTCCTCCCATGAGGCCAAATA
+TCCTTTTGAGGGGCCACAGTAATCACAAACCTACTATCCGCCGTCCCATACATCGGAACA
+GATCTAGTCCAATGGGTCTGAGGCGGCTACTCAGTAGATAACGCCACACTCACACGCTTT
+TTCACCTTTCACTTCATCCTACCTTTCATTATCACGGCCCTAGCAGCCCTGCACCTTCTA
+TTCCTACACGAGACAGGATCAAACAATCCCTTAGGCATCTCCTCCCAACCAGACAAAATC
+GCCTTCCACCCCTACTATACAATCAAAGACATCCTAGGACTATTTCTCCTCCTCCTCATA
+CTAATAAGCCTAGTACTATTCTCACCCGACCTCCTAGGCGACCCGAGCAACTATACCCAG
+GCTAATCCCCTAAACACCCCTCCCCACATCAAACCCGAATGATACTTTTTATTCGCATAC
+GCAATTCTACGGTCCGTCCCTAATAAATTGGGAGGCGTACTAGCCCTCCTACTATCAATC
+CTCATCCTAGCAATAATCCCCGCACTCCACACAGCTAAACAGCAAAGCATGATATTTCGC
+CCACTAAGCCAGCTCACGTACTGACTCCTAGTAATAAACTTACTGATTCTCACATGAATC
+GGAGGACAACCGGTAAGCTACCCATTTATCACCATTGGACAAGTGGCATCCGCACTATAC
+TTCACCACAATCCTAGTACTTATACCAGCCGCCTCCCTAATCGAAAACAAAATACTCAAA
+TGAACCTGCCCTTGTAGTATAAGCCAATACACCGGTCTTGTAAGCCGGAACTGAAATCTT
+CCTTCCAAGGACAACTCAGAGAAAAAGTACTTAACTTCACCCTCAGCACCCAAAGCTAAA
+ATTCTAACTTAAACTATTCTCTGTATTCTCATGTGGAAGCCATTTTGGGTACAACCCCAG
+TACTAACCCACTTCTCCACAACTCTATGTACTTCGTACATTACTGCCAGTCCCCATGCAT
+ATTGTACAGTACTATAATCACTTAAGTAACTGTAGTACATTACCCACCAAACGTACATAC
+AAACGACCCCAACATGCTTACAAGCAAGCACCAGCACATCTTGACCAACTGTAGAGCATC
+CACTTCACTCTCACGACATAAACAGCAACCAGTAAAGATAGTCCATCTAAAGGGCATGAT
+GCACTCATTCATTCACCGCACATACAAACTCCCTACCACACTCAACTCACAATCCATACA
+CAACCTATTTCACATGGAAGTTTCCCGCCCAGCATCCTCCGTGAAATCAGCAACCCGCAC
+AAGAGTACTAACTCCCCTCGCTCCGGGCTTACAACACCTGGGGGTAGCTACAGTGAGCTG
+TATCCGGCATCTGGTTCTTACCTCCCGGCCATAAAGCCTAAAATCGCCCATACGTTCCCC
+TTAAATAAGACATCACGATGGATCACGGGTCTATCACCCTATTAACCAGTCACGGGAGCT
+CTCCATGCATTTGGTATCTTTTTACGGGGGCGTGCACGCGATAGCATTGCGAAACGCTGG
+AGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATACCTATCCCATCCCATTGTTGAT
+CGCGCCTACATTCCATATTCCAGCCGAGCATCCAATCCACTAAAGGTGCTAATTAATTCA
+TGCTTGTTGGACATAGCAATAACCAACCAACGTAACCCCAAACCACACTCCCTCAACGGA
+ATGAGAAAATTCACTCCGCAAACCCCCCCACACCCCCCCCCACCTTTGCCAAACCCCAAA
+AACAAAGTAACCCCAGTGAGCCAGACCCATCTTTTGGCGGTACACGCCTTTAACAGCCAC
+CCCCTCAACTAACACATATTTTTTTTTCTTCTTTTCCCTCCCACCTACTACTACTCCCTT
+ACCTCAAACCAGCCTATCCCCAAAGAGTCCCC
diff --git a/paleomix/resources/examples/phylo_pipeline/alignment/setup.sh b/paleomix/resources/examples/phylo_pipeline/alignment/setup.sh
new file mode 100755
index 0000000..4ebed53
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/alignment/setup.sh
@@ -0,0 +1,30 @@
+#!/bin/bash
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail is a command in a chain of pipes fails
+
+# For compatibility with old instructions
+
+cd ..
+./setup.sh
\ No newline at end of file
diff --git a/paleomix/resources/examples/phylo_pipeline/phylogeny/000_makefile.yaml b/paleomix/resources/examples/phylo_pipeline/phylogeny/000_makefile.yaml
new file mode 100644
index 0000000..246f5e1
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/phylogeny/000_makefile.yaml
@@ -0,0 +1,177 @@
+# -*- mode: Yaml; -*-
+Project:
+ Title: ExampleProject
+
+ # List of samples to be included in the analytical steps, which may be
+ # grouped using any arbitrary number of (levels of) groups. (Sub)groups
+ # are not required, but may be used instead of listing individual samples
+ # in 'ExcludeSamples' and 'FilterSingletons'.
+ Samples:
+ <Primates>:
+ bonobo:
+ Sex: NA
+ chimpanzee:
+ Sex: NA
+ gorilla:
+ Sex: NA
+ rCRS:
+ Sex: NA
+ GenotypingMethod: Reference Sequence
+ sumatran_orangutan:
+ Sex: NA
+ white_handed_gibbon:
+ Sex: NA
+
+ # Specifies a set of regions of interest, each representing one or more
+ # named regions in a reference sequence (e.g. genes) in BED format.
+ RegionsOfInterest:
+ protein_coding.CDS:
+ # Name of the prefix; is expected to correspond to the filename
+ # of the FASTA file without the extension / the name of the
+ # prefix used in the BAM pipeline.
+ Prefix: rCRS
+ # If true, BAM files are expected to have the postfix ".realigned";
+ # allows easier interopterability with the BAM pipeline.
+ Realigned: yes
+ # Specifies whether or not the sequences are protein coding; if true
+ # indels are only included in the final sequence if the length is
+ # divisible by 3.
+ ProteinCoding: yes
+ # Do not include indels in final sequence; note that indels are still
+ # called, and used to filter SNPs. Requires that the option
+ # 'MultipleSequenceAlignment' is enabled
+ IncludeIndels: yes
+ # List of contigs for which heterozygous SNPs should be filtered
+ # (site set to 'N') based on sex; All sexes used in the 'Samples'
+ # section must be listed:
+ HomozygousContigs:
+ NA:
+ - NC_012920_1
+
+ # Filter sites in a sample, replacing any nucleotide not observed
+ # in the specified list of samples or groups with 'N'.
+# FilterSingletons:
+# NAME_OF_SAMPLE:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+
+Genotyping:
+ # Default settings for all regions of interest
+ Defaults:
+ # Regions of interest are expanded by this number of bases when calling
+ # SNPs, in order to ensure that adjacent indels can be used during filtering
+ # (VCF_filter --min-distance-to-indels and --min-distance-between-indels).
+ # The final sequences does not include the padding.
+ Padding: 10
+
+ # By default, each set of regions of interest are genotyped seperately,
+ # even if these overlap. By setting this option to true, the entire prefix
+ # is genotyped once, and all regions of interest are extracted from this.
+ # This can only be done for prefixes that only use genotyping defaults.
+ GenotypeEntirePrefix: no
+
+ # Settings for genotyping by random sampling of nucletoides at each site
+ Random:
+ # Min distance of variants to indels
+ --min-distance-to-indels: 2
+
+ MPileup:
+ -E: # extended BAQ for higher sensitivity but lower specificity
+ -A: # count anomalous read pairs
+
+ BCFTools:
+ -g: # Call genotypes at variant sites
+
+ VCF_Filter:
+ # Maximum coverage acceptable for genotyping calls; if set to zero, the
+ # default vcf_filter value is used; if set to 'auto', the MaxDepth value
+ # will be read from the depth histograms generated by the BAM pipeline.
+ MaxReadDepth: auto
+
+ # Minimum coverage acceptable for genotyping calls
+ --min-read-depth: 6
+ # Min RMS mapping quality
+ --min-mapping-quality: 10
+ # Min QUAL score (Phred) for genotyping calls
+ --min-quality: 30
+ # Min distance of variants to indels
+ --min-distance-to-indels: 2
+ # Min distance between indels
+ --min-distance-between-indels: 10
+ # Min P-value for strand bias (given PV4)
+ --min-strand-bias: 1.0e-4
+ # Min P-value for baseQ bias (given PV4)
+ --min-baseq-bias: 1.0e-4
+ # Min P-value for mapQ bias (given PV4)
+ --min-mapq-bias: 1.0e-4
+ # Min P-value for end distance bias (given PV4)
+ --min-end-distance-bias: 1.0e-4
+ # Max frequency of the major allele at heterozygous sites
+ --min-allele-frequency: 0.2
+ # Minimum number of alternative bases observed for variants
+ --min-num-alt-bases: 2
+
+# Add / overwrite default settings for a set of regions
+# NAME_OF_REGIONS:
+# ...
+
+
+MultipleSequenceAlignment:
+ # Default settings for all regions of interest
+ Defaults:
+ Enabled: yes
+
+ # Multiple sequence alignment using MAFFT
+ MAFFT:
+ # Select alignment algorithm; valid values are 'mafft', 'auto', 'fft-ns-1',
+ # 'fft-ns-2', 'fft-ns-i', 'nw-ns-i', 'l-ins-i', 'e-ins-i', and 'g-ins-i'.
+ Algorithm: G-INS-i
+
+ # Parameters for mafft algorithm; see above for example of how to specify
+ --maxiterate: 1000
+
+# Add / overwrite default settings for a set of regions
+# NAME_OF_REGIONS:
+# ...
+
+
+PhylogeneticInference:
+ ProteinCodingGenes:
+ # Exclude (groups of) samples from this analytical step
+# ExcludeSamples:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+ # Root the final tree(s) on one or more samples; if no samples
+ # are specified, the tree(s) will be rooted on the midpoint(s)
+# RootTreesOn:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+ # If 'yes', a tree is generated per named sequence in the areas of
+ # interest; otherwise a super-matrix is created from the combined set
+ # of regions specfied below.
+ PerGeneTrees: no
+
+ # Which Regions Of Interest to build the phylogeny from.
+ RegionsOfInterest:
+ protein_coding.CDS:
+ # Partitioning scheme for sequences: Numbers specify which group a
+ # position belongs to, while 'X' excludes the position from the final
+ # partioned sequence; thus "123" splits sequences by codon-positions,
+ # while "111" produces a single partition per gene. If set to 'no',
+ # a single partition is used for the entire set of regions.
+ Partitions: "112"
+ # Limit analysis to a subset of a RegionOfInterest; subsets are expected to be
+ # located at <genome root>/<prefix>.<region name>.<subset name>.names, and
+ # contain single name (corresponding to column 4 in the BED file) per line.
+# SubsetRegions: SUBSET_NAME
+
+ ExaML:
+ # Number of times to perform full phylogenetic inference
+ Replicates: 1
+ # Number of bootstraps to compute
+ Bootstraps: 100
+ # Model of rate heterogeneity (GAMMA or PSR)
+ Model: GAMMA
diff --git a/paleomix/resources/examples/phylo_pipeline/phylogeny/data/regions/rCRS.non_coding.bed b/paleomix/resources/examples/phylo_pipeline/phylogeny/data/regions/rCRS.non_coding.bed
new file mode 100644
index 0000000..bc9e1ba
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/phylogeny/data/regions/rCRS.non_coding.bed
@@ -0,0 +1,12 @@
+NC_012920_1 0 3306 non_coding 0 +
+NC_012920_1 4262 4469 non_coding 0 +
+NC_012920_1 5510 5903 non_coding 0 +
+NC_012920_1 7442 7585 non_coding 0 +
+NC_012920_1 8266 8365 non_coding 0 +
+NC_012920_1 9204 9206 non_coding 0 +
+NC_012920_1 9989 10058 non_coding 0 +
+NC_012920_1 10402 10469 non_coding 0 +
+NC_012920_1 12136 12336 non_coding 0 +
+NC_012920_1 14145 14149 non_coding 0 +
+NC_012920_1 14673 14746 non_coding 0 +
+NC_012920_1 15888 16569 non_coding 0 +
diff --git a/paleomix/resources/examples/phylo_pipeline/phylogeny/data/regions/rCRS.protein_coding.CDS.bed b/paleomix/resources/examples/phylo_pipeline/phylogeny/data/regions/rCRS.protein_coding.CDS.bed
new file mode 100644
index 0000000..425202d
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/phylogeny/data/regions/rCRS.protein_coding.CDS.bed
@@ -0,0 +1,13 @@
+NC_012920_1 3306 4262 ENST00000361390 0 +
+NC_012920_1 4469 5510 ENST00000361453 0 +
+NC_012920_1 5903 7442 ENST00000361624 0 +
+NC_012920_1 7585 8266 ENST00000361739 0 +
+NC_012920_1 8365 8569 ENST00000361851 0 +
+NC_012920_1 8526 9204 ENST00000361899 0 +
+NC_012920_1 9206 9989 ENST00000362079 0 +
+NC_012920_1 10058 10402 ENST00000361227 0 +
+NC_012920_1 10469 10763 ENST00000361335 0 +
+NC_012920_1 10759 12136 ENST00000361381 0 +
+NC_012920_1 12336 14145 ENST00000361567 0 +
+NC_012920_1 14149 14673 ENST00000361681 0 -
+NC_012920_1 14746 15888 ENST00000361789 0 +
diff --git a/paleomix/resources/examples/phylo_pipeline/setup.sh b/paleomix/resources/examples/phylo_pipeline/setup.sh
new file mode 120000
index 0000000..f0f48ab
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/setup.sh
@@ -0,0 +1 @@
+../../../../misc/setup_phylo_pipeline_example.sh
\ No newline at end of file
diff --git a/paleomix/resources/examples/phylo_pipeline/synthesize_reads.py b/paleomix/resources/examples/phylo_pipeline/synthesize_reads.py
new file mode 120000
index 0000000..8576541
--- /dev/null
+++ b/paleomix/resources/examples/phylo_pipeline/synthesize_reads.py
@@ -0,0 +1 @@
+../../../../misc/synthesize_reads.py
\ No newline at end of file
diff --git a/paleomix/resources/reports/zonkey/report.css b/paleomix/resources/reports/zonkey/report.css
new file mode 100755
index 0000000..491b80a
--- /dev/null
+++ b/paleomix/resources/reports/zonkey/report.css
@@ -0,0 +1,267 @@
+/* CSS Document */
+
+body {
+ margin: 0;
+ font-family: "Trebuchet MS", Arial, Verdana;
+ color: #555555;
+ background-color: #FFFFFF;
+ text-align: left;
+}
+
+a {
+ color: #4D87CE;
+ font-weight: bold;
+ text-decoration: none;
+}
+
+a:hover {
+ color: #2a65ad;
+}
+
+ol, ul {
+}
+
+li {
+ line-height:20px;
+ text-align:left;
+}
+
+table {
+ width:100%;
+}
+
+table, th, td {
+ border:0px;
+ border-collapse: collapse;
+}
+
+th, td {
+ padding:5px;
+ text-align:left;
+}
+
+table tr:nth-child(even) {
+ background-color:#eee;
+}
+
+table tr:nth-child(odd) {
+ background-color:#ffffff;
+}
+
+table th {
+ background-color: #728c40;
+ color: black;
+}
+
+#header {
+ width:90%;
+ margin:0 auto 0 auto;
+ margin-bottom:40px;
+}
+
+#header h1 {
+ font-family:"Century Gothic", "Trebuchet MS", "Arial Narrow", Arial, sans-serif;
+ font-size:30px;
+ text-transform:uppercase;
+ font-weight:normal;
+ margin:0;
+ padding:0;
+ padding-top:5px;
+ color:#736451;
+ margin-bottom:10px;
+ text-align:left;
+}
+#header h2 {
+ font-family:"Century Gothic", "Trebuchet MS", "Arial Narrow", Arial, sans-serif;
+ font-size:15px;
+ text-transform:uppercase;
+ text-align:right;
+ font-weight:normal;
+ margin:0;
+ padding:0;
+ color:#000000;
+ border-bottom:1px solid #eeeeee;
+}
+
+#content {
+ max-width:1280px;
+ margin:0 auto 0 auto;
+ text-align:left;
+}
+
+.introduction {
+ font-family:Helvetica, Arial, sans-serif;
+ line-height:20px;
+ font-size:12px;
+ color:#99B198;
+ text-indent:25px;
+ background-position:bottom left;
+ padding-left:0;
+ padding-right:200px;
+ padding-bottom:35px;
+ padding-top:0;
+ background-repeat:no-repeat;
+}
+.logo {
+ float:right;
+ margin-right:30px;
+}
+.clearer {
+ clear:both;
+}
+
+#sidebar {
+ width:160px;
+ float:right;
+ font-size:95%;
+ color:#6C534A;
+ text-align:left;
+}
+#sidebar h1 {
+ font-family:"Century Gothic", "Trebuchet MS", "Arial Narrow", Arial, sans-serif;
+ font-size:20px;
+ text-transform:uppercase;
+ font-weight:normal;
+ border-bottom:1px solid #eeeeee;
+ color:#728c40;
+ text-align:right;
+}
+.submenu {
+}
+.submenu a {
+ display:block;
+ width:100%;
+ background-position:right;
+ background-repeat:no-repeat;
+ padding:5px;
+ padding-right:0;
+}
+.submenu a:hover {
+ background-color:#eeeeee;
+ background-repeat:repeat-y;
+}
+#mainbar {
+ margin-right:185px;
+ padding-right:35px;
+ background-position:top right;
+ background-repeat:no-repeat;
+}
+
+#mainbar h1 {
+ font-family:"Century Gothic", "Trebuchet MS", "Arial Narrow", Arial, sans-serif;
+ font-size:25px;
+ text-transform:uppercase;
+ font-weight:normal;
+ border-bottom:1px solid #eeeeee;
+ color:#728c40;
+}
+
+#mainbar p {
+ padding:15px;
+ line-height:20px;
+}
+
+#mainbar .articleimg {
+ float:right;
+ padding:5px;
+ border:1px solid #eeeeee;
+ margin-left:20px;
+ margin-top:20px;
+ margin-bottom:20px;
+}
+
+#footer {
+ width:100%;
+ background-color:#b6cf84;
+ border-top:1px solid #d8efa6;
+ border-bottom:5px solid #94ae62;
+ padding-top:5px;
+ padding-bottom:5px;
+ color:#FFFFFF;
+}
+#footer a {
+ color:#006600;
+}
+#footer a:hover {
+ color:#009900;
+}
+
+
+/*****************************************************************************/
+
+.image {
+ width: 100%;
+ height: 100%;
+}
+
+.image img {
+ -webkit-transition: all 0.5s ease; /* Safari and Chrome */
+ -moz-transition: all 0.5s ease; /* Firefox */
+ -ms-transition: all 0.5s ease; /* IE 9 */
+ -o-transition: all 0.5s ease; /* Opera */
+ transition: all 0.5s ease;
+}
+
+.image:hover img {
+ -webkit-transform:scale(2.5); /* Safari and Chrome */
+ -moz-transform:scale(2.5); /* Firefox */
+ -ms-transform:scale(2.5); /* IE 9 */
+ -o-transform:scale(2.5); /* Opera */
+ transform:scale(2.5);
+}
+
+
+/*****************************************************************************/
+
+ul#tabs {
+ list-style-type: none;
+ margin: 30px 0 0 0;
+ padding: 0 0 0.2em 0;
+}
+
+ul#tabs li {
+ display: inline;
+}
+
+ul#tabs li a {
+ color: #42454a;
+ background-color: #dedbde;
+ border: 1px solid #c9c3ba;
+ border-bottom: none;
+ padding: 0.3em;
+ text-decoration: none;
+}
+
+ul#tabs li a:hover {
+ background-color: #f1f0ee;
+}
+
+ul#tabs li a.selected {
+ color: #000;
+ background-color: #f1f0ee;
+ font-weight: bold;
+ padding: 0.7em 0.3em 0.38em 0.3em;
+}
+
+div.tabContent {
+ height: 15em;
+ border: 1px solid #c9c3ba;
+ padding: 0.5em;
+ background-color: #f1f0ee;
+}
+
+div.tabContent.small {
+ height: 5em;
+}
+
+div.tabContent.hide {
+ display: none;
+}
+
+div.tabContent.small div {
+ padding: 0.75em;
+}
+
+div.warning {
+ color: #dc143c;
+}
diff --git a/paleomix/resources/rscripts/common/requires.r b/paleomix/resources/rscripts/common/requires.r
new file mode 100644
index 0000000..48d756b
--- /dev/null
+++ b/paleomix/resources/rscripts/common/requires.r
@@ -0,0 +1,21 @@
+# Check that each R module named on the command line is installed, printing
+# a magic marker string plus the package version for every module found.
+# Exits non-zero (with installation instructions) on the first missing one.
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 1) {
+    cat("Usage: requires.R <module>\n")
+    quit(status=1)
+}
+
+# Matrix of installed packages; %in% scans all cells, including the
+# "Package" column, so a plain name test works.
+packages <- installed.packages()
+
+for (module in args) {
+    if (!(module %in% packages)) {
+        cat(paste("R module '", module, "' is not installed!\n", sep=''))
+        cat(paste("Please run R and execute the following command:\n\n"))
+        cat(paste("> install.packages('", module, "')\n", sep=''))
+        quit(status=1)
+    }
+
+    # Magic string to detect successful loading
+    cat(paste("d0fd3ea6:", packageVersion(module), " \n"))
+}
+
+# BUGFIX: moved outside the loop; quitting inside the loop meant that only
+# the first module was ever reported and later arguments were skipped.
+quit(status=0)
diff --git a/paleomix/resources/rscripts/zonkey/admixture.r b/paleomix/resources/rscripts/zonkey/admixture.r
new file mode 100644
index 0000000..6a4a3af
--- /dev/null
+++ b/paleomix/resources/rscripts/zonkey/admixture.r
@@ -0,0 +1,102 @@
+#!/usr/bin/env Rscript
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+require(ggplot2)
+require(reshape2)
+
+
+# Render a stacked-bar admixture plot.  'input_file' is a matrix of ancestry
+# proportions (one column per sample after transposition); 'sample_names' is
+# a table with columns Name, Group and Color.  Each ancestral component is
+# coloured by the single group it is (nearly) private to, or "ivory3" when it
+# is shared between groups.  Returns a ggplot object.
+plot.admixture <- function(input_file, sample_names)
+{
+ samples <- read.table(sample_names, as.is=TRUE, comment.char="", header=TRUE)
+ Q <- read.table(input_file)
+ Q <- t(as.matrix(Q))
+ colnames(Q) <- samples$Name
+
+ # Order by name, and then move Sample to the right (Group is '-')
+ Q <- Q[, order(samples$Group, samples$Name)]
+ Q <- cbind(Q[, -1], Q[, 1, drop=FALSE])
+
+ data <- melt(Q)
+ colors <- NULL
+ for (variable in unique(data$Var1)) {
+ # Collect the groups in which this component exceeds a trace amount
+ groups <- NULL
+
+ for (clade in unique(data$Var2)) {
+ group <- samples[clade == samples$Name, 'Group']
+ pct <- data$value[data$Var1 == variable & data$Var2 == clade]
+
+ if (group != '-' && pct >= 0.0010 ) {
+ groups <- unique(c(groups, group))
+ }
+ }
+
+ if (length(groups) != 1) {
+ # Shared (or absent) component: neutral colour, and dim the affected
+ # groups' label colour as well
+ color <- "ivory3"
+ samples$Color[samples$Group %in% groups] <- "ivory4"
+ } else {
+ color <- samples$Color[samples$Group == groups][1]
+ }
+
+ colors[[variable]] <- color
+ }
+
+ # Label colours must follow the same column ordering as Q above
+ text_color <- samples$Color[order(samples$Group, samples$Name)]
+ text_color <- c(text_color[-1], text_color[1])
+
+ pp <- ggplot(data, aes(x=Var2, y=value, fill=Var1))
+ pp <- pp + geom_bar(stat="identity")
+
+ pp <- pp + xlab(NULL)
+ pp <- pp + ylab("Ancestry")
+
+ pp <- pp + theme_minimal()
+ pp <- pp + theme(legend.position="None",
+ axis.text.y=element_blank(),
+ panel.grid.minor.y=element_blank(),
+ panel.grid.major.y=element_blank(),
+ axis.text.x=element_text(angle=25,
+ hjust=1,
+ size=12,
+ color=text_color))
+ pp <- pp + scale_fill_manual(values=colors)
+
+ return(pp)
+}
+
+
+
+# Command-line entry point: render the admixture plot as both PDF and PNG.
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 3) {
+ cat("Usage: admixture.R <input_file> <sample_names> <output_prefix>\n", file=stderr())
+ quit(status=1)
+}
+
+input_file <- args[1]
+sample_names <- args[2]
+output_prefix <- args[3]
+
+pdf(paste(output_prefix, ".pdf", sep=""))
+plot.admixture(input_file, sample_names)
+dev.off()
+
+# bitmap is preferred, since it works in a headless environment
+bitmap(paste(output_prefix, ".png", sep=""), height=6, width=6, res=96, taa=4, gaa=4)
+plot.admixture(input_file, sample_names)
+dev.off()
diff --git a/paleomix/resources/rscripts/zonkey/coverage.r b/paleomix/resources/rscripts/zonkey/coverage.r
new file mode 100644
index 0000000..62cf523
--- /dev/null
+++ b/paleomix/resources/rscripts/zonkey/coverage.r
@@ -0,0 +1,112 @@
+#!/usr/bin/env Rscript
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+args <- commandArgs(trailingOnly = TRUE)
+
+if (length(args) != 2) {
+    # BUGFIX: the usage message named a stale "test.R" script; report the
+    # real script name, and write to stderr for consistency with the other
+    # zonkey R scripts (admixture.r, pca.r, tinytree.r).
+    cat("Usage: coverage.R <table> <output_prefix>\n", file=stderr())
+    quit(status=1)
+}
+
+library(ggplot2)
+
+
+# Plot per-chromosome normalised coverage ("estimated ploidy").  Coverage is
+# corrected for uncalled bases, and autosomes are scaled so that their mean
+# corresponds to ploidy 2.  X/Y chromosomes (when present) are overlaid as
+# red letters, and dashed bands mark the expected female and putative male
+# coverage levels.  Returns a ggplot object.
+plot_coverage <- function(filename)
+{
+ tbl <- read.table(filename, as.is=TRUE, header=TRUE)
+ tbl$Hits <- as.numeric(tbl$Hits)
+ tbl$Size <- as.numeric(tbl$Size)
+ tbl$Sample <- as.factor("Sample")
+
+ # Correct size by number of uncalled ('N' / '-') bases
+ tbl$RelHits <- tbl$Hits / (tbl$Size - tbl$Ns)
+
+ autosomes <- tbl[tbl$ID != 'X' & tbl$ID != 'Y',]
+ autosomes$ID <- as.numeric(autosomes$ID)
+ autosomes$NormHits <- 2 * autosomes$RelHits / mean(autosomes$RelHits)
+
+ pp <- ggplot()
+
+ # Band spanning the autosomal (diploid / "Female") coverage range
+ pp <- pp + geom_hline(yintercept=range(autosomes$NormHits),
+ linetype='dashed', color="grey31")
+
+ labels <- data.frame(x=max(autosomes$ID),
+ y=max(autosomes$NormHits) * 1.010,
+ label='Female')
+ pp <- pp + geom_text(data=labels, aes(x=x, y=y, label=label),
+ vjust=0, hjust=1, color="grey31")
+
+ # Half-coverage band where a male X is expected to fall
+ pp <- pp + geom_hline(yintercept=0.5 * range(autosomes$NormHits),
+ linetype='dashed', color="grey")
+
+ labels <- data.frame(x=max(autosomes$ID),
+ y=max(autosomes$NormHits) * 0.505,
+ label='Male?', color="grey")
+ pp <- pp + geom_text(data=labels, aes(x=x, y=y, label=label),
+ vjust=0, hjust=1, color="grey")
+
+
+ pp <- pp + geom_point(data=autosomes, aes(x=ID, y=NormHits))
+
+ pp <- pp + ylab("Estimated ploidy")
+ pp <- pp + xlab("Chromosome")
+ pp <- pp + theme_bw()
+ pp <- pp + theme(axis.ticks.x=element_blank(),
+ panel.border=element_blank())
+
+ pp <- pp + scale_x_continuous(limits=range(autosomes$ID),
+ breaks=seq(1, max(autosomes$ID) + 10, 10))
+
+
+ # Sex chromosomes are drawn as their own letters ('X' / 'Y'), spaced
+ # evenly across the x axis
+ sex <- tbl[tbl$ID == 'X' | tbl$ID == 'Y', , drop=FALSE]
+ sex <- sex[order(sex$ID), , drop=FALSE]
+
+ if (nrow(sex) > 0) {
+ id_range <- range(autosomes$ID)
+ step <- (id_range[2] - id_range[1]) / (nrow(sex) + 1)
+ sex$x <- id_range[1] + step * 1:nrow(sex)
+
+ sex$NormHits <- 2 * sex$RelHits / mean(autosomes$RelHits)
+
+ pp <- pp + geom_point(data=sex, shape=sex$ID, color="red", size=5,
+ aes(x=x, y=NormHits))
+
+ ymin <- min(0.500, 0.95 * min(autosomes$NormHits, sex$NormHits))
+ ymax <- max(2.500, 1.05 * max(autosomes$NormHits, sex$NormHits))
+
+ pp <- pp + scale_y_continuous(breaks=seq(1, 2),
+ limits=c(ymin, ymax))
+ }
+
+ return(pp)
+}
+
+
+# Render the coverage plot as both PDF and PNG.
+input_file <- args[1]
+output_prefix <- args[2]
+
+pdf(paste(output_prefix, ".pdf", sep=""), width=5, height=5)
+plot_coverage(input_file)
+dev.off()
+
+# bitmap is preferred, since it works in a headless environment
+bitmap(paste(output_prefix, ".png", sep=""), height=5, width=5, res=96, taa=4, gaa=4)
+plot_coverage(input_file)
+dev.off()
diff --git a/paleomix/resources/rscripts/zonkey/pca.r b/paleomix/resources/rscripts/zonkey/pca.r
new file mode 100644
index 0000000..5fe8f37
--- /dev/null
+++ b/paleomix/resources/rscripts/zonkey/pca.r
@@ -0,0 +1,83 @@
+# Validate command line: an input prefix with '.evec'/'.eval' files (see
+# plot.pca below), a sample-name table, and an output prefix for figures.
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 3) {
+ cat("Usage: plot_pca.R <input_prefix> <name_table> <output_file>\n",
+ file=stderr())
+ quit(status=1)
+}
+
+
+library(ggplot2)
+library(ggrepel)
+
+
+# Widen the range of 'values' symmetrically by the fraction 'expand.by',
+# splitting the extra span evenly between the two ends.
+expanded.range <- function(values, expand.by=0.2)
+{
+    lims <- range(values)
+    pad <- abs(lims[2] - lims[1]) * expand.by
+    return(lims + c(-0.5, 0.5) * pad)
+}
+
+
+# Compute axis limits for 'x', expanded and then widened so that the x span
+# is at least as large as the (expanded) span of 'y'; used to keep the PCA
+# panel roughly square regardless of which axis has more spread.
+calc.lims <- function(x, y)
+{
+    xr <- expanded.range(x)
+    yr <- expanded.range(y)
+
+    shortfall <- (yr[2] - yr[1]) - (xr[2] - xr[1])
+    if (shortfall > 0) {
+        xr <- xr + c(-0.5, 0.5) * shortfall
+    }
+
+    return(xr)
+}
+
+
+# Scatter-plot of the first two principal components.  Reads
+# '<input_prefix>.evec' (per-sample coordinates; first column is split on
+# ':' keeping the leading part as the sample name) and '<input_prefix>.eval'
+# (eigenvalues, used for the per-axis variance labels).  Point and label
+# colours come from the names table.  Returns a ggplot object.
+# NOTE(review): 'names.table' defaults to NULL but is passed unconditionally
+# to read.table, which would fail -- callers must always supply it.
+plot.pca <- function(input_prefix, names.table=NULL)
+{
+ d <- read.table(paste(input_prefix, "evec", sep="."), as.is=TRUE)
+ v <- read.table(paste(input_prefix, "eval", sep="."), as.is=TRUE)
+
+ # Fraction of total variance captured by the first two components
+ pc1 <- round(v$V1[1] / sum(v$V1), 3)
+ pc2 <- round(v$V1[2] / sum(v$V1), 3)
+
+ d$V1 <- sapply(strsplit(d$V1, ":", fixed=TRUE), function(x) x[1])
+ d <- d[, 1:3]
+ colnames(d) <- c("Name", "X", "Y")
+
+ colors <- read.table(names.table, comment.char="", header=TRUE)
+ final <- merge(d, colors)
+
+ pp <- ggplot(final)
+ pp <- pp + geom_text_repel(aes(x=X, y=Y, label=Name))
+
+ pp <- pp + geom_point(aes(x=X, y=Y), color=final$Color, size=3)
+
+ pp <- pp + xlab(sprintf("PC1: %.1f%%", pc1 * 100))
+ pp <- pp + ylab(sprintf("PC2: %.1f%%", pc2 * 100))
+
+ pp <- pp + theme_minimal()
+ # Symmetric limits (note the swapped arguments) keep both axes comparable
+ pp <- pp + xlim(calc.lims(final$X, final$Y))
+ pp <- pp + ylim(calc.lims(final$Y, final$X))
+
+ # Force 1:1 aspect ratio for the PCA plot; this must be done in
+ # order to prevent misrepresenting distances on different axes.
+ pp <- pp + coord_fixed()
+
+ return(pp)
+}
+
+
+# Render the PCA plot as both PDF and PNG.
+input_prefix <- args[1]
+names_table <- args[2]
+output_prefix <- args[3]
+
+pdf(file=paste(output_prefix, ".pdf", sep=""))
+plot.pca(input_prefix, names_table)
+dev.off()
+
+# bitmap is preferred, since it works in a headless environment
+bitmap(paste(output_prefix, ".png", sep=""), height=6, width=6, res=96, taa=4, gaa=4)
+plot.pca(input_prefix, names_table)
+dev.off()
diff --git a/paleomix/resources/rscripts/zonkey/tinytree.r b/paleomix/resources/rscripts/zonkey/tinytree.r
new file mode 100644
index 0000000..f609425
--- /dev/null
+++ b/paleomix/resources/rscripts/zonkey/tinytree.r
@@ -0,0 +1,532 @@
+#!/usr/bin/env Rscript
+# Copyright (c) 2015 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+
+# Required for 'read.tree'
+library(ape)
+library(ggplot2)
+library(grid)
+library(methods)
+
+
+# Horizontal bar attached to a node, drawn as three segments
+# (leftmax..left, left..right, right..rightmax) with a taller middle
+# segment (see TTNode$to_bar); all values are x offsets relative to the
+# node position.  Presumably represents an interval estimate around a
+# branch position -- TODO confirm against callers.
+TTBar <- setRefClass("TTBar",
+ fields = list(leftmax = "numeric",
+ left = "numeric",
+ right = "numeric",
+ rightmax = "numeric"))
+
+
+# Reference class representing one node of a drawable tree: a list of child
+# nodes, an optional TTBar, a branch length, a label and a styling group.
+TTNode <- setRefClass("TTNode",
+    fields = list(children = "list",
+                  bar = 'TTBar',
+                  len = "numeric",
+                  label = "character",
+                  group = "character"
+                  ),
+    methods = list(
+        "initialize" = function(children = NULL, bar = NULL, len = 0,
+                                label = "", group = "") {
+            .self$children <- as.list(children)
+            if (!is.null(bar)) {
+                .self$bar <- bar
+            }
+            .self$len <- len
+            .self$label <- label
+            .self$group <- group
+        },
+
+        "show" = function() {
+            # BUGFIX: previously referenced the undefined name 'node', so
+            # auto-printing a TTNode raised an error; use .self instead.
+            print(.self$pformat())
+        },
+
+        "pformat" = function() {
+            # Newick-style representation, terminated by ';'.
+            return(paste(to_str(), ";", sep=""))
+        },
+
+        "to_str" = function() {
+            # Recursive Newick serialization: children in parentheses,
+            # then the label (if any) and ':<branch length>'.
+            fields <- NULL
+            if (length(children)) {
+                child_str <- NULL
+                for (child in children) {
+                    child_str <- c(child_str, child$to_str())
+                }
+
+                fields <- c(fields, "(", paste(child_str, sep="", collapse=","), ")")
+            }
+
+            if (nchar(label) > 0) {
+                fields <- c(fields, label)
+            }
+
+            if (length(len) > 0) {
+                fields <- c(fields, ":", len)
+            }
+
+            return(paste(fields, sep="", collapse=""))
+        },
+
+        # Number of leaf rows occupied by the children drawn above this
+        # node's line; a leaf itself occupies exactly one row.
+        "height_above" = function() {
+            total <- ifelse(length(children) > 0, 0, 1)
+            for (child in children_above()) {
+                total <- total + child$height_above() + child$height_below()
+            }
+
+            return(total)
+        },
+
+        "height_below" = function() {
+            total <- ifelse(length(children) > 0, 0, 1)
+            for (child in children_below()) {
+                total <- total + child$height_above() + child$height_below()
+            }
+
+            return(total)
+        },
+
+        "height" = function() {
+            return(height_above() + height_below())
+        },
+
+        # Greatest node-to-tip distance (sum of branch lengths).
+        "width" = function() {
+            total <- 0
+
+            for (child in children) {
+                total <- max(total, child$width())
+            }
+
+            if (length(len) > 0) {
+                total <- total + len
+            }
+
+            return(total)
+        },
+
+        # Flatten the subtree into plotting tables: 'labels' (node text),
+        # 'segments' (horizontal and vertical lines) and 'bars' (TTBar
+        # rectangles), with x/y coordinates relative to (from_x, from_y).
+        "to_tables" = function(from_x=0, from_y=0) {
+            current_x <- from_x + ifelse(length(len) > 0, len, 0)
+
+            # Horizontal line
+            tables <- list(
+                labels=data.frame(
+                    start_x=current_x,
+                    start_y=from_y + calc_offset(from_y),
+                    label=label,
+                    group=get_labelgroup()
+                ),
+                segments=data.frame(
+                    start_x=from_x,
+                    start_y=from_y + calc_offset(from_y),
+                    end_x=current_x,
+                    end_y=from_y + calc_offset(from_y),
+                    group=get_linegroup()),
+                bars=to_bar(current_x, from_y + calc_offset(from_y)))
+
+            max_y <- from_y
+            current_y <- max_y
+            for (child in children_above()) {
+                current_y <- current_y + child$height_below()
+                tables <- merge_tables(tables, child$to_tables(current_x, current_y))
+                max_y <- current_y + child$calc_offset(current_y)
+                current_y <- current_y + child$height_above()
+            }
+
+            min_y <- from_y
+            current_y <- min_y
+            for (child in children_below()) {
+                current_y <- current_y - child$height_above()
+                tables <- merge_tables(tables, child$to_tables(current_x, current_y))
+                min_y <- current_y + child$calc_offset(current_y)
+                current_y <- current_y - child$height_below()
+            }
+
+            # Vertical line
+            tables$segments <- rbind(tables$segments,
+                data.frame(
+                    start_x=current_x,
+                    start_y=max_y,
+                    end_x=current_x,
+                    end_y=min_y,
+                    group=get_linegroup()))
+
+            return(tables)
+        },
+
+        "to_bar" = function(current_x, current_y) {
+            # Empty table unless all four bar fields have been set.
+            if (length(c(bar$leftmax, bar$left, bar$right, bar$rightmax)) != 4) {
+                return(data.frame())
+            }
+
+            return(data.frame(
+                start_x=c(bar$leftmax, bar$left, bar$right) + current_x,
+                end_x=c(bar$left, bar$right, bar$rightmax) + current_x,
+                start_y=current_y - c(0.25, 0.5, 0.25),
+                end_y=current_y + c(0.25, 0.5, 0.25)))
+        },
+
+        "merge_tables" = function(tbl_a, tbl_b) {
+            # BUGFIX: the key set was built with unique(names(tbl_a),
+            # names(tbl_b)), which passes the second vector as unique()'s
+            # 'incomparables' argument; union() is what was intended.
+            result <- list()
+            for (name in union(names(tbl_a), names(tbl_b))) {
+                result[[name]] <- rbind(tbl_a[[name]], tbl_b[[name]])
+            }
+            return(result)
+        },
+
+        "clade" = function(taxa) {
+            # Smallest subtree containing every name in 'taxa', or NULL
+            # when some names are absent from this subtree.
+            # FIXME: Handle multiple tips with identical label
+            if (length(intersect(taxa, tips())) != length(taxa)) {
+                return(NULL)
+            }
+
+            for (child in children) {
+                if (length(intersect(taxa, child$tips())) == length(taxa)) {
+                    return(child$clade(taxa))
+                }
+            }
+
+            return(.self)
+        },
+
+        "tips" = function(taxa) {
+            # All leaf labels in this subtree; 'taxa' is unused but kept
+            # for interface compatibility with existing callers.
+            if (length(children) == 0) {
+                return(label)
+            } else {
+                result <- NULL
+                for (child in children) {
+                    result <- c(result, child$tips())
+                }
+                return(result)
+            }
+        },
+
+        # Vertical offset of this node's horizontal line: midway between
+        # the outermost child lines above and below it.
+        "calc_offset" = function(from_y) {
+            max_y <- from_y
+            current_y <- from_y
+            for (child in children_above()) {
+                current_y <- current_y + child$height_below()
+                max_y <- current_y + child$calc_offset(current_y)
+                current_y <- current_y + child$height_above()
+            }
+
+            min_y <- from_y
+            current_y <- from_y
+            for (child in children_below()) {
+                current_y <- current_y - child$height_above()
+                min_y <- current_y + child$calc_offset(current_y)
+                current_y <- current_y - child$height_below()
+            }
+
+            return(max_y - from_y - (max_y - min_y) / 2)
+        },
+
+        # First half of the children, drawn above this node's line.
+        "children_above" = function() {
+            if (length(children) < 1) {
+                return(list())
+            }
+            return(children[1:ceiling(length(children) / 2)])
+        },
+
+        # Remaining children, drawn below.  BUGFIX: guard against R's
+        # descending-range pitfall -- for a single child the previous
+        # expression 'children[2:1]' returned that child again (plus a
+        # NULL slot) instead of an empty list.
+        "children_below" = function() {
+            first <- ceiling(length(children) / 2) + 1
+            if (length(children) < 1 || first > length(children)) {
+                return(list())
+            }
+            return(children[first:length(children)])
+        },
+
+        # Styling key for this node's label: "node" or "leaf", optionally
+        # suffixed with ":<group>".
+        "get_labelgroup" = function(prefix=NULL) {
+            if (is.null(prefix)) {
+                prefix <- ifelse(length(children) > 0, "node", "leaf")
+            }
+
+            if (nchar(group) > 0) {
+                prefix <- paste(prefix, ":", group, sep="")
+            }
+
+            return(prefix)
+        },
+
+        # Styling key for this node's line segments.
+        "get_linegroup" = function() {
+            prefix <- "line"
+            if (nchar(group) > 0) {
+                prefix <- paste(prefix, ":", group, sep="")
+            }
+
+            return(prefix)
+        },
+
+        "set_group" = function(value=NULL) {
+            # Assign a styling group to this node and its entire subtree.
+            .self$group <- ifelse(is.null(value), "line", value)
+            for (child in children) {
+                child$set_group(value)
+            }
+        },
+
+        "is_leaf" = function() {
+            '
+            Convenience function; returns true if the node is a leaf.
+            '
+            return(length(children) == 0)
+        },
+
+        "collect" = function() {
+            '
+            Returns a vector of all nodes in the tree, including this node.
+            '
+            result <- .self
+            for (child in children) {
+                result <- c(result, child$collect())
+            }
+            return(result)
+        },
+
+        "sort_nodes" = function() {
+            # Recursively order children by subtree width (narrowest first)
+            # for a tidier, ladderized layout.
+            if (!is_leaf()) {
+                widths <- NULL
+                for (child in children) {
+                    widths <- c(widths, child$width())
+                }
+
+                .self$children <- children[order(widths, decreasing=FALSE)]
+            }
+        }))
+
+
+# S3 print method for TTNode objects; delegates to the node's own
+# Newick-style formatter.
+print.TTNode <- function(node) print(node$pformat())
+
+
+# Convert an ape 'phylo' object into the TTNode tree used by the drawing
+# code.  One TTNode is created per node index of the edge table; parents
+# are then linked to children, and the root (index ntips + 1 by ape
+# convention) is returned with its branch length forced to zero.
+tinytree.phylo.to.tt <- function(phylo)
+{
+ nnodes <- nrow(phylo$edge) + 1
+ lengths <- phylo$edge.length
+ to.node <- phylo$edge[, 2]
+ from.node <- phylo$edge[, 1]
+
+ nodes <- list()
+ labels <- c(phylo$tip.label, phylo$node.label)
+ for (edge in 1:nnodes) {
+ len <- lengths[to.node == edge]
+ nodes[[edge]] <- TTNode(label = labels[edge],
+ len = as.numeric(len))
+ }
+
+ # Link each node to its parent's child list
+ for (edge in 1:nnodes) {
+ from <- from.node[to.node == edge]
+ if (length(from) != 0 && from != 0) {
+ children <- nodes[[from]]$children
+ children[[length(children) + 1]] <- nodes[[edge]]
+ nodes[[from]]$children <- children
+ }
+ }
+
+ root <- nodes[[length(phylo$tip.label) + 1]]
+ root$len <- 0
+
+ return(root)
+}
+
+
+# Parse a Newick file with ape::read.tree and convert the resulting
+# 'phylo' object into a TTNode tree.
+tinytree.read.newick <- function(filename)
+{
+    phylo <- read.tree(filename)
+    return(tinytree.phylo.to.tt(phylo))
+}
+
+
+# Build the named vector of per-group style values used by the scales:
+# user-supplied named 'values' override the built-in 'defaults', and every
+# group key present in the tree ("node:X", "leaf:X", "line:X") inherits
+# from its base type ("node", "leaf", "line") unless set explicitly.
+tinytree.defaults.collect <- function(tt, defaults, values)
+{
+ stopifnot(!any(is.null(names(values))) || length(values) == 0)
+
+ # Overwrite using user supplied values
+ for (idx in seq(values)) {
+ defaults[[names(values)[idx]]] <- values[[idx]]
+ }
+
+ # Set default values based on type (line, node, leaf, etc.)
+ for (node in tt$collect()) {
+ for (type in c(node$get_labelgroup(), node$get_linegroup())) {
+ if (!(type %in% names(defaults))) {
+ root <- unlist(strsplit(type, ":"))[[1]]
+ stopifnot(root %in% names(defaults))
+ defaults[[type]] <- defaults[[root]]
+ }
+ }
+ }
+
+ return(defaults)
+}
+
+
+# Apply the default colour and fill scales to a tree plot, honouring any
+# per-group overrides passed as named arguments.
+tinytree.default.colours <- function(pp, tt, ...)
+{
+    base <- c("line" = "black",
+              "node" = "darkgrey",
+              "leaf" = "black",
+              "bar"  = "blue")
+    palette <- tinytree.defaults.collect(tt, base, list(...))
+
+    pp <- pp + scale_colour_manual(values=palette)
+    pp <- pp + scale_fill_manual(values=palette)
+
+    return(pp)
+}
+
+
+# Apply the default size scale (line width / text sizes) to a tree plot,
+# honouring any per-group overrides passed as named arguments.
+tinytree.default.sizes <- function(pp, tt, ...)
+{
+    base <- c("line" = 0.5,
+              "node" = 4,
+              "leaf" = 5)
+    sizes <- tinytree.defaults.collect(tt, base, list(...))
+
+    return(pp + scale_size_manual(values=sizes))
+}
+
+
+# Render a TTNode tree as a ggplot object.  'xaxis' selects how branch
+# lengths are indicated: "axis" (keep the normal x axis), "scales" (a small
+# scale bar below the tree) or "none".  'padding' reserves horizontal space
+# for tip labels, as a fraction of the tree width.
+tinytree.draw <- function(tt, default.scales=TRUE, xaxis="scales", padding=0.3)
+{
+ tbl <- tt$to_tables(-tt$len)
+
+ pp <- ggplot()
+ pp <- pp + geom_segment(data=tbl$segments, lineend="round",
+ aes(x=start_x, y=start_y, xend=end_x, yend=end_y,
+ color=group, size=group))
+
+ if (nrow(tbl$bars) > 0) {
+ pp <- pp + geom_rect(data=tbl$bars, alpha=0.3,
+ aes(xmin=start_x, xmax=end_x, ymin=start_y, ymax=end_y,
+ fill="bar"))
+ }
+
+ if (any(!is.na(tbl$labels$label))) {
+ labels <- tbl$labels[!is.na(tbl$labels$label),]
+ pp <- pp + geom_text(data=labels, hjust=0,
+ aes(label=sprintf(" %s", label),
+ x=start_x, y=start_y, color=group, size=group))
+ }
+
+ pp <- pp + theme_minimal()
+
+ # Disable legend
+ pp <- pp + theme(legend.position="none",
+ # Disable y axis + y axis labels + grid
+ axis.ticks.y=element_blank(),
+ axis.text.y=element_blank(),
+ panel.grid.minor.y=element_blank(),
+ panel.grid.major.y=element_blank(),
+ panel.grid.major = element_line(colour = "grey90", size = 0.4),
+ panel.grid.minor = element_line(colour = "grey90", size = 0.2))
+
+ if (xaxis != "axis") {
+ stopifnot(xaxis %in% c("scales", "none"))
+ pp <- pp + theme(axis.ticks.x=element_blank(),
+ axis.text.x=element_blank(),
+ panel.grid.minor.x=element_blank(),
+ panel.grid.major.x=element_blank())
+
+ if (xaxis == "scales") {
+ # Scale bar (20% of tree width) drawn below the tree
+ y_offset <- min(tbl$segments$start_y, tbl$segments$end_y) - 3
+ x_offset <- max(tbl$segments$end_x) * 0.2
+
+ df <- data.frame(x=0, y=y_offset, xend=x_offset, yend=y_offset)
+ pp <- pp + geom_segment(data=df,
+ aes(color="line", size="line",
+ x=x, xend=xend, y=y, yend=yend))
+
+ df <- data.frame(x=x_offset, y=y_offset, label=paste("", signif(x_offset, 2)))
+ pp <- pp + geom_text(data=df, aes(x=x, y=y, hjust=0, label=label,
+ size="leaf", colour="leaf"))
+ }
+ }
+
+ # Disable axis labels by default
+ pp <- pp + xlab(NULL)
+ pp <- pp + ylab(NULL)
+
+ # Default colors; may be overwritten
+ if (default.scales) {
+ pp <- tinytree.default.sizes(pp, tt)
+ pp <- tinytree.default.colours(pp, tt)
+ }
+
+ # Extend the x range so tip labels are not clipped
+ range <- max(tbl$segments$end_x) - min(tbl$segments$start_x)
+ pp <- pp + coord_cartesian(xlim=c(min(tbl$segments$start_x),
+ max(tbl$segments$end_x) + padding * range))
+
+ return(pp)
+}
+
+
+# Draw the newick tree in 'filename', colouring each leaf according to the
+# 'Color' column of the sample table ('sample_names'); returns a ggplot
+# object.  Leaves not present in the sample table keep the default colour.
+plot.tree <- function(filename, sample_names, padding=0.3)
+{
+    samples <- read.table(sample_names, as.is=TRUE, comment.char="", header=TRUE)
+    tt <- tinytree.read.newick(filename)
+    tt$sort_nodes()
+
+    # Group each leaf by its own label so it can be styled individually
+    for (node in tt$collect()) {
+        if (node$is_leaf()) {
+            node$set_group(node$label)
+        }
+    }
+
+    pp <- tinytree.draw(tt,
+                        default.scales=FALSE,
+                        padding=padding)
+    pp <- tinytree.default.sizes(pp, tt, "node"=3, "leaf"=4, "line"=0.75)
+
+    defaults <- c("line"="black",
+                  "node"="grey40",
+                  "leaf"="black",
+                  "bar"="blue")
+    defaults <- tinytree.defaults.collect(tt, defaults, list())
+
+    # Per-sample leaf colours override the generic defaults.
+    # BUGFIX: leftover debug print() calls removed, and the loop index no
+    # longer shadows the extracted row.
+    for (idx in 1:nrow(samples)) {
+        row <- samples[idx, , drop=FALSE]
+        defaults[[sprintf("leaf:%s", row$Name)]] <- row$Color
+    }
+
+    return(pp +
+           scale_colour_manual(values=defaults) +
+           scale_fill_manual(values=defaults))
+}
+
+
+# Command-line entry point: render the tree as both PDF and PNG.
+args <- commandArgs(trailingOnly = TRUE)
+if (length(args) != 3) {
+ cat("Usage: ggtinytree.R <input_file> <sample_names> <output_prefix>\n", file=stderr())
+ quit(status=1)
+}
+
+input_file <- args[1]
+sample_names <- args[2]
+output_prefix <- args[3]
+
+pdf(paste(output_prefix, ".pdf", sep=""))
+plot.tree(input_file, sample_names)
+dev.off()
+
+# bitmap is preferred, since it works in a headless environment
+bitmap(paste(output_prefix, ".png", sep=""), height=6, width=6, res=96, taa=4, gaa=4)
+plot.tree(input_file, sample_names)
+dev.off()
+
diff --git a/paleomix/resources/rscripts/zonkey/treemix.r b/paleomix/resources/rscripts/zonkey/treemix.r
new file mode 100644
index 0000000..9f3f525
--- /dev/null
+++ b/paleomix/resources/rscripts/zonkey/treemix.r
@@ -0,0 +1,651 @@
+#
+# Copyright (C) 2011 by Joseph Pickrell
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+#
+
+#
+# functions for plotting a tree
+#
+
+library(RColorBrewer)
+# Assign y coordinates to every vertex of the TreeMix vertex table 'd': the
+# ROOT and its two children are placed first, then the rest is filled in
+# recursively by set_y_coord.  Presumably columns are 1=id, 3=root flag,
+# 7/9=child ids and 8/10=subtree weights -- confirm against the TreeMix
+# '.vertices.gz' file format.
+set_y_coords = function(d){
+ i = which(d[,3]=="ROOT")
+ y = d[i,8]/ (d[i,8]+d[i,10])
+ d[i,]$y = 1-y
+ d[i,]$ymin = 0
+ d[i,]$ymax = 1
+ c1 = d[i,7]
+ c2 = d[i,9]
+ ni = which(d[,1]==c1)
+ ny = d[ni,8]/ (d[ni,8]+d[ni,10])
+ d[ni,]$ymin = 1-y
+ d[ni,]$ymax = 1
+ d[ni,]$y = 1- ny*(y)
+
+ ni = which(d[,1]==c2)
+ ny = d[ni,8]/ (d[ni,8]+d[ni,10])
+ d[ni,]$ymin = 0
+ d[ni,]$ymax = 1-y
+ d[ni,]$y = (1-y)-ny*(1-y)
+
+ for (j in 1:nrow(d)){
+ d = set_y_coord(d, j)
+ }
+ return(d)
+}
+
+# Recursively assign a y coordinate (and [ymin, ymax] range) to vertex i,
+# placing it within its parent's range; the parent is positioned first if
+# needed.  TIP vertices are centred in their half of the parent's range.
+set_y_coord = function(d, i){
+ index = d[i,1]
+ parent = d[i,6]
+ if (!is.na(d[i,]$y)){
+ return(d)
+ }
+ tmp = d[d[,1] == parent,]
+ if ( is.na(tmp[1,]$y)){
+ d = set_y_coord(d, which(d[,1]==parent))
+ tmp = d[d[,1]== parent,]
+ }
+ py = tmp[1,]$y
+ pymin = tmp[1,]$ymin
+ pymax = tmp[1,]$ymax
+ f = d[i,8]/( d[i,8]+d[i,10])
+ #print (paste(i, index, py, pymin, pymax, f))
+ if (tmp[1,7] == index){
+ d[i,]$ymin = py
+ d[i,]$ymax = pymax
+ d[i,]$y = pymax-f*(pymax-py)
+ if (d[i,5]== "TIP"){
+ d[i,]$y = (py+pymax)/2
+ }
+ }
+ else{
+ d[i,]$ymin = pymin
+ d[i,]$ymax = py
+ d[i,]$y = py-f*(py-pymin)
+ if (d[i,5]== "TIP"){
+ d[i,]$y = (pymin+py)/2
+ }
+
+ }
+ return(d)
+}
+
+
+# Assign x coordinates (drift distance from the root) to the ROOT and its
+# two children, then recursively to all vertices via set_x_coord.  When the
+# direct edge length is missing (child reached through migration vertices),
+# get_dist_to_nmig walks down to the next non-migration vertex; negative
+# distances are clamped to zero.
+set_x_coords = function(d, e){
+ i = which(d[,3]=="ROOT")
+ index = d[i,1]
+ d[i,]$x = 0
+ c1 = d[i,7]
+ c2 = d[i,9]
+ ni = which(d[,1]==c1)
+ tmpx = e[e[,1]==index & e[,2] == c1,3]
+ if (length(tmpx) == 0){
+ tmp = e[e[,1] == index,]
+ tmpc1 = tmp[1,2]
+ if ( d[d[,1]==tmpc1,4] != "MIG"){
+ tmpc1 = tmp[2,2]
+ }
+ tmpx = get_dist_to_nmig(d, e, index, tmpc1)
+ }
+ if(tmpx < 0){
+ tmpx = 0
+ }
+ d[ni,]$x = tmpx
+
+ ni = which(d[,1]==c2)
+ tmpx = e[e[,1]==index & e[,2] == c2,3]
+ if (length(tmpx) == 0){
+ tmp = e[e[,1] == index,]
+ tmpc2 = tmp[2,2]
+ if ( d[d[,1]==tmpc2,4] != "MIG"){
+ tmpc2 = tmp[1,2]
+ }
+ tmpx = get_dist_to_nmig(d, e, index, tmpc2)
+ }
+ if(tmpx < 0){
+ tmpx = 0
+ }
+ d[ni,]$x = tmpx
+
+ for (j in 1:nrow(d)){
+ d = set_x_coord(d, e, j)
+ }
+ return(d)
+}
+
+
+# Recursively assign an x coordinate to vertex i as its parent's x plus the
+# connecting edge length (positioning the parent first when necessary);
+# falls back to get_dist_to_nmig when no direct edge exists, and clamps
+# negative lengths to zero.
+set_x_coord = function(d, e, i){
+ index = d[i,1]
+ parent = d[i,6]
+ if (!is.na(d[i,]$x)){
+ return(d)
+ }
+ tmp = d[d[,1] == parent,]
+ if ( is.na(tmp[1,]$x)){
+ d = set_x_coord(d, e, which(d[,1]==parent))
+ tmp = d[d[,1]== parent,]
+ }
+ #print (paste(parent, index))
+ tmpx = e[e[,1]==parent & e[,2] == index,3]
+ if (length(tmpx) == 0){
+ tmp2 = e[e[,1] == parent,]
+ tmpc2 = tmp2[2,2]
+ #print
+ if ( d[d[,1]==tmpc2,4] != "MIG"){
+ tmpc2 = tmp2[1,2]
+ }
+ tmpx = get_dist_to_nmig(d, e, parent, tmpc2)
+ }
+ if(tmpx < 0){
+ tmpx = 0
+ }
+ d[i,]$x = tmp[1,]$x+ tmpx
+ return(d)
+}
+
+# Low-level renderer used by plot_tree: draws tree edges (migration edges
+# as heat-coloured arrows weighted by migration fraction), tip labels
+# coloured per sample, an optional "10 s.e." scale bar ('scale') and an
+# optional migration-weight colour bar ('mbar').
+plot_tree_internal = function(d, e, samples, o = NA, cex = 1, disp = 0.005, plus = 0.005, arrow = 0.05, ybar = 0.01, scale = T, mbar = T, mse = 0.01, plotmig = T, plotnames = T, xmin = 0, lwd = 1, font = 1){
+ plot(d$x, d$y, axes = F, ylab = "", xlab = "Drift parameter", xlim = c(xmin, max(d$x)+plus), pch = "")
+ axis(1)
+ mw = max(e[e[,5]=="MIG",4])
+ mcols = rev(heat.colors(150))
+ for(i in 1:nrow(e)){
+ col = "black"
+ if (e[i,5] == "MIG"){
+ # Map migration weight onto the heat palette; the scaling halves
+ # when the largest weight exceeds 0.5
+ w = floor(e[i,4]*200)+50
+ if (mw > 0.5){
+ w = floor(e[i,4]*100)+50
+ }
+ col = mcols[w]
+ if (is.na(col)){
+ col = "blue"
+ }
+ }
+ v1 = d[d[,1] == e[i,1],]
+ v2 = d[d[,1] == e[i,2],]
+ if (e[i,5] == "MIG"){
+ if (plotmig){
+ arrows( v1[1,]$x, v1[1,]$y, v2[1,]$x, v2[1,]$y, col = col, length = arrow)
+ }
+ }
+ else{
+ lines( c(v1[1,]$x, v2[1,]$x), c(v1[1,]$y, v2[1,]$y), col = col, lwd = lwd)
+ }
+ }
+ tmp = d[d[,5] == "TIP",]
+ if ( !is.na(o)){
+ # Colours taken from an explicit population-order table
+ for(i in 1:nrow(tmp)){
+ tcol = o[o[,1] == tmp[i,2],2]
+ if(plotnames){
+ #print(tmp[i,2])
+ text(tmp[i,]$x+disp, tmp[i,]$y, labels = tmp[i,2], adj = 0, cex = cex, col = tcol, font = font)
+ }
+ }
+ }
+ else{
+ if (plotnames){
+ # Colours taken from the zonkey sample table; unknown names get black
+ cols = NULL
+ for (name in tmp[, 2]) {
+ if (name %in% samples$Name) {
+ cols <- append(cols, head(samples$Color[samples$Name == name], 1))
+ } else {
+ cols <- append(cols, "#000000")
+ }
+ }
+
+ text(tmp$x+disp, tmp$y, labels = tmp[,2], adj = 0, cex = cex, font = font, col=cols)
+ }
+ }
+ if (scale){
+ print (paste("mse", mse))
+ lines(c(0, mse*10), c(ybar, ybar))
+ text( 0, ybar - 0.04, lab = "10 s.e.", adj = 0, cex = 0.8)
+ lines( c(0, 0), c( ybar - 0.01, ybar+0.01))
+ lines( c(mse*10, mse*10), c(ybar- 0.01, ybar+ 0.01))
+ }
+ if (mbar){
+ # Vertical colour bar explaining the migration-weight palette
+ mcols = rev( heat.colors(150) )
+ mcols = mcols[50:length(mcols)]
+ ymi = ybar+0.15
+ yma = ybar+0.35
+ l = 0.2
+ w = l/100
+ xma = max(d$x/20)
+ rect( rep(0, 100), ymi+(0:99)*w, rep(xma, 100), ymi+(1:100)*w, col = mcols, border = mcols)
+ text(xma+disp, ymi, lab = "0", adj = 0, cex = 0.7)
+ if ( mw >0.5){ text(xma+disp, yma, lab = "1", adj = 0, cex = 0.7)}
+ else{
+ text(xma+disp, yma, lab = "0.5", adj = 0, cex =0.7)
+ }
+ text(0, yma+0.06, lab = "Migration", adj = 0 , cex = 0.6)
+ text(0, yma+0.03, lab = "weight", adj = 0 , cex = 0.6)
+ }
+}
+
+# Position each migration ("MIG") vertex along the parent-to-child segment,
+# interpolated by the migration fraction stored in the edge table (column 6).
+# NOTE(review): the print() below looks like leftover debug output.
+set_mig_coords = function(d, e){
+ for (j in 1:nrow(d)){
+ if (d[j,4] == "MIG"){
+ p = d[d[,1] == d[j,6],]
+ c = d[d[,1] == d[j,7],]
+ tmpe = e[e[,1] == d[j,1],]
+ y1 = p[1,]$y
+ y2 = c[1,]$y
+ x1 = p[1,]$x
+ x2 = c[1,]$x
+
+ mf = tmpe[1,6]
+ if (is.nan(mf)){
+ mf = 0
+ }
+ #d[j,]$y = (y1+y2)* mf
+ #d[j,]$x = (x1+x2) *mf
+ d[j,]$y = y1+(y2-y1)* mf
+ print(paste(mf, x1, x2))
+ d[j,]$x = x1+(x2-x1) *mf
+ }
+
+ }
+ return(d)
+
+}
+
+# Fraction of the variance in the observed covariance matrix explained by
+# the model: 1 - var(residuals) / var(observed), computed over the strict
+# upper triangle of the (identically sorted) '.cov.gz' / '.modelcov.gz'
+# matrices for the given output stem.
+get_f = function(stem){
+    d = paste(stem, ".cov.gz", sep = "")
+    d2 = paste(stem, ".modelcov.gz", sep = "")
+    d = read.table(gzfile(d), as.is = T, comment.char = "", quote = "")
+    d2 = read.table(gzfile(d2), as.is = T, comment.char = "", quote = "")
+    d = d[order(names(d)), order(names(d))]
+    d2 = d2[order(names(d2)), order(names(d2))]
+    tmpcf = vector()
+    tmpmcf = vector()
+    # BUGFIX: iterate j only up to nrow(d) - 1.  The old outer bound of
+    # nrow(d) made the inner range '(j+1):nrow(d)' count DOWN for the last
+    # row and index column nrow(d) + 1, out of bounds for the square matrix.
+    for (j in 1:(nrow(d) - 1)){
+        for (k in (j+1):nrow(d)){
+            tmpcf = append(tmpcf, d[j,k])
+            tmpmcf = append(tmpmcf, d[j,k] - d2[j,k])
+        }
+    }
+    tmpv = var(tmpmcf)/var(tmpcf)
+    return(1-tmpv)
+}
+
+# Top-level TreeMix tree plot: reads '<stem>.vertices.gz', '<stem>.edges.gz'
+# and '<stem>.covse.gz', computes the layout and draws it via
+# plot_tree_internal.  Returns list(d=vertex table, e=edge table).
+plot_tree = function(stem, samples, o = NA, cex = 1, disp = 0.003, plus = 0.01, flip = vector(), arrow = 0.05, scale = T, ybar = 0.1, mbar = T, plotmig = T, plotnames = T, xmin = 0, lwd = 1, font = 1){
+ samples = read.table(samples, as.is=T, header=T, comment.char="")
+ d = paste(stem, ".vertices.gz", sep = "")
+ e = paste(stem, ".edges.gz", sep = "")
+ se = paste(stem, ".covse.gz", sep = "")
+ d = read.table(gzfile(d), as.is = T, comment.char = "", quote = "")
+ e = read.table(gzfile(e), as.is = T, comment.char = "", quote = "")
+ if (!is.na(o)){
+ o = read.table(o, as.is = T, comment.char = "", quote = "")
+ }
+ # NOTE(review): the next statement is executed twice, scaling branch
+ # lengths by e[,4]^2 rather than e[,4]; this duplication is inherited
+ # from TreeMix's plotting_funcs.R -- confirm intent before changing.
+ e[,3] = e[,3]*e[,4]
+ e[,3] = e[,3]*e[,4]
+
+ # Mean standard error of the covariances, used for the scale bar
+ se = read.table(gzfile(se), as.is = T, comment.char = "", quote = "")
+ m1 = apply(se, 1, mean)
+ m = mean(m1)
+ #m = 0
+ for(i in 1:length(flip)){
+ d = flip_node(d, flip[i])
+ }
+ d$x = "NA"
+ d$y = "NA"
+ d$ymin = "NA"
+ d$ymax = "NA"
+ d$x = as.numeric(d$x)
+ d$y = as.numeric(d$y)
+ d$ymin = as.numeric(d$ymin)
+ d$ymax = as.numeric(d$ymax)
+
+ d = set_y_coords(d)
+ d = set_x_coords(d, e)
+ d = set_mig_coords(d, e)
+ plot_tree_internal(d, e, samples, o = o, cex = cex, xmin = xmin, disp = disp, plus = plus, arrow = arrow, ybar = ybar, mbar = mbar, mse = m, scale = scale, plotmig = plotmig, plotnames = plotnames, lwd = lwd, font = font)
+ return(list( d= d, e = e))
+}
+
+# Total branch length from vertex n1 down through n2, following the chain
+# of NOT_MIG edges past any intermediate migration ("MIG") vertices until a
+# non-migration vertex is reached.
+get_dist_to_nmig = function(d, e, n1, n2){
+    total = e[e[,1] == n1 & e[,2] == n2, 3]
+    while (d[d[,1] == n2, 4] == "MIG"){
+        edge = e[e[,1] == n2 & e[,5] == "NOT_MIG",]
+        total = total + edge[1, 3]
+        n2 = edge[1, 2]
+    }
+    return(total)
+}
+
# Swap the two child pointers (columns 7:8 and 9:10) of node 'n' in the
# vertex table 'd', mirroring the order of the subtrees below that node.
# If 'n' matches no row, 'd' is returned unchanged.
flip_node = function(d, n){
    row = which(d[,1] == n)
    first = c(7, 8)
    second = c(9, 10)
    held = d[row, first]
    d[row, first] = d[row, second]
    d[row, second] = held
    return(d)
}
+
# Plot the model-predicted ("fitted") covariance matrix (<stem>.modelcov.gz)
# as a heat map, with populations ordered as listed in 'pop_order'.
# NOTE(review): the local variable 'c' shadows base::c within this function.
plot_modelcov = function(stem, pop_order, min = -0.009, max = 0.009, cex = 1, usemax = T){
    c = read.table(gzfile(paste(stem, ".modelcov.gz", sep = "")), as.is = T, head = T)
    o = read.table(pop_order, as.is = T, comment.char = "", quote = "")


    # Re-order rows/columns of the matrix to match the requested order.
    toplot = data.frame(matrix(nrow = nrow(c), ncol = ncol(c)))
    for(i in 1:nrow(o)){
        for( j in 1:nrow(o)){

            toplot[i, j] = c[which(names(c)==o[i,1]), which(names(c)==o[j,1])]
        }
    }
    # Optionally derive symmetric color-scale limits from the data itself.
    if (usemax){
        m1 = max(abs(toplot))
        max = m1*1.1
        min = -(m1*1.1)
    }
    names(toplot) = o[,1]
    plot_resid_internal(toplot, max = max, min = min)
}
+
+
+
# Plot the observed covariance matrix (<stem>.cov.gz) as a heat map, with
# populations ordered as listed in 'pop_order'.
# wcols is forwarded to plot_cov_internal ("rb" = red/blue alpha ramp).
plot_cov = function(stem, pop_order, min = -0.009, max = 0.009, cex = 1, usemax = T, wcols = ""){
    c = read.table(gzfile(paste(stem, ".cov.gz", sep = "")), as.is = T, head = T)
    o = read.table(pop_order, as.is = T)


    # Re-order rows/columns of the matrix to match the requested order.
    toplot = data.frame(matrix(nrow = nrow(c), ncol = ncol(c)))
    for(i in 1:nrow(o)){
        for( j in 1:nrow(o)){

            toplot[i, j] = c[which(names(c)==o[i,1]), which(names(c)==o[j,1])]
        }
    }
    # Optionally derive color-scale limits from the data (floor fixed at 0).
    if (usemax){
        m1 = max(abs(toplot))
        max = m1*1.1
        min = 0
    }
    names(toplot) = o[,1]
    plot_cov_internal(toplot, max = max, min = min, wcols = wcols, o = o, cex = cex)
}
+
+
# Plot the residual covariance (observed minus model-predicted) for a fitted
# TreeMix run; the legend expresses residuals in units of the mean standard
# error of the covariance estimates.
# stem      - prefix of the TreeMix output files
# pop_order - population-order file (with header); passed through so labels
#             can be colored by plot_resid_internal
plot_resid = function(stem, pop_order, min = -0.009, max = 0.009, cex = 1, usemax = T, wcols = "r"){
    c = read.table(gzfile(paste(stem, ".cov.gz", sep = "")), as.is = T, head = T, quote = "", comment.char = "")
    m = read.table(gzfile(paste(stem, ".modelcov.gz", sep = "")), as.is = T, head = T, quote = "", comment.char = "")
    names(c) = rownames(c)
    names(m) = rownames(m)
    o = read.table(pop_order, as.is = T, comment.char = "", quote = "", header = TRUE)
    # Mean standard error across all covariance entries (legend scale).
    se = read.table(gzfile(paste(stem, ".covse.gz", sep = "")), as.is = T, head = T, quote = "", comment.char = "")
    mse = apply(se, 1, mean)
    mse = mean(mse)
    # Align the two matrices by sorted population name before subtracting.
    c = c[order(names(c)), order(names(c))]
    m = m[order(names(m)), order(names(m))]
    tmp = c -m
    #tmp = m - c
    #tmp = (m-c)/m
    #print(tmp)
    # Re-order the residual matrix to match the requested population order,
    # warning about any populations missing from the matrix.
    toplot = data.frame(matrix(nrow = nrow(tmp), ncol = ncol(tmp)))
    for(i in 1:nrow(o)){
        for( j in 1:nrow(o)){
            #print(paste(o[i,1], o[j,1]))
            if (o[i,1] %in% names(tmp) ==F){
                print(paste("not found", o[i,1]))
            }
            if (o[j,1] %in% names(tmp) ==F){
                print(paste("not found", o[j,1]))
            }
            toplot[i, j] = tmp[which(names(tmp)==o[i,1]), which(names(tmp)==o[j,1])]
        }
    }
    #print(toplot)
    # Optionally derive symmetric color-scale limits from the data itself.
    if (usemax){
        m1 = max(abs(toplot), na.rm = T)
        max = m1*1.02
        min = -(m1*1.02)
    }
    names(toplot) = o[,1]
    toreturn = plot_resid_internal(toplot, max = max, min = min, wcols = wcols, mse = mse, o = o, cex = cex)
    return(toreturn)
}
+
# Draw the lower triangle of matrix 'd' as a colored heat map on a unit
# square, with population labels on the left and bottom margins.
# o     - optional table mapping population name (col 1) to a label color
# wcols - "rb" selects a red/blue alpha ramp; anything else uses the
#         hand-picked palette below
# mse   - when not NA, a color-scale legend is drawn on the right; the
#         default of 5 merely enables the legend (the value itself is unused)
plot_cov_internal = function(d, o = NA, max = 0.009, min = -0.009, cex =0.5, wcols = "", mse = 5){
    npop = nrow(d)
    width = 1/npop
    height = 1/npop
    # NOTE(review): the brewer.pal() palette is immediately overwritten by the
    # hand-picked colors below; the call is kept as-is to preserve upstream
    # behavior (RColorBrewer must still be loaded for it to succeed).
    colors = brewer.pal(9, "Spectral")
    colors = c("red", "orange","yellow", "white", "green", "blue", "black")
    pal = colorRampPalette(colors)
    ncol = 80
    cols = pal(ncol)
    # Empty unit-square canvas; cells are drawn as rectangles below.
    plot("NA", xlim = c(0, 1), ylim = c(0, 1), axes = F, xlab = "", ylab = "")
    for (i in 1:npop){
        for( j in 1:i){
            v = d[i,j]
            col= "white"
            # Map the value into the palette; negatives scale against 'min',
            # non-negatives against 'max'.
            if (v < 0){
                if (wcols == "rb"){
                    col = rgb(0, 0, 1, v/min)
                }
                else{
                    #col = rgb(0, 0, 1, 0.1+0.9*(v/min))
                    col = cols[ncol/2-floor( (v/min)*(ncol/2))]
                }
            }
            else{
                if (wcols == "rb"){
                    col = rgb(1, 0, 0, v/max)
                }
                else{
                    #col = rgb(1, 0, 0, 0.1+0.9*(v/max))
                    col = cols[ceiling((v/max)*(ncol))]
                }
            }
            # Cell (i, j): column j maps to x, row i to y (top-down).
            xmin = j/npop - 1/npop
            xmax = j/npop
            ymin = 1-(i/npop)
            ymax = 1-(i/npop)+1/npop
            if (v == 0){ col = "white"}
            rect(xmin, ymin, xmax, ymax, col = col, border = col)
        }
        # Axis labels, colored per the optional 'o' table when a match exists.
        tcol = "black"
        tmp = o[o[,1] == names(d)[i],]
        if (length(tmp) != 1){
            tcol = tmp[1,2]
        }
        mtext(names(d)[i], side = 2, at = 1-i/npop+0.5/npop, las = 1, cex = cex, col = tcol)
        mtext(names(d)[i], side = 1, at = i/npop-0.5/npop, las = 3, cex = cex, col = tcol)
    }
    # Vertical color-scale legend on the right-hand side.
    if ( !is.na(mse)){
        ymi = 0.5
        yma = 0.9
        w = (yma-ymi)/ncol
        xma = 0.80
        lmi = round(min, digits = 1)
        lma = round(max, digits = 1)
        print(cols)
        print(ymi+(0:ncol)*w)
        rect( rep(0.75, ncol), ymi+(0:(ncol-1))*w, rep(xma, ncol), ymi+(1:ncol)*w, col = cols, border = cols)
        text(xma+0.01, ymi, lab = paste(lmi), adj = 0, cex = 0.8)
        text(xma+0.01, yma, lab = paste(lma, "(Variance)"), adj = 0, cex = 0.8)

    }
    return(d)
    #image(as.matrix(d), col = cols)
}
+
# Draw the lower triangle of a residual matrix 'd' as a colored heat map on a
# unit square.  Like plot_cov_internal, but the palette is centered on zero
# and the legend is labelled in units of 'mse' (mean standard error).
# o     - optional table with population names (col 1) and a "Color" column
# wcols - "rb" selects a red/blue alpha ramp; anything else uses the
#         hand-picked diverging palette below
plot_resid_internal = function(d, o = NA, max = 0.009, min = -0.009, cex =0.5, wcols = "rb", mse = NA){
    npop = nrow(d)
    width = 1/npop
    height = 1/npop
    # NOTE(review): the brewer.pal() palette is immediately overwritten; kept
    # as-is to preserve upstream behavior.
    colors = brewer.pal(9, "Spectral")
    colors = c("red", "orange","yellow", "white", "green", "blue", "black")
    pal = colorRampPalette(colors)
    ncol = 80
    cols = pal(ncol)
    # Empty unit-square canvas; cells are drawn as rectangles below.
    plot("NA", xlim = c(0, 1), ylim = c(0, 1), axes = F, xlab = "", ylab = "")
    for (i in 1:npop){
        for( j in 1:i){
            v = d[i,j]
            print(paste(i, j, v))
            col= "white"
            # Negative residuals fall in the lower half of the palette,
            # positive ones in the upper half (palette centered on zero).
            if (v < 0){
                if (wcols == "rb"){
                    col = rgb(0, 0, 1, v/min)
                }
                else{
                    #col = rgb(0, 0, 1, 0.1+0.9*(v/min))
                    col = cols[ncol/2-floor( (v/min)*(ncol/2))]
                    #col = "white"
                }
            }
            else{
                if (wcols == "rb"){
                    col = rgb(1, 0, 0, v/max)
                }
                else{
                    #col = rgb(1, 0, 0, 0.1+0.9*(v/max))
                    col = cols[ncol/2+ceiling((v/max)*(ncol/2))]
                }
            }
            # Cell (i, j): column j maps to x, row i to y (top-down).
            xmin = j/npop - 1/npop
            xmax = j/npop
            ymin = 1-(i/npop)
            ymax = 1-(i/npop)+1/npop
            rect(xmin, ymin, xmax, ymax, col = col, border = col)
        }
        # Axis labels, colored via the "Color" column when a match exists.
        tcol = "black"
        tmp = o[o[,1] == names(d)[i],]
        if (length(tmp) != 1){
            tcol = tmp[1, "Color"]
        }
        mtext(names(d)[i], side = 2, at = 1-i/npop+0.5/npop, las = 1, cex = cex, col = tcol)
        mtext(names(d)[i], side = 1, at = i/npop-0.5/npop, las = 3, cex = cex, col = tcol)
    }
    # Vertical color-scale legend, labelled in standard-error units.
    if ( !is.na(mse)){
        ymi = 0.5
        yma = 0.9
        w = (yma-ymi)/ncol
        xma = 0.80
        lmi = round(min/mse, digits = 1)
        lma = round(max/mse, digits = 1)
        print(cols)
        print(ymi+(0:ncol)*w)
        rect( rep(0.75, ncol), ymi+(0:(ncol-1))*w, rep(xma, ncol), ymi+(1:ncol)*w, col = cols, border = cols)
        text(xma+0.01, ymi, lab = paste(lmi, "SE"), adj = 0, cex = 0.8)
        text(xma+0.01, yma, lab = paste(lma, "SE"), adj = 0, cex = 0.8)

    }
    return(d)
    #image(as.matrix(d), col = cols)
}
+
+
# Check whether the covariance matrix written by TreeMix contains usable
# values.  If any entry is NaN (insufficient data), draw a placeholder
# message on the current device and return FALSE; otherwise return TRUE.
#
# Fixes vs. the previous revision:
#   - 'title' is not an argument of plot(); replaced with 'main'
#   - removed a stray trailing comma in the text() call (an error in R)
#   - corrected the "Insufficent" typo in the user-visible message
check_cov <- function(stem)
{
    cov <- read.table(gzfile(paste(stem, ".cov.gz", sep = "")),
                      as.is=TRUE, head=TRUE)

    if (any(is.nan(as.matrix(cov)))) {
        # Empty canvas: no box, axes, or labels.
        plot(NA, NA,
             bty="n",
             xaxt="n",
             yaxt="n",
             type="n",
             xlab=NA,
             ylab=NA,
             main=NULL,
             xlim=c(0, 1),
             ylim=c(0, 1))

        # Centered explanation of why no plot could be drawn.
        text(x=0.5, y=0.5, adj=c(0.5, 0.5), cex=5.0,
             labels="Insufficient data\nto run TreeMix!")

        return(FALSE)
    } else {
        return(TRUE)
    }
}
+
+
+
+
# ---------------------------------------------------------------------------
# Command-line entry point: Rscript treemix.R <func> [...]
# Dispatches to the plotting helpers above; each plotting sub-command writes
# both a PDF and a PNG version of the figure.
# ---------------------------------------------------------------------------
args <- commandArgs(trailingOnly = TRUE)
if (length(args) < 1) {
    cat("Usage: treemix.R <func> [...]\n", file=stderr())
    quit(status=1)
}

func_name <- args[1]
if (func_name == "plot_tree") {
    # NOTE(review): the sub-command usage strings refer to "plotting_funcs.R"
    # while the generic usage above says "treemix.R"; confirm which name this
    # script is installed under.
    if (length(args) != 4) {
        cat("Usage: plotting_funcs.R plot_tree <input_file> <names_file> <output_prefix>\n", file=stderr())
        quit(status=1)
    }

    input_file <- args[2]
    names_file <- args[3]
    output_prefix <- args[4]

    # Only draw when the covariance matrix is usable (no NaN entries);
    # check_cov() otherwise draws a placeholder message on the device.
    pdf(paste(output_prefix, ".pdf", sep=""))
    if (check_cov(input_file)) {
        plot_tree(input_file, names_file)
    }
    dev.off()

    # bitmap is preferred, since it works in a headless environment
    bitmap(paste(output_prefix, ".png", sep=""), height=6, width=6, res=96, taa=4, gaa=4)
    if (check_cov(input_file)) {
        plot_tree(input_file, names_file)
    }
    dev.off()
} else if (func_name == "plot_residuals") {
    if (length(args) != 4) {
        cat("Usage: plotting_funcs.R plot_residuals <input_file> <names_file> <output_prefix>\n", file=stderr())
        quit(status=1)
    }

    input_file <- args[2]
    names_file <- args[3]
    output_prefix <- args[4]

    pdf(paste(output_prefix, ".pdf", sep=""))
    if (check_cov(input_file)) {
        plot_resid(input_file, names_file)
    }
    dev.off()

    # bitmap is preferred, since it works in a headless environment
    bitmap(paste(output_prefix, ".png", sep=""), height=6, width=6, res=96, taa=4, gaa=4)
    if (check_cov(input_file)) {
        plot_resid(input_file, names_file)
    }
    dev.off()
} else if (func_name == "variance") {
    if (length(args) != 3) {
        cat("Usage: plotting_funcs.R variance <input_prefix> <output_file>\n", file=stderr())
        quit(status=1)
    }

    input_prefix <- args[2]
    output_file <- args[3]

    # Fraction of the covariance explained by the model (see get_f above).
    cat(sprintf("%f\n", get_f(input_prefix)), file=output_file)
} else {
    cat("Unknown function: ", func_name, "\n", file=stderr())
    quit(status=1)
}
diff --git a/paleomix/tools/__init__.py b/paleomix/tools/__init__.py
new file mode 100644
index 0000000..90e5529
--- /dev/null
+++ b/paleomix/tools/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/paleomix/tools/bam_pipeline/__init__.py b/paleomix/tools/bam_pipeline/__init__.py
new file mode 100644
index 0000000..b4da426
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/__init__.py
@@ -0,0 +1,23 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import paths
diff --git a/paleomix/tools/bam_pipeline/config.py b/paleomix/tools/bam_pipeline/config.py
new file mode 100644
index 0000000..c72830b
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/config.py
@@ -0,0 +1,109 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import optparse
+
+import paleomix
+import paleomix.ui
+
+from paleomix.config import \
+ ConfigError, \
+ PerHostValue, \
+ PerHostConfig, \
+ migrate_config
+
+
def _run_config_parser(argv, pipeline_variant):
    """Build the option parser for the BAM / trimming pipelines and parse argv.

    `pipeline_variant` ("bam" or "trim") is used only to build the usage and
    version strings.  Returns the (config, args) pair produced by
    PerHostConfig.parse_args, with per-host default values resolved.

    Fixes vs. the previous revision: corrected user-visible help-string typos
    ("a dry-run in performed", "tot the JRE (Jave Runtime Environment)").
    """
    per_host_cfg = PerHostConfig("bam_pipeline")
    pipeline_variant = "%s_pipeline" % (pipeline_variant,)

    usage_str = "paleomix %s <command> [options] [makefiles]" % (pipeline_variant,)
    version_str = "paleomix %s v%s" % (pipeline_variant, paleomix.__version__)
    parser = optparse.OptionParser(usage=usage_str, version=version_str)

    paleomix.ui.add_optiongroup(parser,
                                ui_default=PerHostValue("running"),
                                color_default=PerHostValue("on"))
    # NOTE(review): paleomix.logger is not imported explicitly in this module;
    # presumably made available via the paleomix.ui import -- confirm.
    paleomix.logger.add_optiongroup(parser, default = PerHostValue("warning"))

    group = optparse.OptionGroup(parser, "Scheduling")
    group.add_option("--dry-run", action="store_true", default=False,
                     help="If passed, only a dry-run is performed, the "
                          "dependency tree is printed, and no tasks are "
                          "executed.")
    group.add_option("--max-threads", type=int, default=per_host_cfg.max_threads,
                     help="Maximum number of threads to use in total [%default]")
    group.add_option("--adapterremoval-max-threads", type=int, default=PerHostValue(1),
                     help="Maximum number of threads to use per AdapterRemoval instance [%default]")
    group.add_option("--bowtie2-max-threads", type=int, default=PerHostValue(1),
                     help="Maximum number of threads to use per Bowtie2 instance [%default]")
    group.add_option("--bwa-max-threads", type=int, default=PerHostValue(1),
                     help="Maximum number of threads to use per BWA instance [%default]")
    group.add_option("--gatk-max-threads", type=int, default=PerHostValue(1),
                     help="Maximum number of threads to use per GATK instance [%default]")
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Required paths")
    group.add_option("--jar-root", default = PerHostValue("~/install/jar_root", is_path = True),
                     help = "Folder containing Picard JARs (http://picard.sf.net), " \
                            "and GATK (www.broadinstitute.org/gatk). " \
                            "The latter is only required if realigning is enabled. " \
                            "[%default]")
    group.add_option("--temp-root", default = per_host_cfg.temp_root,
                     help = "Location for temporary files and folders [%default/]")
    group.add_option("--destination", default = None,
                     help = "The destination folder for result files. By default, files will be "
                            "placed in the same folder as the makefile which generated it.")
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Files and executables")
    group.add_option("--list-input-files", action="store_true", default=False,
                     help="List all input files used by pipeline for the "
                          "makefile(s), excluding any generated by the "
                          "pipeline itself.")
    group.add_option("--list-output-files", action="store_true", default=False,
                     help="List all output files generated by pipeline for "
                          "the makefile(s).")
    group.add_option("--list-executables", action="store_true", default=False,
                     help="List all executables required by the pipeline, "
                          "with version requirements (if any).")
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Misc")
    group.add_option("--jre-option", dest = "jre_options", action = "append", default = PerHostValue([]),
                     help = "May be specified one or more times with options to be passed "
                            "to the JRE (Java Runtime Environment); e.g. to change the "
                            "maximum amount of memory (default is -Xmx4g)")
    group.add_option("--to-dot-file", dest="dot_file",
                     help="Write dependency tree to the specified dot-file.")
    parser.add_option_group(group)

    return per_host_cfg.parse_args(parser, argv)
+
+
def parse_config(argv, pipeline_variant):
    """Parse command-line options for the BAM / trimming pipelines.

    Runs the configuration migration step first (see
    paleomix.config.migrate_config), then parses 'argv' and applies the
    configured UI color scheme.  Returns (config, args).
    """
    migrate_config()

    config, args = _run_config_parser(argv, pipeline_variant)
    paleomix.ui.set_ui_colors(config.ui_colors)

    return config, args
diff --git a/paleomix/tools/bam_pipeline/makefile.py b/paleomix/tools/bam_pipeline/makefile.py
new file mode 100644
index 0000000..d05f308
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/makefile.py
@@ -0,0 +1,838 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import copy
+import glob
+import types
+import string
+import itertools
+import collections
+
+import paleomix.tools.bam_pipeline.paths as paths
+from paleomix.common.utilities import fill_dict
+from paleomix.common.makefile import \
+ MakefileError, \
+ REQUIRED_VALUE, \
+ WithoutDefaults, \
+ read_makefile, \
+ IsInt, \
+ IsUnsignedInt, \
+ IsFloat, \
+ IsStr, \
+ IsNone, \
+ IsBoolean, \
+ And, \
+ Or, \
+ Not, \
+ ValueGE, \
+ ValueIn, \
+ ValuesIntersect, \
+ ValuesSubsetOf, \
+ StringIn, \
+ StringStartsWith, \
+ IsListOf, \
+ IsDictOf, \
+ PreProcessMakefile
+from paleomix.common.console import \
+ print_info, \
+ print_warn
+from paleomix.common.formats.fasta import \
+ FASTA, \
+ FASTAError
+
+import paleomix.common.bedtools as bedtools
+import paleomix.common.sequences as sequences
+
+
# Read-types recognised for pre-trimmed lanes and the 'ExcludeReads' option.
_READ_TYPES = {
    "Single",
    "Singleton",
    "Collapsed",
    "CollapsedTruncated",
    "Paired",
}
+
+
def read_makefiles(config, filenames, pipeline_variant="bam"):
    """Read, mangle, and validate each makefile named in 'filenames'.

    'pipeline_variant' selects between the full BAM pipeline ("bam") and the
    trimming-only pipeline ("trim"); any other value is rejected.  Returns
    the result of validating the processed makefiles.
    """
    if pipeline_variant not in ("bam", "trim"):
        raise ValueError("'pipeline_variant' must be 'bam' or 'trim', not %r"
                         % (pipeline_variant,))

    makefiles = [_mangle_makefile(read_makefile(filename, _VALIDATION),
                                  pipeline_variant)
                 for filename in filenames]

    return _validate_makefiles(config, makefiles)
+
+
def _alphanum_check(whitelist):
    """Return a validator accepting strings made up of ASCII letters, digits,
    and any extra characters listed in 'whitelist'."""
    if whitelist:
        extra = ", and %r" % whitelist
    else:
        extra = ""
    description = "characters a-z, A-Z, 0-9%s allowed" % (extra,)

    allowed = whitelist + string.ascii_letters + string.digits

    return And(IsStr(),
               ValuesSubsetOf(allowed, description=description))
+
+
# Valid names for prefixes; alphanumeric plus "._-", with "*" permitted for
# glob-expanded prefixes, excluding names reserved for makefile keys.
_VALID_PREFIX_NAME = \
    And(_alphanum_check(whitelist="._-*"),
        Not(StringIn(["Options"] + [(s + "Reads") for s in _READ_TYPES])))

# Valid paths for prefixes; avoids some problems with e.g. Bowtie2
_VALID_PREFIX_PATH = \
    And(IsStr(), Not(ValuesIntersect("\\:?\"<>|() \t\n\v\f\r")),
        default=REQUIRED_VALUE)

# Valid strings for targets / samples / libraries / lanes
_VALID_TARGET_NAME = \
    And(_alphanum_check(whitelist="._-"),
        ValueGE(2, key=len, description="at least two characters long"))

# Known pipeline features and their defaults; 'mapDamage' may be a boolean
# (legacy) or one of the string modes listed below (see _mangle_features).
_VALID_FEATURES_DICT = {
    "Coverage": IsBoolean(default=True),
    "Depths": IsBoolean(default=True),
    "DuplicateHist": IsBoolean(default=False),
    "RawBAM": IsBoolean(default=False),
    "RealignedBAM": IsBoolean(default=True),
    "Summary": IsBoolean(default=True),
    "mapDamage": Or(IsBoolean,
                    StringIn(('rescale', 'model', 'plot', 'no', 'yes')),
                    default=True)
}

# Feature names accepted in the old list-based layout; includes the spaced
# spellings "Raw BAM" / "Realigned BAM" (spaces are stripped by BAMFeatures).
_VALID_FEATURES_LIST = ValuesSubsetOf(("Coverage",
                                       "Depths",
                                       "DuplicateHist",
                                       "mapDamage",
                                       "Raw BAM",
                                       "RawBAM",
                                       "Realigned BAM",
                                       "RealignedBAM",
                                       "Summary"))

# Read-types that may be excluded from alignment/analysis (False = keep).
_VALID_EXCLUDE_DICT = {
    "Single": IsBoolean(default=False),
    "Collapsed": IsBoolean(default=False),
    "CollapsedTruncated": IsBoolean(default=False),
    "Paired": IsBoolean(default=False),
    "Singleton": IsBoolean(default=False),
}

# Read-types accepted in the old list-based 'ExcludeReads' layout.
_VALID_EXCLUDE_LIST = ValuesSubsetOf(_READ_TYPES)
+
+
class BAMFeatures(PreProcessMakefile):
    """Converts an old-style 'Features' list into a dict of booleans.

    Every feature named in the list becomes True, and every other known
    feature becomes False, reproducing the old behavior in which features
    could not be inherited from outer scopes.
    """

    def __call__(self, path, value):
        # New-style (dict) layouts need no conversion.
        if not isinstance(value, list):
            return value, _VALID_FEATURES_DICT

        # Reject unknown feature names before converting.
        _VALID_FEATURES_LIST(path, value)

        # Start from all-False so nothing is inherited, then enable the
        # listed features; spaces are stripped ("Raw BAM" -> "RawBAM").
        result = {feature: False for feature in _VALID_FEATURES_DICT}
        for name in value:
            result[name.replace(" ", "")] = True

        return result, _VALID_FEATURES_DICT
+
+
class ExcludeReads(PreProcessMakefile):
    """Converts an old-style 'ExcludeReads' list into a dict of booleans.

    Read-types named in the list map to True (excluded) and every unlisted
    read-type to False, matching the old behavior in which this setting
    could not be inherited from outer scopes.
    """

    def __call__(self, path, value):
        # New-style (dict) layouts need no conversion.
        if not isinstance(value, list):
            return value, _VALID_EXCLUDE_DICT

        _VALID_EXCLUDE_LIST(path, value)

        result = {read_type: True for read_type in value}
        # 'Singleton' was treated as 'Single' prior to to v1.2
        result.setdefault("Singleton", result.get("Single", False))

        # Fill in False for every remaining read-type, so nothing can be
        # inherited (which would change the meaning of old makefiles).
        for read_type in _READ_TYPES:
            result.setdefault(read_type, False)

        return result, _VALID_EXCLUDE_DICT
+
+
# Validators and defaults for the "Options" section; applies at the root and
# (via WithoutDefaults) at the target / sample / library levels.
_VALIDATION_OPTIONS = {
    # Sequencing platform, used to tag read-groups.
    "Platform": StringIn(("CAPILLARY", "LS454", "ILLUMINA", "SOLID",
                          "HELICOS", "IONTORRENT", "PACBIO"),
                         default="ILLUMINA"),
    # Offset for quality scores in FASTQ files.
    "QualityOffset": ValueIn((33, 64, "Solexa"),
                             default=33),
    # Split a lane into multiple entries, one for each (pair of) file(s)
    "SplitLanesByFilenames": Or(IsBoolean, IsListOf(IsStr),
                                default=True),
    # Format to use when compressing FASTQ files ("gz" or "bz2")
    "CompressionFormat": ValueIn(("gz", "bz2"),
                                 default="bz2"),

    "AdapterRemoval": {
        "Version": ValueIn(("v1.4", "v1.5+"),
                           default="v1.5+"),
        # Command-line options forwarded verbatim to AdapterRemoval;
        # --pcr1/--pcr2 are the v1.4 names for --adapter1/--adapter2.
        "--pcr1": IsStr,
        "--pcr2": IsStr,
        "--adapter1": IsStr,
        "--adapter2": IsStr,
        "--adapter-list": IsStr,
        "--maxns": IsUnsignedInt,
        "--minquality": IsUnsignedInt,
        "--trimns": Or(IsNone, IsBoolean),
        "--trimqualities": Or(IsNone, IsBoolean),
        "--collapse": Or(IsNone, IsBoolean, default=True),
        "--mm": Or(IsFloat, IsUnsignedInt,
                   default=3),
        "--minlength": IsUnsignedInt(default=25),
        "--maxlength": IsUnsignedInt,
        "--minalignmentlength": IsUnsignedInt,
        "--minadapteroverlap": IsUnsignedInt,
        "--shift": IsUnsignedInt,
        "--qualitymax": IsUnsignedInt,
        "--mate-separator": IsStr,
    },

    # Which aligner/mapper to use (BWA/Bowtie2)
    "Aligners": {
        "Program": ValueIn(("BWA", "Bowtie2"),
                           default="BWA"),
        "BWA": {
            # Mapping algorithm; availability depends on BWA version
            "Algorithm": StringIn(("backtrack", "mem", "bwasw"),
                                  default="backtrack"),

            # Minimum mapping quality (Phred) of reads to retain
            "MinQuality": IsUnsignedInt(default=0),
            # Remove unmapped reads or not
            "FilterUnmappedReads": IsBoolean(default=True),
            # Use seed region during mapping
            # Verbose name for command-line option "-l 65535"
            "UseSeed": IsBoolean(default=True),
            # Any number of user specific options
            StringStartsWith("-"): Or(IsListOf(IsStr, IsInt, IsFloat),
                                      Or(IsStr, IsInt, IsFloat, IsNone)),
        },
        "Bowtie2": {
            # Minimum mapping quality (Phred) of reads to retain
            "MinQuality": IsUnsignedInt(default=0),
            # Remove unmapped reads or not
            "FilterUnmappedReads": IsBoolean(default=True),
            # Any number of user specific options
            StringStartsWith("-"): Or(IsListOf(IsStr, IsInt, IsFloat),
                                      Or(IsStr, IsInt, IsFloat, IsNone)),
        },
    },

    # Does sample contain PCR duplicates / what to do about it.
    # True is equivalent of 'remove'.
    "PCRDuplicates": StringIn((True, False, 'mark', 'filter'),
                              default='filter'),

    # Qualities should be rescaled using mapDamage (replaced with Features)
    "RescaleQualities": IsBoolean(),

    # Command-line options forwarded verbatim to mapDamage.
    "mapDamage": {
        # Tabulation options
        "--downsample": Or(IsUnsignedInt, IsFloat),
        "--length": IsUnsignedInt,
        "--around": IsUnsignedInt,
        "--min-basequal": IsUnsignedInt,

        # Plotting options
        "--ymax": IsFloat,
        "--readplot": IsUnsignedInt,
        "--refplot": IsUnsignedInt,

        # Model options
        "--rand": IsUnsignedInt,
        "--burn": IsUnsignedInt,
        "--adjust": IsUnsignedInt,
        "--iter": IsUnsignedInt,
        "--forward": IsNone,
        "--reverse": IsNone,
        "--var-disp": IsNone,
        "--jukes-cantor": IsNone,
        "--diff-hangs": IsNone,
        "--fix-nicks": IsNone,
        "--use-raw-nick-freq": IsNone,
        "--single-stranded": IsNone,
        "--seq-length": IsUnsignedInt,
    },

    # Exclude READ_TYPES from alignment/analysis
    "ExcludeReads": ExcludeReads(),

    # Features of pipeline
    "Features": BAMFeatures(),
}
+
+
# Overall makefile layout: an "Options" section, a "Prefixes" section mapping
# prefix names to reference genomes, and any number of target sections, each
# nested as Target -> Sample -> Library -> Lane, with optional per-level
# "Options" overrides (without defaults, so only explicit values override).
_VALIDATION = {
    "Options": _VALIDATION_OPTIONS,

    "Prefixes": {
        _VALID_PREFIX_NAME: {
            # Path to the reference .fasta; may contain wildcards when the
            # prefix name ends with "*" (expanded in _mangle_prefixes).
            "Path": _VALID_PREFIX_PATH,
            "Label": ValueIn(("nuclear", "mitochondrial", "chloroplast",
                              "plasmid", "bacterial", "viral")),
            "RegionsOfInterest": IsDictOf(IsStr, IsStr),
        },
    },

    _VALID_TARGET_NAME: {  # Target
        _VALID_TARGET_NAME: {  # Sample
            _VALID_TARGET_NAME: {  # Library
                # Lane: a path template, or a dict of read-type / prefix
                # name to paths (classified in _determine_lane_type).
                _VALID_TARGET_NAME: Or(IsStr, IsDictOf(IsStr, IsStr)),

                "Options": WithoutDefaults(_VALIDATION_OPTIONS),
            },

            "Options": WithoutDefaults(_VALIDATION_OPTIONS),
        },

        "Options": WithoutDefaults(_VALIDATION_OPTIONS),
    },
}
+
+
def _mangle_makefile(makefile, pipeline_variant):
    """Restructure a freshly read makefile and apply all normalization steps.

    The raw "Makefile" key is split into top-level "Options", "Prefixes",
    and "Targets" keys; options are propagated down to each lane, prefixes
    are expanded (skipped for the trimming pipeline), and lanes are typed,
    tagged, and optionally split by filenames.  Returns a deep copy; the
    input dict is left unmodified.
    """
    makefile = copy.deepcopy(makefile)
    makefile["Options"] = makefile["Makefile"].pop("Options")
    makefile["Prefixes"] = makefile["Makefile"].pop("Prefixes")
    makefile["Targets"] = makefile.pop("Makefile")

    _mangle_features(makefile)
    _mangle_options(makefile)

    # The trimming pipeline performs no mapping, so reference prefixes (and
    # the .fasta files they point to) are not required to exist.
    if pipeline_variant != 'trim':
        _mangle_prefixes(makefile)

    _mangle_lanes(makefile)
    _mangle_tags(makefile)

    _split_lanes_by_filenames(makefile)

    return makefile
+
+
def _mangle_options(makefile):
    """Propagate "Options" down the target/sample/library hierarchy.

    Each level may override a subset of options; missing values are filled
    in from the level above, and the fully resolved dict is stored at the
    library level (path depth 2), where lanes later pick it up.
    """
    def _do_update_options(options, data, path):
        options = copy.deepcopy(options)
        if "Options" in data:
            # "Features" is handled globally (see _mangle_features) and may
            # therefore not be overridden below the root level.
            if "Features" in data["Options"]:
                raise MakefileError("Features may only be specified at root "
                                    "level, not at %r" % (" :: ".join(path),))

            # Fill out missing values using those of prior levels
            options = fill_dict(destination=data.pop("Options"),
                                source=options)

        if len(path) < 2:
            # Above library level: recurse into each sub-level.
            for key in data:
                if key != "Options":
                    _do_update_options(options, data[key], path + (key,))
        else:
            # Library level: store the final, merged options.
            data["Options"] = options

    for data in makefile["Targets"].itervalues():
        _do_update_options(makefile["Options"], data, ())
+
+
def _mangle_features(makefile):
    """Updates old-style makefiles to match the current layout.

    Specifically:
      - v1.2.6 merged the 'RescaleQualities' switch with the 'mapDamage'
        feature; when the former is present, it is given priority.
      - Boolean and 'yes' values for 'mapDamage' are normalized to the
        canonical string modes ('plot' / 'no').
    """
    options = makefile['Options']
    features = options['Features']

    # 'RescaleQualities' (pre-1.2.6) takes priority over 'mapDamage'; the
    # key is removed regardless of its value.
    if options.pop('RescaleQualities', None):
        features['mapDamage'] = 'rescale'

    mode = features['mapDamage']
    if mode is True:
        features['mapDamage'] = 'plot'
    elif mode is False:
        features['mapDamage'] = 'no'
    elif mode == 'yes':
        features['mapDamage'] = 'plot'
+
+
def _mangle_prefixes(makefile):
    """Validate and expand the "Prefixes" section of a makefile.

    Prefix names ending in "*" are treated as globs: the "Path" value is
    expanded, and one prefix is created per matching file, named after the
    file's basename.  Every resulting path must end in ".fasta"; "Name" and
    "Reference" keys are added to each record.  Raises MakefileError on
    invalid names, duplicate prefixes, non-.fasta paths, empty globs, or an
    entirely empty section.
    """
    prefixes = {}
    for (name, values) in makefile.get("Prefixes", {}).iteritems():
        filename = values["Path"]
        # "*" is only permitted as the final character of the name.
        if "*" in name[:-1]:
            raise MakefileError("The character '*' is not allowed in Prefix "
                                "names; if you use to select .fasta files "
                                "using a search-string, then use the prefix "
                                "name '%s*' instead and specify the wildcards "
                                "in the 'Path' instead."
                                % (name.replace("*", "",)))
        elif name.endswith("*"):
            # Glob expansion: one prefix per matching file, named after the
            # part of the basename before the first ".".
            # NOTE(review): the loop deliberately reuses/rebinds 'name'.
            records = []
            for fname in glob.glob(filename):
                name = os.path.basename(fname).split(".")[0]
                _VALID_PREFIX_NAME(("Prefixes", name), name)
                new_prefix = copy.copy(values)
                new_prefix["Path"] = fname

                records.append((name, new_prefix))
            if not records:
                raise MakefileError("Did not find any matches for glob %s"
                                    % repr(filename))
        else:
            records = [(name, values)]

        for (name, record) in records:
            if name in prefixes:
                raise MakefileError("Multiple prefixes with the same name: %s"
                                    % name)

            if not record["Path"].endswith(".fasta"):
                raise MakefileError("Path for prefix %r does not end with "
                                    ".fasta:\n %r" % (name, record["Path"]))

            record["Name"] = name
            record["Reference"] = record["Path"]
            prefixes[name] = record

    if not prefixes:
        raise MakefileError("At least one prefix must be specified")

    makefile["Prefixes"] = prefixes
+
+
def _mangle_lanes(makefile):
    """Normalize each lane into a {"Type", "Data", "Options"} record.

    Lane paths are validated (only the "{Pair}" key is permitted), each lane
    is classified as "Raw"/"Trimmed"/"BAMs", and the library's resolved
    options (stored by _mangle_options) are attached to every lane.
    """
    formatter = string.Formatter()
    prefixes = makefile["Prefixes"]
    for (target_name, samples) in makefile["Targets"].iteritems():
        for (sample_name, libraries) in samples.iteritems():
            for (library_name, lanes) in libraries.iteritems():
                options = lanes.pop("Options")

                for (lane, data) in lanes.iteritems():
                    path = (target_name, sample_name, library_name, lane)

                    _validate_lane_paths(data, path, formatter)

                    lane_type = _determine_lane_type(prefixes, data, path)

                    # Pre-trimmed reads with Solexa-scaled qualities are
                    # rejected outright; callers must convert to Phred.
                    if lane_type == "Trimmed" and \
                            options["QualityOffset"] == "Solexa":
                        path = " :: ".join((target_name, sample_name,
                                            library_name, lane))

                        raise MakefileError("Pre-trimmed Solexa data is not "
                                            "supported; please convert the "
                                            "quality scores to Phred (offset "
                                            "33 or 64) to continue:\n"
                                            " Path = %s" % (path,))

                    lanes[lane] = {"Type": lane_type,
                                   "Data": data,
                                   "Options": options}
+
+
def _validate_lane_paths(data, path, fmt):
    """Check that lane path templates only use the "{Pair}" formatting key.

    'data' may be a single path string (raw lanes) or a dict of read-type /
    prefix-name to paths; 'path' is the makefile location used in error
    messages; 'fmt' is a string.Formatter instance.  Raises MakefileError
    for malformed templates or keys other than "{Pair}".
    Note: uses Python 2 'except ValueError, error' syntax.
    """
    filenames = []
    if isinstance(data, types.StringTypes):
        filenames.append(data)
    elif isinstance(data, types.DictType):
        filenames.extend(data.itervalues())

    for filename in filenames:
        try:
            # Formatter.parse yields (literal, key, spec, conversion) tuples
            # and raises ValueError on unbalanced '{' / '}'.
            fields = tuple(fmt.parse(filename))
        except ValueError, error:
            raise MakefileError("Error parsing path specified at %r; %s; note "
                                "that the characters '}' and '{' should only "
                                "be used as part of the key '{Pair}', in "
                                "order to specify the mate identifier: %r"
                                % (" :: ".join(path), error, filename))

        for _, key, _, _ in fields:
            if key not in (None, "Pair"):
                raise MakefileError("Invalid path specified at %r; only the "
                                    "key '{Pair}' is allowed, to specify the "
                                    "mate 1 / 2 identifier, but the key "
                                    "'{%s}' was found in the path: %r"
                                    % (" :: ".join(path), key, filename))
+
+
def _determine_lane_type(prefixes, data, path):
    """Classify a lane as "Raw", "Trimmed", or "BAMs".

    A plain string is a path template for raw (untrimmed) reads.  A dict
    keyed entirely by read-types holds pre-trimmed reads; a dict keyed
    entirely by prefix names holds pre-aligned BAMs (deprecated, with a
    warning).  Anything else raises MakefileError.
    """
    if isinstance(data, types.StringTypes):
        return "Raw"
    elif isinstance(data, types.DictType):
        if all((key in _READ_TYPES) for key in data):
            # Pre-trimmed reads; only "Paired" entries may (and must)
            # contain the "{Pair}" key in their path template.
            for (key, files) in data.iteritems():
                is_paired = paths.is_paired_end(files)

                if is_paired and (key != "Paired"):
                    raise MakefileError("Error at Barcode level; Path "
                                        "includes {Pair} key, but read-type "
                                        "is not Paired:\n %r"
                                        % (" :: ".join(path + (key,)),))
                elif not is_paired and (key == "Paired"):
                    raise MakefileError("Error at Barcode level; Paired pre-"
                                        "trimmed reads specified, but path "
                                        "does not contain {Pair} key:\n %r"
                                        % (" :: ".join(path + (key,)),))

            return "Trimmed"
        elif all((key in prefixes) for key in data):
            print_warn("WARNING: Makefile specifies pre-aligned reads in the\n"
                       " form of a BAM file; support for including\n"
                       " BAMs is deprecated, and will be removed in\n"
                       " future versions of PALEOMIX!\n\n"
                       " Location =\n"
                       " %s\n"
                       % (" :: ".join(path),))
            return "BAMs"

    raise MakefileError("Error at Barcode level; keys must either be "
                        "prefix-names, OR 'Paired', 'Single', 'Collapsed', "
                        "'CollapsedTruncated', or 'Singleton'. "
                        "Found: %s" % (", ".join(data),))
+
+
+def _mangle_tags(makefile):
+    """Adds a "Tags" dict to every record, containing SAM read-group style
+    values (ID / SM / LB / PU / PL) derived from the makefile hierarchy of
+    target / sample / library / barcode names and the record's options."""
+    for (target, samples) in makefile["Targets"].iteritems():
+        for (sample, libraries) in samples.iteritems():
+            for (library, barcodes) in libraries.iteritems():
+                for (barcode, record) in barcodes.iteritems():
+                    tags = {"Target": target,
+                            "ID": library,
+                            "SM": sample,
+                            "LB": library,
+                            # Source/Current PU may differ if a lane has been
+                            # split by filenames, in which case PU_src contains
+                            # the original PU, and PU_cur is a derived PU.
+                            "PU_src": barcode,
+                            "PU_cur": barcode,
+                            "PG": record["Options"]["Aligners"]["Program"],
+                            "PL": record["Options"]["Platform"].upper()}
+
+                    record["Tags"] = tags
+
+
+def _split_lanes_by_filenames(makefile):
+    """Expands the glob-template of every "Raw" lane into concrete filenames,
+    and -- when 'SplitLanesByFilenames' is enabled (True, or a list naming
+    this barcode) and the glob matched more than one (pair of) file(s) --
+    replaces the lane with one new lane per file (pair), each named
+    '<barcode>_NNN' and carrying an updated PU_cur tag."""
+    iterator = _iterate_over_records(makefile)
+    for (target, sample, library, barcode, record) in iterator:
+        if record["Type"] == "Raw":
+            template = record["Data"]
+            path = (target, sample, library, barcode)
+            record["Data"] = files = paths.collect_files(path, template)
+            split = record["Options"]["SplitLanesByFilenames"]
+
+            if (split is True) or (isinstance(split, list) and (barcode in split)):
+                if any(len(v) > 1 for v in files.itervalues()):
+                    # Remove the original (multi-file) lane; it is replaced by
+                    # the derived single-file lanes below.
+                    template = makefile["Targets"][target][sample][library].pop(barcode)
+                    keys = ("SE",) if ("SE" in files) else ("PE_1", "PE_2")
+
+                    input_files = [files[key] for key in keys]
+                    assert len(input_files[0]) == len(input_files[-1]), input_files
+
+                    input_files_iter = itertools.izip_longest(*input_files)
+                    for (index, filenames) in enumerate(input_files_iter, start=1):
+                        assert len(filenames) == len(keys)
+                        new_barcode = "%s_%03i" % (barcode, index)
+
+                        current = copy.deepcopy(template)
+                        current["Data"] = dict((key, [filename]) for (key, filename) in zip(keys, filenames))
+                        current["Tags"]["PU_cur"] = new_barcode
+
+                        makefile["Targets"][target][sample][library][new_barcode] = current
+
+
+def _validate_makefiles(config, makefiles):
+    """Runs all per-makefile and cross-makefile validation checks; raises
+    MakefileError on fatal problems and returns the makefiles unchanged."""
+    for makefile in makefiles:
+        _validate_makefile_libraries(makefile)
+        _validate_makefile_adapters(makefile)
+    # The remaining checks look across all makefiles at once.
+    _validate_makefiles_duplicate_targets(config, makefiles)
+    _validate_makefiles_duplicate_files(makefiles)
+    _validate_makefiles_features(makefiles)
+    _validate_prefixes(makefiles)
+
+    return makefiles
+
+
+def _validate_makefile_adapters(makefile):
+    """Checks for the default adapter sequences specified in the wrong
+    orientation for AdapterRemoval, which is a typical mistake when using
+    the --pcr2 option.
+    """
+    # The non-reverse complemented mate 2 adapter, as seen in raw FASTQ reads
+    adapter_2 = "AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT"
+
+    tests = {
+        # --pcr2 expects the reverse complement of the mate 2 adapter seq.
+        "--pcr2": adapter_2,
+        # --adapter2 (AdapterRemoval v2) expects the regular sequence
+        "--adapter2": sequences.reverse_complement(adapter_2)
+    }
+
+    def check_options(options, results):
+        # Marks any option whose value equals the known-wrong sequence.
+        for key, value in tests.iteritems():
+            if options.get(key) == value:
+                results[key] = True
+
+    # Check both per-record and makefile-global AdapterRemoval options.
+    results = dict.fromkeys(tests, False)
+    for (_, _, _, _, record) in _iterate_over_records(makefile):
+        adapterrm_opt = record.get("Options", {}).get("AdapterRemoval", {})
+        check_options(adapterrm_opt, results)
+
+    adapterrm_opt = makefile.get("Options", {}).get("AdapterRemoval", {})
+    check_options(adapterrm_opt, results)
+
+    if any(results.itervalues()):
+        print_warn("WARNING: An adapter specified for AdapterRemoval "
+                   "corresponds to the default sequence, but is reverse "
+                   "complemented. Please make sure that this is intended! ",
+                   end="")
+
+        if results["--pcr2"]:
+            print_warn("For --pcr2, the sequence given should be the "
+                       "reverse complement of the sequence observed in the "
+                       "mate 2 FASTQ file.\n")
+
+        if results["--adapter2"]:
+            print_warn("For --adapter2 (AdapterRemoval v2, only) the value "
+                       "should be exactly as observed in the FASTQ reads.\n")
+
+
+def _validate_makefile_libraries(makefile):
+ libraries = collections.defaultdict(set)
+ iterator = _iterate_over_records(makefile)
+ for (target, sample, library, _, _) in iterator:
+ libraries[(target, library)].add(sample)
+
+ for ((target, library), samples) in libraries.iteritems():
+ if len(samples) > 1:
+ raise MakefileError("Library '%s' in target '%s' spans multiple "
+ " samples: %s" % (library, target,
+ ", ".join(samples)))
+
+
+def _validate_makefiles_duplicate_files(makefiles):
+    """Checks for input files used by more than one record across all
+    makefiles; paths are canonicalized (os.path.realpath) so that symlinked
+    duplicates are also caught. Re-use within a single target is fatal
+    (MakefileError); re-use across targets only triggers a warning."""
+    filenames = collections.defaultdict(list)
+    for makefile in makefiles:
+        iterator = _iterate_over_records(makefile)
+        for (target, sample, library, barcode, record) in iterator:
+            current_filenames = []
+            if record["Type"] == "Raw":
+                for raw_filenames in record["Data"].itervalues():
+                    current_filenames.extend(raw_filenames)
+            else:
+                current_filenames.extend(record["Data"].values())
+
+            for realpath in map(os.path.realpath, current_filenames):
+                filenames[realpath].append((target, sample, library, barcode))
+
+    # Keep only paths referenced by more than one (unique) record.
+    has_overlap = {}
+    for (filename, records) in filenames.iteritems():
+        if len(records) > 1:
+            has_overlap[filename] = list(set(records))
+
+    # Group filenames that share the exact same set of referencing records,
+    # so that each overlapping group is reported once.
+    by_records = sorted(zip(has_overlap.values(), has_overlap.keys()))
+    for (records, pairs) in itertools.groupby(by_records, lambda x: x[0]):
+        pairs = list(pairs)
+        description = _describe_files_in_multiple_records(records, pairs)
+
+        # record[0] is the target name; duplicates within one target clobber.
+        if len(set(record[0] for record in records)) != len(records):
+            message = "Path included multiple times in target:\n"
+            raise MakefileError(message + description)
+        else:
+            print_warn("WARNING: Path included in multiple targets:\n%s\n"
+                       % (description,))
+
+
+def _describe_files_in_multiple_records(records, pairs):
+    """Builds a human-readable, multi-line description of a set of records
+    ((target, sample, library, barcode) tuples) and the canonical file paths
+    ((records, filename) pairs) that they share; used in error messages."""
+    descriptions = []
+    for (index, record) in enumerate(records, start=1):
+        descriptions.append("\t- Record {0}: Name: {1}, Sample: {2}, "
+                            "Library: {3}, Barcode: {4}".format(index,
+                                                                *record))
+
+    for (index, (_, filename)) in enumerate(sorted(pairs), start=1):
+        message = "\t- Canonical path {0}: {1}"
+        descriptions.append(message.format(index, filename))
+
+    return "\n".join(descriptions)
+
+
+def _validate_makefiles_duplicate_targets(config, makefiles):
+    """Checks that every (destination directory, target name) pair is unique
+    across all makefiles, since identically named targets writing to the
+    same destination would clobber each other's output files."""
+    targets = set()
+    for makefile in makefiles:
+        destination = config.destination
+        if destination is None:
+            # Default destination is the directory containing the makefile.
+            filename = makefile["Statistics"]["Filename"]
+            destination = os.path.dirname(filename)
+
+        for target in makefile["Targets"]:
+            key = (destination, target)
+            if key in targets:
+                raise MakefileError("Target name '%s' used multiple times; "
+                                    "output files would be clobbered!"
+                                    % target)
+            targets.add(key)
+
+
+def _validate_makefiles_features(makefiles):
+ for makefile in makefiles:
+ features = makefile["Options"]["Features"]
+ roi_enabled = False
+
+ for prefix in makefile["Prefixes"].itervalues():
+ roi_enabled |= bool(prefix.get("RegionsOfInterest"))
+
+ if features["Depths"] and roi_enabled:
+ if not (features["RawBAM"] or features["RealignedBAM"]):
+ raise MakefileError("The feature 'Depths' (depth histograms) "
+ "with RegionsOfInterest enabled, requires "
+ "that either the feature 'RawBAM' or the "
+ "feature 'RalignedBAM' is enabled.")
+
+
+def _validate_prefixes(makefiles):
+    """Validates prefixes and regions-of-interest, including an implementation
+    of the checks included in GATK, which require that the FASTA for the human
+    genome is ordered 1 .. 23. This is required since GATK will not run with
+    human genomes in a different order.
+    """
+    # Each FASTA path is validated only once, even if shared by makefiles.
+    already_validated = set()
+    print_info("  - Validating prefixes ...")
+    for makefile in makefiles:
+        uses_gatk = makefile["Options"]["Features"]["RealignedBAM"]
+        for prefix in makefile["Prefixes"].itervalues():
+            path = prefix["Path"]
+            if path in already_validated:
+                continue
+
+            # Missing files are reported but not fatal at this point.
+            if not os.path.exists(path):
+                print_info("    - Reference FASTA file does not exist:\n"
+                           "      %r" % (path,))
+                continue
+            elif not os.path.exists(path + ".fai"):
+                print_info("    - Index does not exist for %r; this may "
+                           "take a while ..." % (path,))
+
+            try:
+                contigs = FASTA.index_and_collect_contigs(path)
+            except FASTAError, error:
+                raise MakefileError("Error indexing FASTA:\n %s" % (error,))
+
+            # Implementation of GATK checks for the human genome
+            _do_validate_hg_prefix(makefile, prefix, contigs, fatal=uses_gatk)
+
+            contigs = dict(contigs)
+            regions_of_interest = prefix.get("RegionsOfInterest", {})
+            for (name, fpath) in regions_of_interest.iteritems():
+                try:
+                    # read_bed_file returns iterator
+                    for _ in bedtools.read_bed_file(fpath, contigs=contigs):
+                        pass
+                except (bedtools.BEDError, IOError), error:
+                    raise MakefileError("Error reading regions-of-"
+                                        "interest %r for prefix %r:\n%s"
+                                        % (name, prefix["Name"], error))
+
+            already_validated.add(path)
+
+
+def _do_validate_hg_prefix(makefile, prefix, contigs, fatal):
+    """Reports prefixes that appear to be lexically ordered human genomes
+    (see _is_invalid_hg_prefix); raises MakefileError when 'fatal' is set
+    (i.e. when GATK is to be used), otherwise only prints a warning."""
+    if not _is_invalid_hg_prefix(contigs):
+        return
+
+    message = \
+        "Prefix appears to be a human genome, but chromosomes are ordered\n" \
+        "lexically (chr1, chr10, chr11, ...), rather than numerically\n" \
+        "(chr1, chr2, chr3, ...):\n\n" \
+        "  Makefile = %s\n" \
+        "  Prefix   = %s\n\n" \
+        "GATK requires that human chromosomes are ordered numerically;\n%s\n" \
+        "See the documentation at the GATK website for more information:\n  " \
+        "http://www.broadinstitute.org/gatk/guide/article?id=1204\n"
+
+    prefix_path = prefix["Path"]
+    mkfile_path = makefile["Statistics"]["Filename"]
+    if fatal:
+        details = "Either disable GATK in the makefile, or fix the prefix."
+        message %= (mkfile_path, prefix_path, details)
+
+        raise MakefileError(message)
+    else:
+        details = \
+            "You will not be able to use the resulting BAM file with GATK."
+        message %= (mkfile_path, prefix_path, details)
+        print_warn("\nWARNING:\n", message, sep="")
+
+
+def _is_invalid_hg_prefix(contigs):
+    """Returns True if 'contigs' (a sequence of (name, size) tuples) looks
+    like a human genome (hg18/hg19/hg38, identified by the sizes of chr1,
+    chr2 and chr10) whose chromosomes are NOT ordered chr1 < chr2 < chr10,
+    i.e. are lexically rather than numerically ordered."""
+    hg_contigs = {
+        # Contig sizes based on hg18 and hg19 and hg38
+        "chr1": [247249719, 249250621, 248956422],
+        "chr2": [242951149, 243199373, 242193529],
+        "chr10": [135374737, 135534747, 133797422],
+    }
+
+    size_to_idx = dict((size, idx) for (idx, (_, size)) in enumerate(contigs))
+
+    # Equivalent to the GATK 'nonCanonicalHumanContigOrder' function
+    # Each list of candidate sizes is replaced by the index of the matching
+    # contig; identification is by size only, not by contig name.
+    for (key, values) in hg_contigs.iteritems():
+        for value in values:
+            if value in size_to_idx:
+                hg_contigs[key] = size_to_idx[value]
+                break
+        else:
+            # Contig not found; probably not hg18, hg19, or hg38
+            return False
+
+    return not (hg_contigs["chr1"] < hg_contigs["chr2"] < hg_contigs["chr10"])
+
+
+def _iterate_over_records(makefile):
+    """Yields (target, sample, library, barcode, record) tuples for every
+    record in the makefile. Uses .items() (snapshots) rather than
+    .iteritems(), so callers may add/remove barcodes while iterating, as
+    done by _split_lanes_by_filenames."""
+    for (target, samples) in makefile["Targets"].items():
+        for (sample, libraries) in samples.items():
+            for (library, barcodes) in libraries.items():
+                for (barcode, record) in barcodes.items():
+                    yield target, sample, library, barcode, record
diff --git a/paleomix/tools/bam_pipeline/mkfile.py b/paleomix/tools/bam_pipeline/mkfile.py
new file mode 100755
index 0000000..dc81423
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/mkfile.py
@@ -0,0 +1,385 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import os
+import sys
+import glob
+import datetime
+from optparse import OptionParser
+
+from paleomix.common.console import \
+ print_info, \
+ print_err
+
+
+_TEMPLATE_TOP = \
+ """# -*- mode: Yaml; -*-
+# Timestamp: %s
+#
+# Default options.
+# Can also be specific for a set of samples, libraries, and lanes,
+# by including the "Options" hierarchy at the same level as those
+# samples, libraries, or lanes below. This does not include
+# "Features", which may only be specific globally.
+Options:
+ # Sequencing platform, see SAM/BAM reference for valid values
+ Platform: Illumina
+ # Quality offset for Phred scores, either 33 (Sanger/Illumina 1.8+)
+ # or 64 (Illumina 1.3+ / 1.5+). For Bowtie2 it is also possible to
+ # specify 'Solexa', to handle reads on the Solexa scale. This is
+ # used during adapter-trimming and sequence alignment
+ QualityOffset: 33
+ # Split a lane into multiple entries, one for each (pair of) file(s)
+ # found using the search-string specified for a given lane. Each
+ # lane is named by adding a number to the end of the given barcode.
+ SplitLanesByFilenames: yes
+ # Compression format for FASTQ reads; 'gz' for GZip, 'bz2' for BZip2
+ CompressionFormat: bz2
+
+ # Settings for trimming of reads, see AdapterRemoval man-page
+ AdapterRemoval:
+ # Adapter sequences, set and uncomment to override defaults
+# --adapter1: AGATCGGAAGAGCACACGTCTGAACTCCAGTCACNNNNNNATCTCGTATGCCGTCTTCTGCTTG
+# --adapter2: AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT
+ # Some BAM pipeline defaults differ from AR defaults;
+ # To override, change these value(s):
+ --mm: 3
+ --minlength: 25
+ # Extra features enabled by default; change 'yes' to 'no' to disable
+ --collapse: yes
+ --trimns: yes
+ --trimqualities: yes
+"""
+
+_TEMPLATE_BAM_OPTIONS = \
+ """ # Settings for aligners supported by the pipeline
+ Aligners:
+ # Choice of aligner software to use, either "BWA" or "Bowtie2"
+ Program: BWA
+
+ # Settings for mappings performed using BWA
+ BWA:
+ # One of "backtrack", "bwasw", or "mem"; see the BWA documentation
+ # for a description of each algorithm (defaults to 'backtrack')
+ Algorithm: backtrack
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # May be disabled ("no") for aDNA alignments, as post-mortem damage
+ # localizes to the seed region, which BWA expects to have few
+ # errors (sets "-l"). See http://pmid.us/22574660
+ UseSeed: yes
+ # Additional command-line options may be specified for the "aln"
+ # call(s), as described below for Bowtie2 below.
+
+ # Settings for mappings performed using Bowtie2
+ Bowtie2:
+ # Filter aligned reads with a mapping quality (Phred) below this value
+ MinQuality: 0
+ # Filter reads that did not map to the reference sequence
+ FilterUnmappedReads: yes
+ # Examples of how to add additional command-line options
+# --trim5: 5
+# --trim3: 5
+ # Note that the colon is required, even if no value is specified
+ --very-sensitive:
+ # Example of how to specify multiple values for an option
+# --rg:
+# - CN:SequencingCenterNameHere
+# - DS:DescriptionOfReadGroup
+
+ # Mark / filter PCR duplicates. If set to 'filter', PCR duplicates are
+ # removed from the output files; if set to 'mark', PCR duplicates are
+ # flagged with bit 0x400, and not removed from the output files; if set to
+ # 'no', the reads are assumed to not have been amplified. Collapsed reads
+ # are filtered using the command 'paleomix rmdup_duplicates', while "normal"
+ # reads are filtered using Picard MarkDuplicates.
+ PCRDuplicates: filter
+
+ # Command-line options for mapDamage; note that the long-form
+ # options are expected; --length, not -l, etc. Uncomment the
+ # "mapDamage" line adding command-line options below.
+ mapDamage:
+ # By default, the pipeline will downsample the input to 100k hits
+ # when running mapDamage; remove to use all hits
+ --downsample: 100000
+
+ # Set to 'yes' exclude a type of trimmed reads from alignment / analysis;
+ # possible read-types reflect the output of AdapterRemoval
+ ExcludeReads:
+ # Exclude single-end reads (yes / no)?
+ Single: no
+ # Exclude non-collapsed paired-end reads (yes / no)?
+ Paired: no
+ # Exclude paired-end reads for which the mate was discarded (yes / no)?
+ Singleton: no
+ # Exclude overlapping paired-ended reads collapsed into a single sequence
+ # by AdapterRemoval (yes / no)?
+ Collapsed: no
+ # Like 'Collapsed', but only for collapsed reads truncated due to the
+ # presence of ambiguous or low quality bases at read termini (yes / no).
+ CollapsedTruncated: no
+
+ # Optional steps to perform during processing.
+ Features:
+ # Generate BAM without realignment around indels (yes / no)
+ RawBAM: no
+ # Generate indel-realigned BAM using the GATK Indel realigner (yes / no)
+ RealignedBAM: yes
+ # To disable mapDamage, write 'no'; to generate basic mapDamage plots,
+ # write 'plot'; to build post-mortem damage models, write 'model',
+ # and to produce rescaled BAMs, write 'rescale'. The 'model' option
+ # includes the 'plot' output, and the 'rescale' option includes both
+ # 'plot' and 'model' results. All analyses are carried out per library.
+ mapDamage: plot
+ # Generate coverage information for the raw BAM (wo/ indel realignment).
+ # If one or more 'RegionsOfInterest' have been specified for a prefix,
+ # additional coverage files are generated for each alignment (yes / no)
+ Coverage: yes
+ # Generate histogram of number of sites with a given read-depth, from 0
+ # to 200. If one or more 'RegionsOfInterest' have been specified for a
+ # prefix, additional histograms are generated for each alignment (yes / no)
+ Depths: yes
+ # Generate summary table for each target (yes / no)
+ Summary: yes
+ # Generate histogram of PCR duplicates, for use with PreSeq (yes / no)
+ DuplicateHist: no
+"""
+
+_TEMPLATE_PREFIXES = """
+# Map of prefixes by name, each having a Path key, which specifies the
+# location of the BWA/Bowtie2 index, and optional label, and an option
+# set of regions for which additional statistics are produced.
+Prefixes:
+ # Replace 'NAME_OF_PREFIX' with name of the prefix; this name
+ # is used in summary statistics and as part of output filenames.
+ NAME_OF_PREFIX:
+ # Replace 'PATH_TO_PREFIX' with the path to .fasta file containing the
+ # references against which reads are to be mapped. Using the same name
+ # as filename is strongly recommended (e.g. /path/to/Human_g1k_v37.fasta
+ # should be named 'Human_g1k_v37').
+ Path: PATH_TO_PREFIX
+
+ # (Optional) Uncomment and replace 'PATH_TO_BEDFILE' with the path to a
+ # .bed file listing extra regions for which coverage / depth statistics
+ # should be calculated; if no names are specified for the BED records,
+ # results are named after the chromosome / contig. Change 'NAME' to the
+ # name to be used in summary statistics and output filenames.
+# RegionsOfInterest:
+# NAME: PATH_TO_BEDFILE
+"""
+
+_TEMPLATE_SAMPLES = """
+# Mapping targets are specified using the following structure. Uncomment and
+# replace 'NAME_OF_TARGET' with the desired prefix for filenames.
+#NAME_OF_TARGET:
+ # Uncomment and replace 'NAME_OF_SAMPLE' with the name of this sample.
+# NAME_OF_SAMPLE:
+ # Uncomment and replace 'NAME_OF_LIBRARY' with the name of this sample.
+# NAME_OF_LIBRARY:
+ # Uncomment and replace 'NAME_OF_LANE' with the name of this lane,
+ # and replace 'PATH_WITH_WILDCARDS' with the path to the FASTQ files
+ # to be trimmed and mapped for this lane (may include wildcards).
+# NAME_OF_LANE: PATH_WITH_WILDCARDS
+"""
+
+_FILENAME = "SampleSheet.csv"
+
+
+def build_makefile(add_full_options=True,
+                   add_prefix_tmpl=True,
+                   add_sample_tmpl=True):
+    """Returns a makefile template string, always containing the timestamped
+    header/options section, optionally followed by the full BAM options,
+    the prefixes template, and the samples template."""
+    timestamp = datetime.datetime.now().isoformat()
+    template_parts = [_TEMPLATE_TOP % (timestamp,)]
+
+    if add_full_options:
+        template_parts.append(_TEMPLATE_BAM_OPTIONS)
+
+    if add_prefix_tmpl:
+        template_parts.append(_TEMPLATE_PREFIXES)
+
+    if add_sample_tmpl:
+        template_parts.append(_TEMPLATE_SAMPLES)
+
+    return "\n".join(template_parts)
+
+
+def strip_comments(text):
+    """Returns 'text' with trailing '#' comments and full-line comments
+    removed (except the first 3 header lines, which are kept verbatim),
+    collapsing any resulting runs of blank lines to a single blank line."""
+    lines = text.split("\n")
+
+    # Always include minimal header
+    minimal_template = lines[:3]
+    for line in lines[3:]:
+        if not line.lstrip().startswith("#"):
+            line = line.split("#", 1)[0].rstrip()
+
+            # Avoid too many empty lines
+            if line.strip() or minimal_template[-1].strip():
+                minimal_template.append(line)
+
+    return "\n".join(minimal_template)
+
+
+def read_alignment_records(filename):
+    """Parses an Illumina SampleSheet.csv file; returns a list of dicts
+    (one per data row, keyed by the header columns), or None if the file is
+    empty, lacks required columns, or contains malformed rows. Errors are
+    reported via print_err rather than raised."""
+    results = []
+    with open(filename) as records:
+        line = records.readline()
+        if not line:
+            print_err("ERROR: Empty SampleSheet.csv file: %r"
+                      % (filename,))
+            return None
+
+        header = line.strip().split(",")
+        missing = set(("SampleID", "Index", "Lane", "FCID")) - set(header)
+        if missing:
+            print_err("ERROR: Required columns missing from SampleSheet file "
+                      "%r: %s" % (filename, ", ".join(map(repr, missing))))
+            return None
+
+        # Data rows start at line 2; blank lines are skipped.
+        for idx, line in enumerate(records, start=2):
+            line = line.strip()
+            if not line:
+                continue
+
+            fields = line.split(",")
+            if len(fields) != len(header):
+                print_err("Line %i in SampleSheet file %r does not contain "
+                          "the expected number of columns; expected %i, but "
+                          "found %i."
+                          % (idx, filename, len(header), len(fields)))
+                return None
+
+            results.append(dict(zip(header, fields)))
+
+    return results
+
+
+def parse_args(argv):
+    """Parses command-line arguments; returns the (options, filenames)
+    tuple produced by OptionParser.parse_args."""
+    parser = OptionParser("Usage: %prog [/path/to/SampleSheet.csv, ...]")
+    parser.add_option("--minimal", default=False, action="store_true",
+                      help="Strip comments from makefile template.")
+
+    return parser.parse_args(argv)
+
+
+def select_path(path):
+    """Given a glob template containing a '{Pair}' key, returns the mate 1
+    path if only mate 1 files exist (single-ended data); otherwise returns
+    the template unchanged (paired data, or nothing matched)."""
+    has_r1 = bool(glob.glob(path.format(Pair=1)))
+    has_r2 = bool(glob.glob(path.format(Pair=2)))
+
+    if has_r1 and not has_r2:
+        # Single-ended reads
+        return path.format(Pair=1)
+    return path
+
+
+def read_sample_sheets(filenames):
+    """Reads one or more SampleSheet.csv files (each argument may be the
+    file itself, or a directory containing one) and returns a nested dict of
+    {sample: {library/index: {flowcell_lane: path(s)}}}, or None on error.
+    Duplicate flowcell/lane keys are renamed with numeric suffixes."""
+    records = {}
+    for root in filenames:
+        if os.path.isdir(root):
+            filename = os.path.join(root, _FILENAME)
+        else:
+            root, filename = os.path.split(root)[0], root
+
+        if not os.path.exists(filename):
+            print_err("ERROR: Could not find SampleSheet file: %r" % filename)
+            return None
+
+        sample_sheet = read_alignment_records(filename)
+        if sample_sheet is None:
+            return None
+
+        for record in sample_sheet:
+            record["Lane"] = int(record["Lane"])
+            path = "%(SampleID)s_%(Index)s_L%(Lane)03i_R{Pair}_*.fastq.gz" \
+                % record
+            record["Path"] = select_path(os.path.join(root, path))
+            key = "%(FCID)s_%(Lane)s" % record
+
+            libraries = records.setdefault(record["SampleID"], {})
+            barcodes = libraries.setdefault(record["Index"], {})
+            # NOTE(review): appends the bare glob 'path', not the root-joined
+            # record["Path"] computed above -- verify this is intended.
+            barcodes.setdefault(key, []).append(path)
+
+    # Clean up names; generate unique names for duplicate lanes
+    for libraries in records.itervalues():
+        for barcodes in libraries.itervalues():
+            for key, paths in barcodes.items():
+                if len(paths) == 1:
+                    barcodes[key] = paths[0]
+                    continue
+
+                counter = 1
+                for path in paths:
+                    new_key = "%s_%i" % (key, counter)
+
+                    while new_key in barcodes:
+                        counter += 1
+                        new_key = "%s_%i" % (key, counter)
+
+                    barcodes[new_key] = path
+
+                barcodes.pop(key)
+
+    return records
+
+
+def print_samples(records):
+    """Prints the collected sample records as YAML-style makefile entries;
+    each sample name is emitted twice (as both target and sample level)."""
+    print()
+    for (sample, libraries) in sorted(records.iteritems()):
+        print("%s:" % sample)
+        print("  %s:" % sample)
+        for (library, barcodes) in sorted(libraries.iteritems()):
+            print("    %s:" % library)
+            for key, path in sorted(barcodes.iteritems()):
+                print("      %s: %s" % (key, path))
+            print()
+    print()
+
+
+def main(argv, pipeline="bam"):
+    """Entry point: prints a makefile template (full template for the 'bam'
+    pipeline, reduced for 'trim'), followed by entries generated from any
+    SampleSheet.csv paths given in 'argv'. Returns 0, or 1 on error."""
+    assert pipeline in ("bam", "trim"), pipeline
+
+    options, filenames = parse_args(argv)
+    records = read_sample_sheets(filenames)
+    if records is None:
+        return 1
+
+    # The sample template placeholder is only needed when no sample-sheets
+    # supplied any records.
+    template = build_makefile(add_full_options=(pipeline == "bam"),
+                              add_prefix_tmpl=(pipeline == "bam"),
+                              add_sample_tmpl=not records)
+    if options.minimal:
+        template = strip_comments(template)
+
+    print(template)
+
+    print_samples(records)
+
+    if argv:
+        print_info("Automatically generated makefile printed.\n"
+                   "Please check for correctness before running pipeline.")
+    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/bam_pipeline/nodes.py b/paleomix/tools/bam_pipeline/nodes.py
new file mode 100644
index 0000000..5ee01c7
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/nodes.py
@@ -0,0 +1,139 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+import paleomix.nodes.picard as picard
+
+from paleomix.common.fileutils import \
+ swap_ext
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+
+from paleomix.nodes.picard import \
+ PicardNode, \
+ ValidateBAMNode
+from paleomix.nodes.samtools import \
+ BAMIndexNode
+
+
+def index_and_validate_bam(config, prefix, node, log_file=None,
+                           create_index=True):
+    """Wraps a BAM-producing 'node' with indexing (unless the node already
+    outputs a .bai, or 'create_index' is False) and Picard ValidateSamFile
+    validation; returns the validation node."""
+    input_file, has_index = _get_input_file(node)
+    if not has_index and create_index:
+        node = BAMIndexNode(infile=input_file,
+                            dependencies=node)
+
+    validation_params = ValidateBAMNode.customize(config=config,
+                                                  input_bam=input_file,
+                                                  output_log=log_file,
+                                                  dependencies=node)
+
+    # Ensure that the validation node is re-run if the index changes
+    if has_index or create_index:
+        bai_filename = swap_ext(input_file, ".bai")
+        validation_params.command.set_kwargs(IN_BAI=bai_filename)
+
+    # Check MD tags against reference sequence
+    # FIXME: Disabled due to issues with Picard/Samtools disagreeing,
+    #   backwards compatibility. See the discussion at
+    #     http://sourceforge.net/mailarchive/message.php?msg_id=31348639
+    # validation_params.command.set_kwargs(IN_REF=prefix["Reference"])
+    # validation_params.command.add_option("R", "%(IN_REF)s", sep="=")
+
+    # Ignored since we may filter out misses and low-quality hits during
+    # mapping, which leads to a large proportion of missing PE mates.
+    validation_params.command.add_option("IGNORE", "MATE_NOT_FOUND",
+                                         sep="=")
+    # Ignored due to high rate of false positives for lanes with few hits,
+    # where high-quality reads may cause mis-identification of qualities
+    validation_params.command.add_option("IGNORE",
+                                         "INVALID_QUALITY_FORMAT", sep="=")
+
+    return validation_params.build_node()
+
+
+def _get_input_file(node):
+    """Scans a node's output files; returns (bam_filename, has_index), where
+    has_index is True if any output ends in '.bai'. If several BAMs are
+    listed, the last one encountered is returned."""
+    input_filename, has_index = None, False
+    for filename in node.output_files:
+        if filename.lower().endswith(".bai"):
+            has_index = True
+        elif filename.lower().endswith(".bam"):
+            input_filename = filename
+
+    return input_filename, has_index
+
+
+class CleanupBAMNode(PicardNode):
+    """Node that filters a BAM ('samtools view'), replaces its read-groups
+    and coordinate-sorts it (Picard AddOrReplaceReadGroups), and updates MD
+    tags ('samtools calmd'), running the three commands in parallel with a
+    named pipe between the latter two."""
+
+    def __init__(self, config, reference, input_bam, output_bam, tags,
+                 min_mapq=0, filter_unmapped=False, dependencies=()):
+        # Optional filtering by mapping quality and/or unmapped-flag (0x4).
+        flt_params = AtomicCmdBuilder(("samtools", "view", "-bu"),
+                                      IN_BAM=input_bam,
+                                      OUT_STDOUT=AtomicCmd.PIPE)
+
+        if min_mapq:
+            flt_params.set_option("-q", min_mapq, sep="")
+        if filter_unmapped:
+            flt_params.set_option("-F", "0x4", sep="")
+
+        flt_params.add_value("%(IN_BAM)s")
+
+        jar_params = picard.picard_command(config, "AddOrReplaceReadGroups")
+        jar_params.set_option("INPUT", "/dev/stdin", sep="=")
+        # Output is written to a named pipe, since the JVM may, in some cases,
+        # emit warning messages to stdout, resulting in a malformed BAM.
+        jar_params.set_option("OUTPUT", "%(TEMP_OUT_BAM)s", sep="=")
+        jar_params.set_option("COMPRESSION_LEVEL", "0", sep="=")
+        # Ensure that the BAM is sorted; this is required by the pipeline, and
+        # needs to be done before calling calmd (avoiding pathologic runtimes).
+        jar_params.set_option("SORT_ORDER", "coordinate", sep="=")
+
+        # All tags are overwritten; ID is set since the default (e.g. '1')
+        # causes problems with pysam due to type inference (is read as a length
+        # 1 string, but written as a character).
+        for tag in ("ID", "SM", "LB", "PU", "PL"):
+            jar_params.set_option(tag, tags[tag], sep="=")
+
+        jar_params.set_kwargs(IN_STDIN=flt_params,
+                              TEMP_OUT_BAM="bam.pipe")
+
+        calmd = AtomicCmdBuilder(["samtools", "calmd", "-b",
+                                  "%(TEMP_IN_BAM)s", "%(IN_REF)s"],
+                                 IN_REF=reference,
+                                 TEMP_IN_BAM="bam.pipe",
+                                 OUT_STDOUT=output_bam)
+
+        commands = [cmd.finalize() for cmd in (flt_params, jar_params, calmd)]
+        description = "<Cleanup BAM: %s -> '%s'>" \
+            % (input_bam, output_bam)
+        PicardNode.__init__(self,
+                            command=ParallelCmds(commands),
+                            description=description,
+                            dependencies=dependencies)
+
+    def _setup(self, config, temp_root):
+        # Create the named pipe used between AddOrReplaceReadGroups and calmd.
+        PicardNode._setup(self, config, temp_root)
+        os.mkfifo(os.path.join(temp_root, "bam.pipe"))
diff --git a/paleomix/tools/bam_pipeline/parts/__init__.py b/paleomix/tools/bam_pipeline/parts/__init__.py
new file mode 100644
index 0000000..df9cb53
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/__init__.py
@@ -0,0 +1,7 @@
+from reads import Reads
+from lane import Lane
+from library import Library
+from sample import Sample
+from prefix import Prefix
+from target import Target
+from statistics import add_statistics_nodes
diff --git a/paleomix/tools/bam_pipeline/parts/lane.py b/paleomix/tools/bam_pipeline/parts/lane.py
new file mode 100644
index 0000000..34ce62d
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/lane.py
@@ -0,0 +1,310 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import copy
+
+import paleomix.tools.bam_pipeline.paths as paths
+
+from paleomix.atomiccmd.builder import \
+ apply_options
+
+from paleomix.common.makefile import \
+ MakefileError
+
+from paleomix.nodes.bwa import \
+ BWAAlgorithmNode, \
+ BWABacktrack, \
+ BWASampe, \
+ BWASamse
+from paleomix.nodes.bowtie2 import \
+ Bowtie2Node
+
+from paleomix.tools.bam_pipeline.parts import \
+ Reads
+from paleomix.tools.bam_pipeline.nodes import \
+ CleanupBAMNode, \
+ index_and_validate_bam
+
+from paleomix.common.fileutils import \
+ swap_ext
+
+
# Cache of Reads objects, keyed by (target, sample, library, lane) tags;
# shared between Lane objects so that reads described by the same record
# are only trimmed once, even when mapped against multiple prefixes.
_TRIMMED_READS_CACHE = {}
+
+
class Lane:
    """A single sequencing lane (read-group) processed against one prefix.

    Wraps either a user-supplied pre-aligned BAM (which is cleaned up and
    re-tagged), or a set of FASTQ reads which are trimmed (see Reads) and
    mapped using BWA or Bowtie2. Resulting alignments are exposed via
    'self.bams' as {read-type: {filename: node}} mappings.
    """

    def __init__(self, config, prefix, record, name):
        self.name = name
        self.bams = {}
        self.reads = None
        # Deep copies ensure that the per-lane modifications below do not
        # leak into other lanes built from the same makefile record.
        self.options = copy.deepcopy(record["Options"])
        self.tags = tags = copy.deepcopy(record["Tags"])
        self.tags["PU"] = self.tags["PU_src"]
        self.tags["PG"] = self.tags["PG"].lower()
        self.folder = os.path.join(config.destination,
                                   tags["Target"],
                                   prefix["Name"],
                                   tags["SM"],
                                   tags["LB"],
                                   tags["PU_cur"])

        if record["Type"] == "BAMs":
            self._init_pre_aligned_lane(config, prefix, record)
        else:
            self._init_reads(config, record)
            self._init_unaligned_lane(config, prefix, record)

    def _init_pre_aligned_lane(self, config, prefix, record):
        """Sets up cleanup / indexing / validation of a user-supplied BAM."""
        if prefix["Name"] not in record["Data"]:
            return

        input_filename = record["Data"][prefix["Name"]]
        output_filename = os.path.join(self.folder, "processed.bam")

        node = CleanupBAMNode(config=config,
                              reference=prefix["Reference"],
                              input_bam=input_filename,
                              output_bam=output_filename,
                              tags=self.tags,
                              dependencies=prefix["Nodes"])

        index_required = self._is_indexing_required(prefix)
        validated_node = index_and_validate_bam(config, prefix, node,
                                                create_index=index_required)

        self.bams["Processed"] = {output_filename: validated_node}

    def _init_reads(self, config, record):
        """Fetches (or creates) the shared trimming nodes for this lane.

        Reads objects are cached by (target, sample, library, lane), so
        that reads are only trimmed once even when a lane is mapped
        against multiple prefixes.
        """
        key = tuple(self.tags[key] for key in ("Target", "SM", "LB", "PU_cur"))
        if key not in _TRIMMED_READS_CACHE:
            _TRIMMED_READS_CACHE[key] \
                = Reads(config, record, record["Options"]["QualityOffset"])
        self.reads = _TRIMMED_READS_CACHE[key]

    def _init_unaligned_lane(self, config, prefix, record):
        """Builds one mapping node (BWA or Bowtie2) per set of trimmed reads."""
        prefix_key = "Nodes:%s" % (self.options["Aligners"]["Program"],)

        for (key, input_filename) in self.reads.files.iteritems():
            # Common parameters between BWA / Bowtie2
            output_filename = os.path.join(self.folder,
                                           "%s.bam" % (key.lower(),))

            parameters = {
                "input_file": input_filename,
                "output_file": output_filename,
                "prefix": prefix["Path"],
                "reference": prefix["Reference"],
                "dependencies": self.reads.nodes + prefix[prefix_key],
            }

            nodes = self._build_alignment_nodes(config=config,
                                                record=record,
                                                prefix=prefix,
                                                parameters=parameters)

            self.bams[key] = {output_filename: nodes}

    def _build_alignment_nodes(self, config, record, prefix, parameters):
        """Dispatches to the aligner selected in the makefile options."""
        if self.options["Aligners"]["Program"] == "BWA":
            algorithm = self.options["Aligners"]["BWA"]["Algorithm"].lower()
            parameters["threads"] = config.bwa_max_threads
            parameters["algorithm"] = algorithm

            if algorithm == "backtrack":
                return self._build_bwa_backtrack(config=config,
                                                 record=record,
                                                 prefix=prefix,
                                                 parameters=parameters)
            elif algorithm in ('mem', 'bwasw'):
                return self._build_bwa_algorithm(config=config,
                                                 record=record,
                                                 prefix=prefix,
                                                 parameters=parameters)
            else:
                raise NotImplementedError('BWA %r not implemented!'
                                          % (algorithm,))

        elif self.options["Aligners"]["Program"] == "Bowtie2":
            return self._build_bowtie2(config=config,
                                       record=record,
                                       prefix=prefix,
                                       parameters=parameters)
        else:
            raise NotImplementedError('Aligner %r not implemented!'
                                      % (self.options["Aligners"]["Program"],))

    def _is_indexing_required(self, prefix):
        """Returns true if indexing lane BAMs is necessary.
        """
        # Indexes are required for all files when calculating region statistics
        return bool(prefix.get("RegionsOfInterest")) or \
            (self.options["Features"]["RealignedBAM"] and not
             # and for BAMs fed to GATK, but in this case we only use these
             # indexes if we don't generate PCR filtered or recaled BAMs
             (self.options["Features"]["mapDamage"] == 'rescale' or
              self.options["PCRDuplicates"]))

    def _set_rg_tags(self, command):
        """Adds read-group (@RG) tags to a SAM/BAM conversion command."""
        command.set_option("--rg-id", self.tags["ID"])
        for tag_name in ("SM", "LB", "PU", "PL", "PG"):
            tag_value = "%s:%s" % (tag_name, self.tags[tag_name])
            command.add_option("--rg", tag_value)

    ###########################################################################
    ###########################################################################
    # Construction of mapping nodes

    def _build_bwa_backtrack(self, config, prefix, record, parameters):
        """Builds 'bwa aln' + 'bwa samse'/'sampe' nodes for SE or PE reads."""
        if paths.is_paired_end(parameters["input_file"]):
            return self._build_bwa_backtrack_pe(config=config,
                                                prefix=prefix,
                                                record=record,
                                                parameters=parameters)
        else:
            return self._build_bwa_backtrack_se(config=config,
                                                prefix=prefix,
                                                record=record,
                                                parameters=parameters)

    def _build_bwa_backtrack_aln(self, parameters, input_file, output_file):
        """Builds a 'bwa aln' node producing a .sai file for one FASTQ file,
        applying the user-specified BWA options.
        """
        node = BWABacktrack.customize(input_file=input_file,
                                      output_file=output_file,
                                      threads=parameters["threads"],
                                      prefix=parameters["prefix"],
                                      reference=parameters["reference"],
                                      dependencies=parameters["dependencies"])

        if not self.options["Aligners"]["BWA"]["UseSeed"]:
            # Effectively disables seeding, as recommended for aDNA reads
            node.commands["aln"].set_option("-l", 2 ** 16 - 1)

        if self.options["QualityOffset"] in (64, "Solexa"):
            # Tell BWA that input qualities are Illumina 1.3+ encoded
            node.commands["aln"].set_option("-I")

        apply_options(node.commands["aln"], self.options["Aligners"]["BWA"])

        return node.build_node()

    def _build_bwa_backtrack_se(self, config, prefix, record, parameters):
        """Builds 'bwa aln' + 'bwa samse' nodes for single-ended reads."""
        input_file_fq = parameters.pop("input_file")
        output_file_bam = parameters.pop("output_file")
        output_file_sai = swap_ext(output_file_bam, ".sai")

        aln_node = self._build_bwa_backtrack_aln(parameters=parameters,
                                                 input_file=input_file_fq,
                                                 output_file=output_file_sai)

        sam_node = BWASamse.customize(input_file_fq=input_file_fq,
                                      input_file_sai=output_file_sai,
                                      output_file=output_file_bam,
                                      prefix=parameters["prefix"],
                                      reference=parameters["reference"],
                                      dependencies=aln_node)

        return self._finalize_nodes(config, prefix, parameters, sam_node)

    def _build_bwa_backtrack_pe(self, config, prefix, record, parameters):
        """Builds two 'bwa aln' nodes + one 'bwa sampe' node for PE reads."""
        template = parameters.pop("input_file")
        output_bam = parameters.pop("output_file")

        aln_files = []
        aln_nodes = []
        for mate in (1, 2):
            input_file = template.format(Pair=mate)
            output_sai = swap_ext(output_bam, "%i.sai" % (mate,))

            aln_node = self._build_bwa_backtrack_aln(parameters=parameters,
                                                     input_file=input_file,
                                                     output_file=output_sai)

            aln_files.append(output_sai)
            aln_nodes.append(aln_node)

        sam_node = BWASampe.customize(input_file_sai_1=aln_files[0],
                                      input_file_sai_2=aln_files[1],
                                      input_file_fq_1=template.format(Pair=1),
                                      input_file_fq_2=template.format(Pair=2),
                                      output_file=output_bam,
                                      prefix=parameters['prefix'],
                                      reference=parameters["reference"],
                                      dependencies=aln_nodes)

        return self._finalize_nodes(config, prefix, parameters, sam_node)

    def _build_bwa_algorithm(self, config, prefix, record, parameters):
        """Builds a 'bwa mem' / 'bwa bwasw' node; only offset-33 input is
        supported by these algorithms.
        """
        if self.options["QualityOffset"] != 33:
            # Interpolate the algorithm name; previously the '%r'
            # placeholder was left unformatted in the error message.
            algorithm = self.options["Aligners"]["BWA"]["Algorithm"]
            raise MakefileError("Mapping with BWA using the %r algorithm "
                                "currently does not support QualityOffsets "
                                "other than 33; please convert your FASTQ "
                                "if you wish to proceed." % (algorithm,))

        self._set_pe_input_files(parameters)
        node = BWAAlgorithmNode.customize(**parameters)

        return self._finalize_nodes(config, prefix, parameters, node)

    def _build_bowtie2(self, config, prefix, record, parameters):
        """Builds a Bowtie2 node, applying user-specified Bowtie2 options."""
        self._set_pe_input_files(parameters)
        node = Bowtie2Node.customize(threads=config.bowtie2_max_threads,
                                     **parameters)

        command = node.commands["aln"]
        if self.options["QualityOffset"] == 33:
            command.set_option("--phred33")
        else:
            command.set_option("--phred64")

        for (key, value) in self.options["Aligners"]["Bowtie2"].iteritems():
            if key.startswith("-"):
                command.set_option(key, value)

        return self._finalize_nodes(config, prefix, parameters, node)

    def _finalize_nodes(self, config, prefix, parameters, node):
        """Adds RG tags / quality filters to the conversion step and wraps
        the mapping node in indexing / validation nodes.
        """
        self._set_rg_tags(node.commands["convert"])

        # NOTE(review): the BWA 'MinQuality' / 'FilterUnmappedReads' options
        # are applied here even when mapping with Bowtie2 -- confirm whether
        # this is intended or whether the Bowtie2 options should be used.
        min_quality = self.options["Aligners"]["BWA"]["MinQuality"]
        node.commands["convert"].set_option('-q', min_quality)

        if self.options["Aligners"]["BWA"]["FilterUnmappedReads"]:
            node.commands["convert"].set_option('-F', "0x4")

        index_required = self._is_indexing_required(prefix)
        validated_node = index_and_validate_bam(config=config,
                                                prefix=parameters['prefix'],
                                                node=node.build_node(),
                                                create_index=index_required)

        return validated_node

    @classmethod
    def _set_pe_input_files(cls, parameters):
        """Replaces 'input_file' with 'input_file_1' / 'input_file_2' keys;
        the second file is None for single-ended input.
        """
        template = parameters.pop("input_file")
        if paths.is_paired_end(template):
            parameters["input_file_1"] = template.format(Pair=1)
            parameters["input_file_2"] = template.format(Pair=2)
        else:
            parameters["input_file_1"] = template
            parameters["input_file_2"] = None
diff --git a/paleomix/tools/bam_pipeline/parts/library.py b/paleomix/tools/bam_pipeline/parts/library.py
new file mode 100644
index 0000000..55d36fd
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/library.py
@@ -0,0 +1,255 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import types
+
+from paleomix.common.utilities import \
+ safe_coerce_to_tuple
+
+from paleomix.nodes.picard import \
+ MarkDuplicatesNode
+from paleomix.atomiccmd.builder import \
+ apply_options
+from paleomix.nodes.mapdamage import \
+ MapDamagePlotNode, \
+ MapDamageModelNode, \
+ MapDamageRescaleNode
+from paleomix.tools.bam_pipeline.nodes import \
+ index_and_validate_bam
+from paleomix.nodes.commands import \
+ DuplicateHistogramNode, \
+ FilterCollapsedBAMNode
+from paleomix.nodes.validation import \
+ DetectInputDuplicationNode
+
+
class Library:
    """Represents a single library in a BAM pipeline.

    Is responsible for aggregating per-lane BAMs, removal of PCR duplicates,
    rescaling of quality-scores using mapDamage, as well as running mapDamage
    for QC purposes.

    Properties:
       name -- Name of the library (as specified in makefile)
       lanes -- Tuple of lanes associated with the library
       options -- Makefile options that apply to the current library
       folder -- Folder containing files associated with library. Is used as
           a prefix for files generated by this class.
       bams -- Dictionary of BAM filenames -> nodes, for each BAM generated by
           the Library class. Depending on options, this may either be newly
           generated files, or the files produced by Lanes.
    """

    def __init__(self, config, target, prefix, lanes, name):
        self.name = name
        self.lanes = safe_coerce_to_tuple(lanes)
        # Index the coerced tuple, not the raw argument: 'lanes' may be a
        # single Lane object rather than a sequence.
        self.options = self.lanes[0].options
        self.folder = os.path.dirname(self.lanes[0].folder)

        assert all((self.folder == os.path.dirname(lane.folder))
                   for lane in self.lanes)
        assert all((self.options == lane.options) for lane in self.lanes)

        lane_bams = self._collect_bams_by_type(self.lanes)

        pcr_duplicates = self.options["PCRDuplicates"]
        if pcr_duplicates:
            # pcr_duplicates may be "mark" or any trueish value
            lane_bams = self._remove_pcr_duplicates(config, prefix,
                                                    lane_bams, pcr_duplicates)

        # At this point we no longer need to differentiate between types
        # of reads
        files_and_nodes = self._collect_files_and_nodes(lane_bams)

        # Collect output bams, possible following rescaling
        self.bams, mapdamage_nodes \
            = self._build_mapdamage_nodes(config, target, prefix,
                                          files_and_nodes)

        nodes = [self._build_dataduplication_node(lane_bams)]
        nodes.extend(mapdamage_nodes)

        histogram_node = self._build_duphist_nodes(config, target, prefix,
                                                   lane_bams)
        if histogram_node:
            nodes.append(histogram_node)

        self.nodes = tuple(nodes)

    @classmethod
    def _collect_bams_by_type(cls, lanes):
        """Groups the lanes' BAMs into 'collapsed' and 'normal' mappings,
        since these require different PCR duplicate filtering tools.
        """
        bams = {}
        for lane in lanes:
            for key, files in lane.bams.iteritems():
                key = "collapsed" if (key == "Collapsed") else "normal"
                bams.setdefault(key, {}).update(files)

        return bams

    @classmethod
    def _collect_files_and_nodes(cls, bams):
        """Flattens a {type: {filename: node}} mapping into {filename: node}."""
        files_and_nodes = {}
        for dd in bams.itervalues():
            files_and_nodes.update(dd)
        return files_and_nodes

    def _remove_pcr_duplicates(self, config, prefix, bams, strategy):
        """Builds PCR duplicate removal/marking nodes for each type of BAM;
        collapsed reads use a dedicated filter, others Picard MarkDuplicates.
        """
        rmdup_cls = {"collapsed": FilterCollapsedBAMNode,
                     "normal": MarkDuplicatesNode}

        keep_duplicates = False
        if isinstance(strategy, types.StringTypes) \
                and (strategy.lower() == "mark"):
            keep_duplicates = True

        # Indexing is required if we wish to calculate per-region statistics,
        index_required = (bool(prefix.get("RegionsOfInterest")) or
                          # or if we wish to run GATK, but only if we don't
                          # use a downstream rescaled BAM as input for GATK
                          (self.options["Features"]["RealignedBAM"] and not
                           self.options["Features"]["mapDamage"] == 'rescale'))

        results = {}
        for (key, files_and_nodes) in bams.items():
            output_filename = self.folder + ".rmdup.%s.bam" % key
            node = rmdup_cls[key](config=config,
                                  input_bams=files_and_nodes.keys(),
                                  output_bam=output_filename,
                                  keep_dupes=keep_duplicates,
                                  dependencies=files_and_nodes.values())
            validated_node = index_and_validate_bam(config, prefix, node,
                                                    create_index=index_required)

            results[key] = {output_filename: validated_node}
        return results

    def _build_mapdamage_nodes(self, config, target, prefix, files_and_nodes):
        """Builds mapDamage nodes according to the 'mapDamage' feature;
        returns a ({filename: node}, nodes) tuple, where the first element
        is the (possibly rescaled) set of output BAMs.
        """
        # Messing with these does not cause the pipeline to re-do other stuff
        destination = os.path.join(config.destination,
                                   "%s.%s.mapDamage"
                                   % (target, prefix["Name"]), self.name)

        run_type = self.options["Features"]["mapDamage"]
        if run_type == 'rescale':
            return self._mapdamage_rescale(config=config,
                                           destination=destination,
                                           prefix=prefix,
                                           files_and_nodes=files_and_nodes)

        elif run_type == 'model':
            # Run of mapDamage including both plots and damage models
            node = self._mapdamage_model(config=config,
                                         destination=destination,
                                         prefix=prefix,
                                         files_and_nodes=files_and_nodes)

            return files_and_nodes, (node,)
        elif run_type == 'plot':
            # Basic run of mapDamage, only generates plots / tables
            node = self._mapdamage_plot(config=config,
                                        destination=destination,
                                        prefix=prefix,
                                        files_and_nodes=files_and_nodes)

            return files_and_nodes, (node,)
        else:
            assert run_type == 'no', run_type
            return files_and_nodes, ()

    def _mapdamage_plot(self, config, destination, prefix, files_and_nodes):
        """Builds a basic mapDamage node, generating plots / tables only."""
        title = "mapDamage plot for library %r" % (self.name,)

        dependencies = files_and_nodes.values()
        plot = MapDamagePlotNode.customize(config=config,
                                           reference=prefix["Path"],
                                           input_files=files_and_nodes.keys(),
                                           output_directory=destination,
                                           title=title,
                                           dependencies=dependencies)
        apply_options(plot.command, self.options["mapDamage"])

        return plot.build_node()

    def _mapdamage_model(self, config, destination, prefix, files_and_nodes):
        """Builds plot + damage-model mapDamage nodes."""
        # Generates basic plots / table files
        plot = self._mapdamage_plot(config=config,
                                    destination=destination,
                                    prefix=prefix,
                                    files_and_nodes=files_and_nodes)

        # Builds model of post-mortem DNA damage
        model = MapDamageModelNode.customize(reference=prefix["Reference"],
                                             directory=destination,
                                             dependencies=plot)
        apply_options(model.command, self.options["mapDamage"])
        return model.build_node()

    def _mapdamage_rescale(self, config, destination, prefix,
                           files_and_nodes):
        """Builds plot + model + rescaling nodes; the rescaled BAM replaces
        the input BAMs as this library's output.
        """
        model = self._mapdamage_model(config=config,
                                      destination=destination,
                                      prefix=prefix,
                                      files_and_nodes=files_and_nodes)

        # Rescales BAM quality scores using model built above
        input_files = files_and_nodes.keys()
        output_filename = self.folder + ".rescaled.bam"

        scale = MapDamageRescaleNode.customize(config=config,
                                               reference=prefix["Reference"],
                                               input_files=input_files,
                                               output_file=output_filename,
                                               directory=destination,
                                               dependencies=model)
        apply_options(scale.command, self.options["mapDamage"])
        scale = scale.build_node()

        # Grab indexing and validation nodes, required by ROIs and GATK
        index_required = bool(prefix.get("RegionsOfInterest")) \
            or self.options["Features"]["RealignedBAM"]
        validate = index_and_validate_bam(config, prefix, scale,
                                          create_index=index_required)

        return {output_filename: validate}, (model,)

    def _build_duphist_nodes(self, config, target, prefix, files_and_nodes):
        """Builds a duplicate-histogram node, or None if the feature is off."""
        if not self.options["Features"]["DuplicateHist"]:
            return None

        input_files = []
        dependencies = []
        for values in files_and_nodes.itervalues():
            for (filename, node) in values.iteritems():
                input_files.append(filename)
                dependencies.append(node)

        folder = "%s.%s.duphist" % (target, prefix["Name"])
        destination = os.path.join(config.destination, folder,
                                   self.name + ".txt")

        return DuplicateHistogramNode(config=config,
                                      input_files=input_files,
                                      output_file=destination,
                                      dependencies=dependencies)

    def _build_dataduplication_node(self, bams):
        """Builds a node checking that no input file was included twice."""
        files_and_nodes = self._collect_files_and_nodes(bams)

        return DetectInputDuplicationNode(
            input_files=files_and_nodes.keys(),
            output_file=self.folder + ".duplications_checked",
            dependencies=files_and_nodes.values())
diff --git a/paleomix/tools/bam_pipeline/parts/prefix.py b/paleomix/tools/bam_pipeline/parts/prefix.py
new file mode 100644
index 0000000..fe818db
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/prefix.py
@@ -0,0 +1,106 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.common.utilities import safe_coerce_to_tuple
+from paleomix.nodes.picard import MergeSamFilesNode
+from paleomix.tools.bam_pipeline.nodes import \
+ index_and_validate_bam
+from paleomix.nodes.validation import \
+ DetectInputDuplicationNode
+
+import paleomix.nodes.gatk as gatk
+
+
class Prefix:
    """Aggregates the per-sample BAMs mapped against a single reference
    prefix, optionally merging them into a raw and/or GATK-realigned BAM.
    """

    def __init__(self, config, prefix, samples, features, target):
        self.name = prefix["Name"]
        self.label = prefix.get("Label") or self.name
        self.reference = prefix["Reference"]
        self.roi = prefix.get("RegionsOfInterest", {})

        self.samples = safe_coerce_to_tuple(samples)
        self.folder = config.destination
        self.target = target

        # Collect every BAM produced by the samples, mapped to its node
        bams_to_nodes = {}
        for sample in self.samples:
            bams_to_nodes.update(sample.bams.iteritems())

        self.datadup_check \
            = self._build_dataduplication_node(prefix, bams_to_nodes)

        self.bams = {}
        if features["RawBAM"]:
            self.bams.update(
                self._build_raw_bam(config, prefix, bams_to_nodes))
        if features["RealignedBAM"]:
            self.bams.update(
                self._build_realigned_bam(config, prefix, bams_to_nodes))

        if not self.bams:
            # No merged BAMs were requested; expose per-sample BAMs instead
            for sample in self.samples:
                self.bams.update(sample.bams)

        nodes = [self.datadup_check]
        for sample in self.samples:
            nodes.extend(sample.nodes)
        self.nodes = tuple(nodes)

    def _build_raw_bam(self, config, prefix, files_and_bams):
        """Merges all sample BAMs into one unmodified BAM for this prefix."""
        output_filename = os.path.join(
            self.folder, "%s.%s.bam" % (self.target, prefix["Name"]))
        validated_filename = os.path.join(
            self.folder, self.target, prefix["Name"] + ".validated")

        merge = MergeSamFilesNode(config=config,
                                  input_bams=files_and_bams.keys(),
                                  output_bam=output_filename,
                                  dependencies=self.datadup_check)
        validated = index_and_validate_bam(config, prefix, merge,
                                           validated_filename)

        return {output_filename: validated}

    def _build_realigned_bam(self, config, prefix, bams):
        """Merges the sample BAMs, realigning reads around indels via GATK."""
        output_filename = os.path.join(
            self.folder, "%s.%s.realigned.bam" % (self.target,
                                                  prefix["Name"]))
        intervals_filename = os.path.join(
            self.folder, self.target, prefix["Name"] + ".intervals")
        validated_filename = os.path.join(
            self.folder, self.target,
            prefix["Name"] + ".realigned.validated")

        # Identify intervals around indels which may benefit from realignment
        trainer = gatk.GATKIndelTrainerNode(config=config,
                                            reference=prefix["Reference"],
                                            infiles=bams.keys(),
                                            outfile=intervals_filename,
                                            threads=config.gatk_max_threads,
                                            dependencies=self.datadup_check)

        # Realign reads falling in the intervals identified above
        aligner = gatk.GATKIndelRealignerNode(config=config,
                                              reference=prefix["Reference"],
                                              infiles=bams.keys(),
                                              intervals=intervals_filename,
                                              outfile=output_filename,
                                              dependencies=trainer)

        validated = index_and_validate_bam(config, prefix, aligner,
                                           validated_filename)

        return {output_filename: validated}

    def _build_dataduplication_node(self, prefix, files_and_nodes):
        """Builds a node that checks no input file was used more than once."""
        destination = os.path.join(
            self.folder, self.target,
            prefix["Name"] + ".duplications_checked")
        return DetectInputDuplicationNode(
            input_files=files_and_nodes.keys(),
            output_file=destination,
            dependencies=files_and_nodes.values())
diff --git a/paleomix/tools/bam_pipeline/parts/reads.py b/paleomix/tools/bam_pipeline/parts/reads.py
new file mode 100644
index 0000000..4416ff1
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/reads.py
@@ -0,0 +1,117 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.atomiccmd.builder import apply_options
+from paleomix.nodes.adapterremoval import \
+ SE_AdapterRemovalNode, \
+ PE_AdapterRemovalNode
+from paleomix.nodes.validation import \
+ ValidateFASTQFilesNode
+
+
class Reads(object):
    """Represents the reads of a single lane; sets up validation of
    pre-trimmed reads, or adapter-removal nodes for raw reads. Trimmed
    output files are exposed via 'self.files' as {type: filename}.
    """

    def __init__(self, config, record, quality_offset):
        # quality_offset is 33, 64, or the string "Solexa"
        self.quality_offset = quality_offset
        self.files = {}
        self.stats = None
        self.nodes = ()

        tags = record["Tags"]
        self.folder = os.path.join(config.destination, tags["Target"], "reads",
                                   tags["SM"], tags["LB"], tags["PU_cur"])

        lane_type = record.get("Type")
        if lane_type == "Raw":
            self._init_raw_reads(config, record)
        elif lane_type == "Trimmed":
            self._init_pretrimmed_reads(record)
        else:
            assert False, "Unexpected data type in Reads(): %s" \
                % (repr(lane_type))

        # Drop any read types the user asked to exclude from mapping
        for name, value in record["Options"]["ExcludeReads"].iteritems():
            if value:
                self.files.pop(name, None)

    def _init_pretrimmed_reads(self, record):
        """Sets up FASTQ validation of already-trimmed input files."""
        self.files.update(record["Data"])
        output_file = os.path.join(self.folder, "reads.pretrimmed.validated")
        input_files = set()
        for (read_type, filename) in self.files.iteritems():
            if read_type == "Paired":
                # Paired entries are templates containing a {Pair} key
                input_files.add(filename.format(Pair=1))
                input_files.add(filename.format(Pair=2))
            else:
                input_files.add(filename)

        node = ValidateFASTQFilesNode(input_files=input_files,
                                      output_file=output_file,
                                      offset=self.quality_offset)
        self.nodes = (node,)

    def _init_raw_reads(self, config, record):
        """Sets up an AdapterRemoval node trimming raw SE or PE reads."""
        # Copy so that pops below do not modify the makefile record
        ar_options = dict(record["Options"]["AdapterRemoval"])
        # Setup of "--collapsed" is handled by the node itself
        collapse_reads = ar_options.pop("--collapse")
        # Unset (None) means collapsing is enabled by default
        collapse_reads = collapse_reads or collapse_reads is None

        init_args = {"output_prefix": os.path.join(self.folder, "reads"),
                     "output_format": record["Options"]["CompressionFormat"],
                     "threads": config.adapterremoval_max_threads}
        output_tmpl = "{output_prefix}.%s.{output_format}".format(**init_args)

        if ("SE" in record["Data"]):
            self.files["Single"] = output_tmpl % ("truncated",)
            init_args["input_files"] = record["Data"]["SE"]
            command = SE_AdapterRemovalNode.customize(**init_args)
        else:
            self.files["Singleton"] = output_tmpl % ("singleton.truncated",)
            self.files["Paired"] = output_tmpl % ("pair{Pair}.truncated",)

            if collapse_reads:
                self.files["Collapsed"] = output_tmpl % ("collapsed",)
                self.files["CollapsedTruncated"] = output_tmpl % ("collapsed.truncated",)

            init_args["collapse"] = collapse_reads
            init_args["input_files_1"] = record["Data"]["PE_1"]
            init_args["input_files_2"] = record["Data"]["PE_2"]
            command = PE_AdapterRemovalNode.customize(**init_args)

        # Ensure that any user-specified list of adapters is tracked
        if "--adapter-list" in ar_options:
            adapter_list = ar_options.pop("--adapter-list")
            command.command.set_option("--adapter-list", "%(IN_ADAPTER_LIST)s")
            command.command.set_kwargs(IN_ADAPTER_LIST=adapter_list)

        # Remaining user options are applied verbatim; must happen after the
        # pops above so handled options are not applied twice.
        apply_options(command.command, ar_options)

        # Solexa input is written back using the Illumina 1.3+ offset;
        # presumably AdapterRemoval accepts "Solexa" as a --qualitybase
        # value -- TODO confirm against the AdapterRemoval docs.
        output_quality = self.quality_offset
        if output_quality == "Solexa":
            output_quality = "64"

        command.command.set_option("--qualitybase", self.quality_offset)
        command.command.set_option("--qualitybase-output", output_quality)

        self.stats = os.path.join(self.folder, "reads.settings")
        self.nodes = (command.build_node(),)
diff --git a/paleomix/tools/bam_pipeline/parts/sample.py b/paleomix/tools/bam_pipeline/parts/sample.py
new file mode 100644
index 0000000..c1ce889
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/sample.py
@@ -0,0 +1,41 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.common.utilities import safe_coerce_to_tuple
+
+
class Sample:
    """Aggregates the libraries belonging to a single sample; collects
    their BAMs (as a {filename: node} mapping) and their nodes.
    """

    def __init__(self, config, prefix, libraries, name):
        self.name = name
        self.libraries = safe_coerce_to_tuple(libraries)
        # Every library of a sample lives under a shared parent folder
        self.folder = os.path.dirname(self.libraries[0].folder)

        # Merge the filename -> node mappings of all libraries
        self.bams = {}
        for library in self.libraries:
            self.bams.update(library.bams.iteritems())

        # Flatten the per-library node lists into a single tuple
        self.nodes = tuple(node
                           for library in self.libraries
                           for node in library.nodes)
diff --git a/paleomix/tools/bam_pipeline/parts/statistics.py b/paleomix/tools/bam_pipeline/parts/statistics.py
new file mode 100644
index 0000000..140de45
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/statistics.py
@@ -0,0 +1,210 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import collections
+
+from paleomix.common.fileutils import \
+ swap_ext
+
+from paleomix.nodes.commands import \
+ CoverageNode, \
+ MergeCoverageNode, \
+ DepthHistogramNode
+from paleomix.tools.bam_pipeline.parts.summary import \
+ SummaryTableNode
+
+
def add_statistics_nodes(config, makefile, target):
    """Appends depth / coverage / summary statistics nodes to 'target'.

    Which nodes are built is controlled by the makefile's
    Options -> Features flags ("Depths", "Coverage", "Summary").
    """
    features = makefile["Options"]["Features"]

    new_nodes = []
    if features["Depths"]:
        new_nodes.extend(_build_depth(config, target))

    if features["Summary"] or features["Coverage"]:
        make_summary = features["Summary"]
        coverage = _build_coverage(config, target, make_summary)
        if make_summary:
            # The summary table depends on (and therefore implies) the
            # coverage nodes; no need to add them separately.
            new_nodes.append(_build_summary_node(config, makefile,
                                                 target, coverage))
        elif features["Coverage"]:
            new_nodes.extend(coverage["Nodes"])

    target.nodes.extend(new_nodes)
+
+
def _build_summary_node(config, makefile, target, coverage):
    """Builds the SummaryTableNode for 'target'.

    Coverage tables are re-collected keyed by prefix *label* (rather than
    name), since that is how the summary table groups genomes.
    """
    coverage_by_label = _build_coverage_nodes(config, target, use_label=True)

    return SummaryTableNode(
        config=config,
        makefile=makefile,
        target=target,
        cov_for_lanes=coverage_by_label["Lanes"],
        cov_for_libs=coverage_by_label["Libraries"],
        dependencies=coverage["Nodes"])
+
+
def _build_depth(config, target):
    """Builds one DepthHistogramNode per prefix and region of interest."""
    nodes = []
    for prefix in target.prefixes:
        for (roi_name, roi_filename) in _get_roi(prefix, name_prefix="."):
            if roi_filename is None:
                # Whole-genome histogram: use the per-sample BAMs directly.
                bams = {}
                for sample in prefix.samples:
                    bams.update(sample.bams)
                dependencies = bams.values()
                input_files = bams.keys()
            else:
                # ROIs require indexed access, and hence that the final BAM
                # (either raw or realigned) has been built. By default the
                # realigned BAM is used (based on lexical order).
                final_bams = tuple(sorted(prefix.bams.items()))
                input_files, dependencies = final_bams[-1]

            output_filename = "%s.%s%s.depths" % (target.name, prefix.name,
                                                  roi_name)
            output_fpath = os.path.join(config.destination, output_filename)

            nodes.append(DepthHistogramNode(config=config,
                                            target_name=target.name,
                                            input_files=input_files,
                                            regions_file=roi_filename,
                                            output_file=output_fpath,
                                            dependencies=dependencies))

    return nodes
+
+
def _aggregate_for_prefix(cov, prefix, roi_name=None, into=None):
    """Merges {filename: node} dicts for keys matching 'prefix'.

    When 'prefix' is None, all entries are merged. Results are collected
    into 'into' when given (mutating it), otherwise into a fresh dict.
    """
    label = _get_prefix_label(prefix, roi_name)
    results = into if into is not None else {}
    for (key, files_and_nodes) in cov.iteritems():
        # key[0] is the prefix label component of the coverage key.
        if label is None or key[0] == label:
            results.update(files_and_nodes)
    return results
+
+
def _build_coverage(config, target, make_summary):
    """Builds merged, per-prefix coverage tables for 'target'.

    Returns the dict from _build_coverage_nodes with an extra "Nodes" key
    holding every node required to produce the coverage tables.
    """
    coverage = _build_coverage_nodes(config, target)

    merged_nodes = []
    for prefix in target.prefixes:
        for (roi_name, _) in _get_roi(prefix):
            label = _get_prefix_label(prefix.name, roi_name)
            if roi_name:
                postfix = "%s.%s" % (prefix.name, roi_name)
            else:
                postfix = prefix.name

            files_and_nodes = _aggregate_for_prefix(coverage["Libraries"],
                                                    label)
            output_filename = os.path.join(config.destination,
                                           "%s.%s.coverage"
                                           % (target.name, postfix))
            merged_nodes.append(
                MergeCoverageNode(input_files=files_and_nodes.keys(),
                                  output_file=output_filename,
                                  dependencies=files_and_nodes.values()))

    files_and_nodes = _aggregate_for_prefix(coverage["Libraries"], None)
    if make_summary:
        # The summary table additionally needs the raw (per-lane) tables.
        files_and_nodes = _aggregate_for_prefix(coverage["Lanes"], None,
                                                into=files_and_nodes)

    all_nodes = list(files_and_nodes.itervalues())
    all_nodes.extend(merged_nodes)

    coverage["Nodes"] = tuple(all_nodes)

    return coverage
+
+
def _build_coverage_nodes(config, target, use_label=False):
    """Builds CoverageNodes for every prefix / ROI / sample / library.

    Returns {"Lanes": {key: {filename: node}},
             "Libraries": {key: {filename: node}}}, where each key is
    (prefix label, target name, sample name, library name). "Lanes" holds
    coverage of the raw per-lane BAMs, "Libraries" of the per-library
    (duplicate-filtered) BAMs.
    """
    coverage = {"Lanes": collections.defaultdict(dict),
                "Libraries": collections.defaultdict(dict)}

    # Shared across lanes and libraries so that each (ROI, BAM) pair gets
    # exactly one CoverageNode, even if encountered more than once.
    cache = {}
    for prefix in target.prefixes:
        for (roi_name, roi_filename) in _get_roi(prefix):
            # Keys use prefix.label when use_label is set (summary tables),
            # otherwise the plain prefix name.
            prefix_label = prefix.label if use_label else prefix.name
            prefix_label = _get_prefix_label(prefix_label, roi_name)

            for sample in prefix.samples:
                for library in sample.libraries:
                    key = (prefix_label, target.name,
                           sample.name, library.name)

                    for lane in library.lanes:
                        for bams in lane.bams.values():
                            # Note: 'bams' is rebound from the raw
                            # {filename: node} dict to the resulting
                            # {coverage filename: CoverageNode} dict.
                            bams = _build_coverage_nodes_cached(config, bams,
                                                                target.name,
                                                                roi_name,
                                                                roi_filename,
                                                                cache)

                            coverage["Lanes"][key].update(bams)

                    bams = _build_coverage_nodes_cached(config, library.bams,
                                                        target.name, roi_name,
                                                        roi_filename, cache)
                    coverage["Libraries"][key].update(bams)
    return coverage
+
+
def _build_coverage_nodes_cached(config, files_and_nodes, target_name,
                                 roi_name, roi_filename, cache):
    """Builds (or reuses cached) CoverageNodes for a set of BAM files.

    Returns {coverage table filename: CoverageNode}; 'cache' is keyed by
    (roi_filename, input_filename) so each pair is processed only once.
    """
    if roi_name:
        output_ext = ".%s.coverage" % roi_name
    else:
        output_ext = ".coverage"

    coverages = {}
    for (input_filename, node) in files_and_nodes.iteritems():
        output_filename = swap_ext(input_filename, output_ext)

        cache_key = (roi_filename, input_filename)
        if cache_key not in cache:
            cache[cache_key] = CoverageNode(config=config,
                                            input_file=input_filename,
                                            output_file=output_filename,
                                            target_name=target_name,
                                            regions_file=roi_filename,
                                            dependencies=node)

        coverages[output_filename] = cache[cache_key]

    return coverages
+
+
def _get_roi(prefix, name_prefix=""):
    """Returns [("", None)] followed by (name, path) for each ROI.

    The leading ("", None) entry represents the whole genome; 'name_prefix'
    is prepended to each ROI name (e.g. "." for filename construction).
    """
    regions = [("", None)]
    for (name, path) in prefix.roi.iteritems():
        regions.append((name_prefix + name, path))
    return regions
+
+
+def _get_prefix_label(label, roi_name):
+ if not roi_name:
+ return label
+ return "%s:%s" % (label, roi_name)
diff --git a/paleomix/tools/bam_pipeline/parts/summary.py b/paleomix/tools/bam_pipeline/parts/summary.py
new file mode 100644
index 0000000..d6040db
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/summary.py
@@ -0,0 +1,463 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import re
+import sys
+import math
+import numbers
+import collections
+
+from paleomix.node import Node, NodeError
+from paleomix.common.utilities import set_in, get_in
+from paleomix.common.fileutils import move_file, reroot_path
+from paleomix.tools.bam_stats.coverage import \
+ read_table as read_coverage_table
+from paleomix.common.bedtools import BEDRecord
+
+import paleomix.common.text as text
+
+
# Sentinel frozensets stored in place of a statistics filename when only the
# lane type is known. _PE_READS/_SE_READS list the read-type names produced
# by trimming for paired-end / single-end lanes; _BAMS marks lanes without
# trimmed reads (presumably pre-aligned BAM input -- confirm with caller).
_PE_READS = frozenset(("Paired", "Singleton",
                       "Collapsed", "CollapsedTruncated"))
_SE_READS = frozenset(("Single",))
_BAMS = frozenset(())
+
+
class SummaryTableNode(Node):
    """Node that writes the '<target>.summary' statistics table.

    Aggregates read-trimming statistics (from AdapterRemoval settings
    files), raw per-lane hit counts, and unique per-library hit counts
    into a single padded text table, with derived measures (fractions,
    clonality, estimated coverage) appended per genome.
    """

    def __init__(self, config, makefile, target, cov_for_lanes, cov_for_libs, dependencies = ()):
        # Parameters:
        #   config        -- pipeline config; provides the 'destination' dir.
        #   makefile      -- parsed makefile dict ("Prefixes", "Statistics").
        #   target        -- Target part (prefixes/samples/libraries/lanes).
        #   cov_for_lanes -- {(genome, target, sample, library): filenames}
        #                    coverage tables for raw (per-lane) BAMs.
        #   cov_for_libs  -- as above, for duplicate-filtered library BAMs.
        self._target = target.name
        self._output_file = os.path.join(config.destination, self._target + ".summary")
        self._prefixes = makefile["Prefixes"]
        self._makefile = makefile["Statistics"]

        self._in_raw_bams = cov_for_lanes
        self._in_lib_bams = cov_for_libs
        input_files = set()
        input_files.update(sum(map(list, self._in_raw_bams.values()), []))
        input_files.update(sum(map(list, self._in_lib_bams.values()), []))

        # Maps (sample, library, lane) -> either the filename of the
        # trimming-statistics file, or one of the sentinel frozensets
        # (_PE_READS / _SE_READS / _BAMS) when no such file exists.
        self._in_raw_read = collections.defaultdict(list)
        for prefix in target.prefixes:
            for sample in prefix.samples:
                for library in sample.libraries:
                    for lane in library.lanes:
                        if lane.reads:
                            if lane.reads.stats:
                                value = lane.reads.stats
                                input_files.add(value)
                            elif set(lane.reads.files) & _PE_READS:
                                value = _PE_READS
                            elif set(lane.reads.files) & _SE_READS:
                                value = _SE_READS
                            else:
                                assert False
                        else:
                            value = _BAMS
                        self._in_raw_read[(sample.name, library.name, lane.name)] = value

        Node.__init__(self,
                      description = "<Summary: %s>" % self._output_file,
                      input_files = filter(None, input_files),
                      output_files = [self._output_file],
                      dependencies = dependencies)

    def _run(self, config, temp):
        """Writes the summary table to the temporary directory."""
        rois = self._stat_areas_of_interest(self._prefixes)
        genomes = self._stat_prefixes(self._prefixes)
        with open(reroot_path(temp, self._output_file), "w") as table:
            table.write("# Command:\n")
            table.write("# %s\n" % (" ".join(sys.argv)),)
            table.write("#\n")
            table.write("# Directory:\n")
            table.write("# %s\n" % (os.getcwd()),)
            table.write("#\n")
            table.write("# Makefile:\n")
            table.write("# Filename: %s\n" % (self._makefile["Filename"],))
            table.write("# SHA1Sum: %s\n" % (self._makefile["Hash"],))
            table.write("# MTime: %s\n" % (self._makefile["MTime"],))
            table.write("#\n")
            self._write_genomes(table, genomes)
            table.write("#\n")
            self._write_areas_of_interest(table, rois)
            table.write("#\n#\n")

            # Treat each ROI as an additional "genome", so that per-ROI
            # coverage is reported alongside whole-genome coverage.
            for roi in rois.itervalues():
                genomes[roi["Label"]] = {"Size" : roi["Size"]}
            self._write_tables(table, genomes)

    def _teardown(self, _config, temp):
        # Move the finished table from the temp dir to its final location.
        move_file(reroot_path(temp, self._output_file), self._output_file)

    def _write_genomes(self, table, genomes):
        """Writes the '# Genomes:' header block."""
        table.write("# Genomes:\n")
        rows = [["Name", "Label", "Contigs", "Size", "Prefix"]]
        for (_, prefix) in sorted(self._prefixes.items()):
            stats = genomes[prefix["Name"]]
            rows.append((prefix["Name"], prefix.get("Label", "-"), stats["NContigs"], stats["Size"], prefix["Path"]))

        for line in text.padded_table(rows):
            table.write("# %s\n" % (line,))

    def _write_areas_of_interest(self, table, rois):
        """Writes the '# Regions Of Interest:' header block."""
        table.write("# Regions Of Interest:\n")
        rows = [["Genome", "ROI", "Size", "NFeatures", "NIntervals", "Path"]]
        for (_, roi) in sorted(rois.items()):
            rows.append([roi[key] for key in ("Genome", "Name", "Size", "NFeatures", "NIntervals", "Path")])

        for line in text.padded_table(rows):
            table.write("# %s\n" % (line,))

    def _write_tables(self, out, genomes):
        """Writes one row group per target/sample/library combination."""
        rows = [["Target", "Sample", "Library", "Measure", "Value", "# Description"]]
        for (target, samples) in sorted(self._read_tables(self._prefixes, genomes).iteritems()):
            for (sample, libraries) in sorted(samples.iteritems()):
                for (library, prefixes) in sorted(libraries.iteritems()):
                    # The "reads" (trimming) table is always listed before
                    # the per-genome hit tables.
                    ordered = [("reads", prefixes.pop("reads"))] if "reads" in prefixes else []
                    ordered.extend(sorted(prefixes.items()))

                    for (prefix, table) in ordered:
                        # Intermediate value only; excluded from output.
                        table.pop("hits_unique_nts(%s)" % prefix, None)

                        for (key, (value, comment)) in sorted(table.iteritems(), key = _measure_ordering):
                            if isinstance(value, numbers.Number) and math.isnan(value):
                                value = "NA"
                            rows.append((target, sample, library, key, value, comment))
                        rows.append("")
                    rows.append("")

        for line in text.padded_table(rows):
            out.write("%s\n" % line)

    def _read_tables(self, prefixes, genomes):
        """Collects all statistics tables and their merged roll-ups.

        Returns {target: {sample: {library: {tblname: subtable}}}}, with
        '*' pseudo-entries holding per-sample and per-target merges.
        """
        table = {}
        self._read_reads_settings(table)
        self._read_raw_bam_stats(table)
        self._read_lib_bam_stats(table)

        for (target, samples) in table.items():
            merged_samples = {}
            for (sample, libraries) in samples.items():
                merged_libraries = {}
                for (library, subtables) in libraries.items():
                    for (tblname, subtable) in subtables.items():
                        merged_libraries[tblname] = self._merge_tables((merged_libraries.get(tblname, {}), subtable))
                        merged_samples[tblname] = self._merge_tables((merged_samples.get(tblname, {}), subtable))
                    libraries[library] = self._annotate_subtables(subtables, genomes)
                set_in(table, (target, sample, "*"), self._annotate_subtables(merged_libraries, genomes))
            set_in(table, (target, "*", "*"), self._annotate_subtables(merged_samples, genomes))

        return table

    @classmethod
    def _annotate_subtables(cls, subtables, genomes):
        """Adds derived measures (fractions, clonality, coverage, etc.)."""
        if "mitochondrial" in subtables and "nuclear" in subtables:
            subtables["endogenous"] = cls._create_endogenous_subtable(subtables, genomes)

        for (tblname, subtable) in subtables.iteritems():
            if tblname == "reads":
                fractions = [("seq_trash_se", "seq_reads_se", "seq_trash_se_frac", "# Fraction of SE reads trashed"),
                             ("seq_trash_pe_1", "seq_reads_pairs", "seq_trash_pe_1_frac", "# Fraction of PE mate 1 reads trashed"),
                             ("seq_trash_pe_2", "seq_reads_pairs", "seq_trash_pe_2_frac", "# Fraction of PE mate 2 reads trashed"),
                             ("seq_collapsed", "seq_reads_pairs", "seq_collapsed_frac", "# Fraction of PE pairs collapsed into one read"),
                             ("seq_retained_nts", "seq_retained_reads", "seq_retained_length", "# Average number of NTs in retained reads")]

                for (numerator, denominator, measure, comment) in fractions:
                    if (numerator in subtable) and (denominator in subtable):
                        value = float(subtable[numerator][0]) / subtable[denominator][0]
                        subtable[measure] = (value, comment)
            else:
                total_hits = subtable["hits_raw(%s)" % tblname][0]
                total_nts = subtable["hits_unique_nts(%s)" % tblname][0]
                total_uniq = subtable["hits_unique(%s)" % tblname][0]
                # NaN when no trimming statistics are available, which
                # propagates to the derived fractions below.
                total_reads = subtables.get("reads",{}).get("seq_retained_reads", (float("NAN"),))[0]

                # 'or float("NaN")' guards the zero-denominator cases.
                subtable["hits_raw_frac(%s)" % tblname] = (total_hits / float(total_reads), "# Total number of hits vs. total number of reads retained")
                subtable["hits_unique_frac(%s)" % tblname] = (total_uniq / float(total_reads), "# Total number of unique hits vs. total number of reads retained")
                subtable["hits_clonality(%s)" % tblname] = (1 - total_uniq / (float(total_hits) or float("NaN")), "# Fraction of hits that were PCR duplicates")
                subtable["hits_length(%s)" % tblname] = (total_nts / (float(total_uniq) or float("NaN")), "# Average number of aligned bases per unique hit")
                subtable["hits_coverage(%s)" % tblname] = (total_nts / float(genomes[tblname]["Size"]), "# Estimated coverage from unique hits")

        return subtables

    @classmethod
    def _create_endogenous_subtable(self, subtables, genomes):
        """Builds the combined nuclear+mitochondrial ("endogenous") table."""
        # NOTE(review): declared @classmethod but the first parameter is
        # named 'self'; harmless, though 'cls' would match the siblings.
        nucl = subtables["nuclear"]
        mito = subtables["mitochondrial"]

        total_hits = mito["hits_raw(mitochondrial)"][0] + nucl["hits_raw(nuclear)"][0]
        total_hits_unique = mito["hits_unique(mitochondrial)"][0] + nucl["hits_unique(nuclear)"][0]
        total_hits_unique_nts = mito["hits_unique_nts(mitochondrial)"][0] + nucl["hits_unique_nts(nuclear)"][0]

        # Ratios are reported as "NA" when there are no mitochondrial hits.
        ratio_hits, ratio_genome, ratio_genome_inv = "NA", "NA", "NA"
        if mito["hits_unique(mitochondrial)"][0]:
            ratio_nts = float(nucl["hits_unique_nts(nuclear)"][0]) / mito["hits_unique_nts(mitochondrial)"][0]
            ratio_hits = float(nucl["hits_unique(nuclear)"][0]) / mito["hits_unique(mitochondrial)"][0]
            # Nuclear size is doubled (presumably to account for the diploid
            # nuclear vs. haploid mitochondrial genome -- confirm).
            ratio_genome = ratio_nts / ((float(genomes["nuclear"]["Size"]) * 2) / float(genomes["mitochondrial"]["Size"]))
            ratio_genome_inv = ratio_genome ** -1

        return {
            "hits_raw(endogenous)" : (total_hits, "# Total number of hits against the nuclear and mitochondrial genome"),
            "hits_unique(endogenous)" : (total_hits_unique, "# Total number of unique reads (PCR duplicates removed)"),
            "hits_unique_nts(endogenous)" : (total_hits_unique_nts, None),
            "ratio_reads(nuc,mito)" : (ratio_hits, "# Ratio of unique hits: Hits(nuc) / H(mito)"),
            "ratio_genome(nuc,mito)" : (ratio_genome, "# Ratio of NTs of unique hits corrected by genome sizes: (NTs(nuc) / NTs(mito)) / ((2 * Size(nuc)) / Size(mito))"),
            "ratio_genome(mito,nuc)" : (ratio_genome_inv, "# Ratio of NTs of unique hits corrected by genome sizes: (NTs(mito) / NTs(nuc)) / (Size(mito) / (2 * Size(nuc)))")
            }

    def _read_reads_settings(self, table):
        """Reads per-lane trimming statistics into 'table', then merges
        each library's lanes into a single "reads" subtable."""
        for ((sample, library, barcode), filename) in self._in_raw_read.iteritems():
            key = (self._target, sample, library, "reads", barcode)
            set_in(table, key, self._stat_read_settings(filename))

        for (target, samples) in table.iteritems():
            for (sample, libraries) in samples.iteritems():
                for (library, prefixes) in libraries.iteritems():
                    prefixes["reads"] = self._merge_tables(prefixes["reads"].values())

        return table

    def _read_raw_bam_stats(self, table):
        """Records raw (pre-rmdup) hit counts per genome/library."""
        for ((genome, target, sample, library), filenames) in self._in_raw_bams.iteritems():
            key = (target, sample, library)
            hits, _ = self._read_coverage_tables(key, filenames)

            value = (hits, "# Total number of hits (prior to PCR duplicate filtering)")
            set_in(table, (target, sample, library, genome, "hits_raw(%s)" % genome), value)

    def _read_lib_bam_stats(self, table):
        """Records unique (post-rmdup) hit and NT counts per genome/library."""
        for ((genome, target, sample, library), filenames) in self._in_lib_bams.iteritems():
            key = (target, sample, library)
            hits, nts = self._read_coverage_tables(key, filenames)

            value = (hits, "# Total number of hits (excluding any PCR duplicates)")
            set_in(table, (target, sample, library, genome, "hits_unique(%s)" % genome), value)
            set_in(table, (target, sample, library, genome, "hits_unique_nts(%s)" % genome), (nts, None))

    @classmethod
    def _read_coverage_tables(cls, key, filenames):
        """Sums hits and aligned NTs ('M') for 'key' across coverage tables.

        Raises NodeError if a table lacks the expected row, which indicates
        a stale table from before files were renamed.
        """
        hits = nts = 0
        for filename in filenames:
            subtable = {}
            read_coverage_table(subtable, filename)
            contigtables = get_in(subtable, key)

            if contigtables is None:
                raise NodeError("Error reading table %r; row not found:"
                                "\n %s ...\n\nIf files have been renamed "
                                "during the run, then please remove this file "
                                "in that it may be re-generated.\nHowever, "
                                "note that read-group tags in the BAM files "
                                "may not be correct!"
                                % (filename, " ".join(key)))

            for contigtable in contigtables.itervalues():
                hits += contigtable["Hits"]
                nts += contigtable["M"]
        return hits, nts

    @classmethod
    def _merge_tables(cls, tables):
        """Merges measure tables: numeric values are summed; non-numeric
        values collapse to "*" when they differ between tables."""
        merged = {}
        for table in tables:
            for (measure, (value, comment)) in table.iteritems():
                if not isinstance(value, numbers.Number):
                    other, _ = merged.get(measure, (value, None))
                    merged[measure] = (value if (value == other) else "*", comment)
                else:
                    other, _ = merged.get(measure, (0, None))
                    merged[measure] = (value + other, comment)
        return merged

    @classmethod
    def _stat_read_settings(cls, filename):
        """Returns the trimming-statistics subtable for one lane.

        'filename' is either an AdapterRemoval settings file to parse, or
        one of the sentinel frozensets (_SE_READS / _PE_READS / _BAMS)
        when only the lane type is known; then all counts are NaN.
        """
        if isinstance(filename, frozenset):
            if (filename == _SE_READS):
                return {
                    "lib_type" : ("SE", "# SE, PE, or * (for both)"),
                    "seq_reads_se" : (float("nan"), "# Total number of single-ended reads"),
                    "seq_trash_se" : (float("nan"), "# Total number of trashed reads"),
                    "seq_retained_nts" : (float("nan"), "# Total number of NTs in retained reads"),
                    "seq_retained_reads" : (float("nan"), "# Total number of retained reads"),
                    }
            elif (filename == _PE_READS):
                return {
                    "lib_type" : ("PE", "# SE, PE, or * (for both)"),
                    "seq_reads_pairs" : (float("nan"), "# Total number of reads"),
                    "seq_trash_pe_1" : (float("nan"), "# Total number of reads"),
                    "seq_trash_pe_2" : (float("nan"), "# Total number of reads"),
                    "seq_retained_nts" : (float("nan"), "# Total number of NTs in retained reads"),
                    "seq_retained_reads" : (float("nan"), "# Total number of retained reads"),
                    "seq_collapsed" : (float("nan"), "# Total number of pairs collapsed into one read"),
                    }
            else:
                # _BAMS: lane type unknown; report both SE and PE measures.
                return {
                    "lib_type" : ("*", "# SE, PE, or * (for both)"),
                    "seq_reads_se" : (float("nan"), "# Total number of single-ended reads"),
                    "seq_trash_se" : (float("nan"), "# Total number of trashed reads"),
                    "seq_reads_pairs" : (float("nan"), "# Total number of reads"),
                    "seq_trash_pe_1" : (float("nan"), "# Total number of reads"),
                    "seq_trash_pe_2" : (float("nan"), "# Total number of reads"),
                    "seq_retained_nts" : (float("nan"), "# Total number of NTs in retained reads"),
                    "seq_retained_reads" : (float("nan"), "# Total number of retained reads"),
                    "seq_collapsed" : (float("nan"), "# Total number of pairs collapsed into one read"),
                    }

        with open(filename) as settings_file:
            settings = settings_file.read()

        def _re_search(regexp, default = None):
            # Extracts a single integer from the settings text; raises
            # KeyError unless a default is given.
            match = re.search(regexp, settings)
            if not match:
                if default is not None:
                    return default
                raise KeyError("Could not find match with RegExp %s in file '%s'" \
                               % (repr(regexp), filename))

            return int(match.groups()[0])

        if "Paired end mode" in settings or "paired-end reads" in settings:
            return {
                "lib_type" : ("PE", "# SE, PE, or * (for both)"),
                "seq_reads_pairs" : (_re_search("number of read pairs: ([0-9]+)"), "# Total number of pairs"),
                "seq_trash_pe_1" : (_re_search("discarded mate 1 reads: ([0-9]+)"), "# Total number of reads"),
                "seq_trash_pe_2" : (_re_search("discarded mate 2 reads: ([0-9]+)"), "# Total number of reads"),
                "seq_retained_nts" : (_re_search("retained nucleotides: ([0-9]+)"), "# Total number of NTs in retained reads"),
                "seq_retained_reads" : (_re_search("retained reads: ([0-9]+)"), "# Total number of retained reads"),
                "seq_collapsed" : (_re_search("of (?:full-length )?collapsed pairs: ([0-9]+)", 0) + \
                                   _re_search("of truncated collapsed pairs: ([0-9]+)", 0),
                                   "# Total number of pairs collapsed into one read"),
                }
        elif "Single end mode" in settings or "single-end reads" in settings:
            return {
                "lib_type" : ("SE", "# SE, PE, or * (for both)"),
                "seq_reads_se" : (_re_search("number of (?:reads|read pairs): ([0-9]+)"), "# Total number of single-ended reads"),
                "seq_trash_se" : (_re_search("discarded mate 1 reads: ([0-9]+)"), "# Total number of trashed reads"),
                "seq_retained_nts" : (_re_search("retained nucleotides: ([0-9]+)"), "# Total number of NTs in retained reads"),
                "seq_retained_reads" : (_re_search("retained reads: ([0-9]+)"), "# Total number of retained reads"),
                }
        else:
            assert False, filename

    @classmethod
    def _stat_areas_of_interest(cls, prefixes):
        """Returns (size, number of named intervals, total number of intervals)
        for a set of areas of interest."""
        areas_of_interest = {}
        for (prefix_name, prefix) in prefixes.iteritems():
            prefix_label = prefix.get("Label", prefix_name)
            for (roi_name, roi_filename) in prefix.get("RegionsOfInterest", {}).iteritems():
                count, names, size = 0, set(), 0
                with open(roi_filename) as handle:
                    for line in handle:
                        bed = BEDRecord(line)
                        # Unnamed records are counted under "<contig>*".
                        names.add(bed.name if len(bed) >= 4 else (bed.contig + "*"))
                        size += (bed.end - bed.start)
                        count += 1
                areas_of_interest[(prefix_name, roi_name)] = {"Size" : size,
                                                             "NFeatures" : len(names),
                                                             "NIntervals" : count,
                                                             "Genome" : prefix["Name"],
                                                             "Name" : roi_name,
                                                             "Label" : "%s:%s" % (prefix_label, roi_name),
                                                             "Path" : roi_filename}
        return areas_of_interest

    @classmethod
    def _stat_prefixes(cls, prefixes):
        """Returns (size, number of contigs) for a set of BWA prefix."""
        genomes = {}
        for prefix in prefixes:
            # Contig lengths are taken from the FASTA index (.fai) file.
            with open(prefixes[prefix]["Reference"] + ".fai") as table:
                lengths = [int(line.split()[1]) for line in table]

            labels = [prefix]
            if "Label" in prefixes[prefix]:
                labels.append(prefixes[prefix].get("Label"))

            # Statistics are accumulated both under the prefix name and its
            # label (if any); labels may be shared between prefixes.
            for label in labels:
                if label not in genomes:
                    genomes[label] = {"Size" : 0, "NContigs" : 0}

                statistics = genomes[label]
                statistics["Size"] += sum(lengths)
                statistics["NContigs"] += len(lengths)

        if "mitochondrial" in genomes and "nuclear" in genomes:
            nucl = genomes["nuclear"]
            mito = genomes["mitochondrial"]

            genomes["endogenous"] = {"Size" : nucl["Size"] + mito["Size"],
                                     "NContigs" : nucl["NContigs"] + mito["NContigs"]}

        return genomes
+
+
+
def _measure_ordering(pair):
    """Sort key for (measure, value) pairs: predefined rank, then name.

    The rank is looked up by the measure name with any "(genome)" suffix
    stripped; unknown measures raise KeyError.
    """
    measure = pair[0]
    base_name = measure.split("(")[0]
    return (__ORDERING[base_name], measure)
+
+
# Rank of each measure class (name before any "(genome)" suffix) in the
# summary table output; used as the primary sort key by _measure_ordering.
__ORDERING = {
    "lib_type" : 00,
    "seq_reads_se" : 10,
    "seq_trash_se" : 20,
    "seq_trash_se_frac" : 30,
    "seq_reads_pairs" : 40,
    "seq_trash_pe_1" : 50,
    "seq_trash_pe_1_frac" : 60,
    "seq_trash_pe_2" : 70,
    "seq_trash_pe_2_frac" : 80,
    "seq_collapsed" : 90,
    "seq_collapsed_frac" : 100,
    "seq_retained_reads" : 110,
    "seq_retained_nts" : 120,
    "seq_retained_length" : 130,

    "hits_raw" : 140,
    "hits_raw_frac" : 150,
    "hits_clonality" : 160,
    "hits_unique" : 170,
    "hits_unique_frac" : 180,
    "hits_coverage" : 190,
    "hits_length" : 200,
    "ratio_reads" : 210,
    "ratio_genome" : 220,
    }
diff --git a/paleomix/tools/bam_pipeline/parts/target.py b/paleomix/tools/bam_pipeline/parts/target.py
new file mode 100644
index 0000000..3b44878
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/parts/target.py
@@ -0,0 +1,35 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from paleomix.common.utilities import safe_coerce_to_tuple
+
+
class Target:
    """A single pipeline target; aggregates its prefixes.

    Exposes the combined BAM files ('bams') and dependency nodes
    ('nodes') of all prefixes.
    """

    def __init__(self, config, prefixes, name):
        # 'config' is accepted for interface symmetry with the other
        # pipeline "parts" classes; it is not stored.
        self.name = name
        self.prefixes = safe_coerce_to_tuple(prefixes)

        self.bams = {}
        self.nodes = []
        for prefix in self.prefixes:
            self.nodes.extend(prefix.nodes)
            # Map of output BAM filename -> generating node (Python 2 dict).
            self.bams.update(prefix.bams.iteritems())
diff --git a/paleomix/tools/bam_pipeline/paths.py b/paleomix/tools/bam_pipeline/paths.py
new file mode 100644
index 0000000..a685f7d
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/paths.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import re
+import glob
+
+from paleomix.common.makefile import \
+ MakefileError
+
+
def is_paired_end(template):
    """Returns True if 'template' contains a '{Pair}' substitution field.

    Substituting Pair=1 changes such templates, which is how paired-end
    path templates are distinguished from single-end paths.
    """
    return template.format(Pair=1) != template
+
+
def collect_files(path, template):
    """Collects the input files described by 'template'.

    Returns {"PE_1": [...], "PE_2": [...]} for paired-end templates and
    {"SE": [...]} otherwise. Glob wildcards in the template are expanded;
    a MakefileError is raised if globbing yields no files, or unequal
    numbers of mate 1 and mate 2 files. 'path' identifies the makefile
    location for error messages.
    """
    if not is_paired_end(template):
        if not _has_glob_magic(template):
            return {"SE": [template]}

        filenames = _sorted_glob(template)
        if not filenames:
            _raise_missing_files("single-end", path, template)
        return {"SE": filenames}

    if not _has_glob_magic(template):
        return {"PE_1": [template.format(Pair=1)],
                "PE_2": [template.format(Pair=2)]}

    result = {"PE_1": _sorted_glob(template.format(Pair=1)),
              "PE_2": _sorted_glob(template.format(Pair=2))}

    if not (result["PE_1"] or result["PE_2"]):
        _raise_missing_files("paired-end", path, template)
    elif len(result["PE_1"]) != len(result["PE_2"]):
        raise MakefileError("Unequal number of mate 1 and mate 2 "
                            "files found at path %r; found %i mate 1 "
                            "files, and %i mate 2 files; specified in "
                            "makefile at %r. Please verify that the "
                            "path is correct, and update the makefile!"
                            % (template,
                               len(result["PE_1"]),
                               len(result["PE_2"]),
                               " :: ".join(path)))

    return result
+
+
def _has_glob_magic(filename):
    """Returns True if 'filename' contains glob wildcards (*, ?, or [)."""
    return bool(_GLOB_MAGIC.search(filename))
+
+
+def _sorted_glob(tmpl):
+ return list(sorted(glob.iglob(tmpl)))
+
+
def _raise_missing_files(description, path, template):
    """Raises MakefileError reporting that no 'description' (single-end /
    paired-end) reads were found for the glob 'template'."""
    message = ("No files found for %s reads using path %r; "
               "specified in makefile at %r. Please verify that the "
               "path is correct, and update the makefile!"
               % (description, template, " :: ".join(path)))
    raise MakefileError(message)


# Characters that trigger glob expansion (cf. glob / fnmatch).
_GLOB_MAGIC = re.compile('[*?[]')
diff --git a/paleomix/tools/bam_pipeline/pipeline.py b/paleomix/tools/bam_pipeline/pipeline.py
new file mode 100755
index 0000000..59a9928
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/pipeline.py
@@ -0,0 +1,305 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import time
+import logging
+
+import paleomix
+import paleomix.logger
+import paleomix.resources
+import paleomix.yaml
+
+from paleomix.common.console import \
+ print_err, \
+ print_info
+
+from paleomix.pipeline import \
+ Pypeline
+from paleomix.nodes.picard import \
+ BuildSequenceDictNode
+from paleomix.nodes.samtools import \
+ FastaIndexNode
+from paleomix.nodes.bwa import \
+ BWAIndexNode
+from paleomix.nodes.bowtie2 import \
+ Bowtie2IndexNode
+from paleomix.nodes.validation import \
+ ValidateFASTAFilesNode
+
+from paleomix.tools.bam_pipeline.makefile import \
+ MakefileError, \
+ read_makefiles
+
+from paleomix.tools.bam_pipeline.parts import \
+ Reads
+
+import paleomix.tools.bam_pipeline.parts as parts
+import paleomix.tools.bam_pipeline.config as bam_config
+import paleomix.tools.bam_pipeline.mkfile as bam_mkfile
+
+
def build_pipeline_trimming(config, makefile):
    """Builds only the nodes required to produce trimmed reads.
    This reduces the required complexity of the makefile to a minimum."""

    nodes = []
    for samples in makefile["Targets"].itervalues():
        # Progress indicator; one dot per target
        print_info(".", end='')

        for libraries in samples.itervalues():
            for barcodes in libraries.itervalues():
                for record in barcodes.itervalues():
                    # Pre-trimmed BAM input has no trimming nodes
                    if record["Type"] not in ("Raw", "Trimmed"):
                        continue

                    offset = record["Options"]["QualityOffset"]
                    nodes.extend(Reads(config, record, offset).nodes)

    return nodes
+
+
def build_pipeline_full(config, makefile, return_nodes=True):
    """Builds the complete BAM pipeline for a single makefile.

    Returns a list of nodes if 'return_nodes' is True, and a list of
    parts.Target objects otherwise.  The tree of pipeline parts is built
    bottom-up (lanes -> libraries -> samples -> prefixes -> targets), and
    empty levels are pruned as it is built.
    """
    result = []
    features = makefile["Options"]["Features"]
    for (target_name, sample_records) in makefile["Targets"].iteritems():
        # Progress indicator; one dot per target
        print_info(".", end='')

        prefixes = []
        for (_, prefix) in makefile["Prefixes"].iteritems():
            samples = []
            for (sample_name, library_records) in sample_records.iteritems():
                libraries = []
                for (library_name, barcode_records) in library_records.iteritems():
                    lanes = []
                    for (barcode, record) in barcode_records.iteritems():
                        lane = parts.Lane(config, prefix, record, barcode)

                        # ExcludeReads settings may exclude entire lanes
                        if lane.bams:
                            lanes.append(lane)

                    if lanes:
                        libraries.append(parts.Library(config, target_name, prefix, lanes, library_name))

                if libraries:
                    samples.append(parts.Sample(config, prefix, libraries, sample_name))

            if samples:
                prefixes.append(parts.Prefix(config, prefix, samples, features, target_name))

        if prefixes:
            target = parts.Target(config, prefixes, target_name)

            # Construct coverage, depth-histogram, and summary nodes, etc.
            parts.add_statistics_nodes(config, makefile, target)

            if return_nodes:
                # Extra tasks (e.g. coverage, depth-histograms, etc.)
                result.extend(target.nodes)
                # Output BAM files (raw, realigned)
                result.extend(target.bams.itervalues())
            else:
                result.append(target)

    return result
+
+
def index_references(config, makefiles):
    """Adds FASTA validation and indexing nodes for every unique reference.

    For each prefix in each makefile, sets the keys "Nodes" (validation,
    'samtools faidx', and sequence-dictionary nodes), "Nodes:BWA", and
    "Nodes:Bowtie2" (the same nodes plus the respective aligner index).
    References shared between prefixes / makefiles share a single set of
    nodes, so each index is only built once.
    """
    references = {}
    references_bwa = {}
    references_bowtie2 = {}
    for makefile in makefiles:
        for subdd in makefile["Prefixes"].itervalues():
            reference = subdd["Reference"]
            if reference not in references:
                # Validation of the FASTA file; not blocking for the other
                # steps, as it is only expected to fail very rarely, but will
                # block subsequent analyses depending on the FASTA.
                valid_node = ValidateFASTAFilesNode(input_files=reference,
                                                    output_file=reference +
                                                    ".validated")
                # Indexing of FASTA file using 'samtools faidx'
                faidx_node = FastaIndexNode(reference)
                # Indexing of FASTA file using 'BuildSequenceDictionary.jar'
                dict_node = BuildSequenceDictNode(config=config,
                                                  reference=reference,
                                                  dependencies=(valid_node,))

                # Indexing of FASTA file using 'bwa index'
                bwa_node = BWAIndexNode(input_file=reference,
                                        dependencies=(valid_node,))
                # Indexing of FASTA file using 'bowtie2-build'
                bowtie2_node = Bowtie2IndexNode(input_file=reference,
                                                dependencies=(valid_node,))

                references[reference] = (valid_node, faidx_node, dict_node)
                references_bwa[reference] = (valid_node, faidx_node,
                                             dict_node, bwa_node)
                references_bowtie2[reference] = (valid_node, faidx_node,
                                                 dict_node, bowtie2_node)

            subdd["Nodes"] = references[reference]
            subdd["Nodes:BWA"] = references_bwa[reference]
            subdd["Nodes:Bowtie2"] = references_bowtie2[reference]
+
+
+def run(config, args, pipeline_variant):
+ if pipeline_variant not in ("bam", "trim"):
+ raise ValueError("Unexpected BAM pipeline variant (%r)"
+ % (pipeline_variant,))
+
+ if not os.path.exists(config.temp_root):
+ try:
+ os.makedirs(config.temp_root)
+ except OSError, error:
+ print_err("ERROR: Could not create temp root:\n\t%s" % (error,))
+ return 1
+
+ if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
+ print_err("ERROR: Insufficient permissions for temp root: '%s'"
+ % (config.temp_root,))
+ return 1
+
+ # Init worker-threads before reading in any more data
+ pipeline = Pypeline(config)
+
+ try:
+ print_info("Reading makefiles ...")
+ makefiles = read_makefiles(config, args, pipeline_variant)
+ except (MakefileError, paleomix.yaml.YAMLError, IOError), error:
+ print_err("Error reading makefiles:",
+ "\n %s:\n " % (error.__class__.__name__,),
+ "\n ".join(str(error).split("\n")))
+ return 1
+
+ logfile_template = time.strftime("bam_pipeline.%Y%m%d_%H%M%S_%%02i.log")
+ paleomix.logger.initialize(config, logfile_template)
+ logger = logging.getLogger(__name__)
+
+ pipeline_func = build_pipeline_trimming
+ if pipeline_variant == "bam":
+ # Build .fai files for reference .fasta files
+ index_references(config, makefiles)
+
+ pipeline_func = build_pipeline_full
+
+ print_info("Building BAM pipeline ", end='')
+ for makefile in makefiles:
+ # If a destination is not specified, save results in same folder as the
+ # makefile
+ filename = makefile["Statistics"]["Filename"]
+ old_destination = config.destination
+ if old_destination is None:
+ config.destination = os.path.dirname(filename)
+
+ try:
+ nodes = pipeline_func(config, makefile)
+ except paleomix.node.NodeError, error:
+ logger.error("Error while building pipeline for '%s':\n%s",
+ filename, error)
+ return 1
+
+ config.destination = old_destination
+
+ pipeline.add_nodes(*nodes)
+
+ print_info("")
+
+ if config.list_input_files:
+ logger.info("Printing output files ...")
+ pipeline.print_input_files()
+ return 0
+ elif config.list_output_files:
+ logger.info("Printing output files ...")
+ pipeline.print_output_files()
+ return 0
+ elif config.list_executables:
+ logger.info("Printing required executables ...")
+ pipeline.print_required_executables()
+ return 0
+ elif config.dot_file:
+ logger.info("Writing dependency graph to %r ...", config.dot_file)
+ if not pipeline.to_dot(config.dot_file):
+ return 1
+ return 0
+
+ logger.info("Running BAM pipeline ...")
+ if not pipeline.run(dry_run=config.dry_run,
+ max_threads=config.max_threads,
+ progress_ui=config.progress_ui):
+ return 1
+
+ return 0
+
+
def _print_usage(pipeline):
    """Print the command overview for the BAM / trimming pipeline CLI."""
    basename = "%s_pipeline" % (pipeline,)
    padding = " " * len(basename)

    usage_lines = [
        " -- %s help -- Display this message" % basename,
        " -- %s example [...] -- Create example project in folder." % basename,
        " -- %s makefile [...] -- Print makefile template." % basename,
        " -- %s dryrun [...] -- Perform dry run of pipeline on provided makefiles." % basename,
        " %s Equivalent to 'bam_pipeline run --dry-run [...]'." % (padding,),
        " -- %s run [...] -- Run pipeline on provided makefiles." % basename,
        " -- %s remap [...] -- Re-map hits from previous alignment." % basename,
    ]

    print_info("BAM Pipeline v%s\n" % (paleomix.__version__,))
    print_info("Usage:")
    for line in usage_lines:
        print_info(line)
+
+
def main(argv, pipeline="bam"):
    """Entry point for the 'bam_pipeline' / 'trim_pipeline' front-ends.

    Dispatches on the first positional command in 'argv' and returns an
    exit code; unknown commands print the usage message.
    """
    assert pipeline in ("bam", "trim"), pipeline

    commands = ("makefile", "mkfile", "run",
                "dry_run", "dry-run", "dryrun",
                "remap", "example", "examples")

    if not argv or (argv[0] == "help"):
        _print_usage(pipeline)
        return 0
    elif argv[0] not in commands:
        _print_usage(pipeline)
        return 1
    elif argv[0] in ("mkfile", "makefile"):
        return bam_mkfile.main(argv[1:], pipeline=pipeline)
    elif argv[0] in ("remap", "remap_prefix"):
        # NOTE(review): "remap_prefix" is not listed in 'commands' above, so
        # only "remap" can reach this branch -- verify intent.
        # Import here to avoid circular dependency issues
        import paleomix.tools.bam_pipeline.remap as bam_remap

        return bam_remap.main(argv[1:])
    elif argv[0] in ("example", "examples"):
        return paleomix.resources.copy_example("bam_pipeline", argv[1:])

    # Remaining commands ("run" and the dry-run variants) are handled by
    # the full pipeline, with options parsed from the command-line / config
    try:
        config, args = bam_config.parse_config(argv, pipeline)

        if not args[1:]:
            print_err("Please specify at least one makefile!")
            print_err("Use --help for more information.")
            return 1
        elif args and args[0].startswith("dry"):
            config.dry_run = True
    except bam_config.ConfigError, error:
        print_err(error)
        return 1

    return run(config, args[1:], pipeline_variant=pipeline)
diff --git a/paleomix/tools/bam_pipeline/remap.py b/paleomix/tools/bam_pipeline/remap.py
new file mode 100644
index 0000000..e81926f
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/remap.py
@@ -0,0 +1,291 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import os
+import sys
+import bz2
+import pysam
+
+from optparse import OptionParser
+
+import paleomix.tools.bam_pipeline.config as bam_cfg
+import paleomix.tools.bam_pipeline.mkfile as bam_mkfile
+import paleomix.tools.bam_pipeline.pipeline as bam_pipeline
+from paleomix.common.fileutils import make_dirs, reroot_path, add_postfix
+from paleomix.common.sequences import reverse_complement
+from paleomix.common.utilities import set_in, get_in
+
+
+# Indentation used when printing makefile
+_INDENTATION = " " * 4
+
+
def samefile(fname_a, fname_b):
    """True if both paths exist and refer to the same file; unlike
    os.path.samefile, missing paths yield False instead of an error."""
    if os.path.exists(fname_a) and os.path.exists(fname_b):
        return os.path.samefile(fname_a, fname_b)
    return False
+
+
class ReadSink(object):
    """Writes BAM records to a single bzip2-compressed FASTQ file.

    Opened sinks are cached by filename (see 'open'), ensuring that lanes
    containing both PE (potentialy with orphaned reads) and SE reads are
    all collected in the same set of files.
    """

    # Cache of opened file-handles, indexed by output filename
    _cache = {}

    def __init__(self, filename, handle):
        """See ReadSink.open"""
        self.filename = filename
        self._handle = handle

    def close(self):
        self._handle.close()

    def write_records(self, records):
        """Write each record in FASTQ format; reverse-strand hits are
        reverse-complemented so reads are emitted in original orientation."""
        write = self._handle.write
        for record in records:
            sequence = record.seq
            qualities = record.qual
            if record.is_reverse:
                sequence = reverse_complement(sequence)
                qualities = qualities[::-1]

            assert len(qualities) == len(sequence), record.qname
            write("@%s\n" % (record.qname,))
            write("%s\n" % (sequence,))
            write("+\n")
            write("%s\n" % (qualities,))

    @classmethod
    def get_filename(cls, destination, prefix):
        return os.path.join(destination, "%s.fastq.bz2" % (prefix,))

    @classmethod
    def open(cls, prefix, filename):
        """Return a (possibly cached) sink writing to 'filename' under 'prefix'."""
        sink = cls._cache.get(filename)
        if sink is None:
            handle = bz2.BZ2File(os.path.join(prefix, filename), "w")
            sink = cls._cache[filename] = ReadSink(filename, handle)

        return sink

    @classmethod
    def close_all_sinks(cls):
        for sink in cls._cache.itervalues():
            sink.close()
        cls._cache.clear()
+
+
class PEReadSink(ReadSink):
    # Collects paired-end records: complete pairs go to the "paired.1" /
    # "paired.2" sinks, while orphaned mates fall back to the "singleton"
    # sink.  The underlying single-file sinks are shared via ReadSink.open.
    def __init__(self, prefix, destination):
        """See PEReadSink.open"""
        ReadSink.__init__(self, self.get_filename(destination, "paired.{Pair}"), None)
        self._sink_se = ReadSink.open(prefix, self.get_filename(destination, "singleton"))
        self._sink_pe_1 = ReadSink.open(prefix, self.get_filename(destination, "paired.1"))
        self._sink_pe_2 = ReadSink.open(prefix, self.get_filename(destination, "paired.2"))

    def write_records(self, records):
        # Group records by query name and mate number (0 when neither
        # is_read1 nor is_read2 is set)
        record_cache = {}
        for record in records:
            num = 0
            if record.is_read1:
                num = 1
            elif record.is_read2:
                num = 2
            set_in(record_cache, (record.qname, num), record)

        for pair in record_cache.itervalues():
            # Only write complete pairs
            if (1 in pair) and (2 in pair):
                self._sink_pe_1.write_records([pair.pop(1)])
                self._sink_pe_2.write_records([pair.pop(2)])

            # Any orphan files are written to the SE sink
            for record in pair.itervalues():
                self._sink_se.write_records([record])

    @classmethod
    def open(cls, prefix, destination):
        # Unlike ReadSink.open, the PE wrapper itself is not cached; only
        # the per-file sinks it opens are.
        return PEReadSink(prefix, destination)
+
+
def convert_reads(config, destination, record, sink_cache):
    """Writes the reads of a single lane ('record') back to FASTQ files
    located under 'destination'.

    'sink_cache' maps (lane-name, reads-type) keys to opened ReadSink
    objects, and is shared across calls so that lanes split on input are
    re-merged into the same output files.
    """
    # Source name is used, to re-merge split lanes
    name = record.tags.get("PU_src")
    destination = os.path.join(destination, name)
    make_dirs(os.path.join(config.destination, destination))

    def _open_se_sink(reads_type):
        # Returns the cache key for a single-end sink, opening it on demand
        key = (name, reads_type)
        if not get_in(sink_cache, key):
            filename = ReadSink.get_filename(destination, reads_type.lower())
            set_in(sink_cache, key, ReadSink.open(config.destination, filename))
        return key

    for (reads_type, bam_files) in record.bams.iteritems():
        # Processed reads are pre-aligned BAMs which have been cleaned up
        if reads_type in ("Paired", "Processed"):
            # Record "Single" reads; these may result from orphan SE reads
            _open_se_sink("Singleton")

            key = (name, "Paired")
            if not get_in(sink_cache, key):
                set_in(sink_cache, key, PEReadSink.open(config.destination,
                                                        destination))
        else:
            key = _open_se_sink(reads_type)

        sink = get_in(sink_cache, key)
        for filename in bam_files:
            print("%sProcessing file %r" % (_INDENTATION * 4, filename))
            with pysam.Samfile(filename) as handle:
                def _keep_record(record):
                    # NOTE(review): 'record.qual' is the quality string,
                    # compared here against the integer --min-quality;
                    # presumably record.mapq was intended -- verify.
                    return (record.qual >= config.min_quality) and \
                        (len(record.seq) >= config.min_length)

                sink.write_records(record for record in handle
                                   if _keep_record(record))
+
+
def parse_options(argv):
    """Parse command-line options for 'bam_pipeline remap'.

    Returns (config, makefiles) on success; if fewer than two positional
    arguments (prefix + at least one makefile) were given, prints the
    usage message and returns (None, None)."""
    usage_tmpl = "%s <Prefix> <Makefile> [<Makefile>, ...]"

    parser = OptionParser(usage_tmpl % "bam_pipeline remap")
    parser.add_option("--destination",
                      default="remapping", dest="destination",
                      help="Destination for resulting files [%default]")
    parser.add_option("--output-name-postfix",
                      default="_remapping", dest="postfix",
                      help="Postfix added to filenames/target names of "
                      "generated files [%default]")
    parser.add_option("--min-quality", default=0, type=int,
                      help="Minimum quality of hits to include in output "
                      "[%default]")
    parser.add_option("--min-length", default=0, type=int,
                      help="Minimum length of hits to include in output "
                      "[%default]")

    config, args = parser.parse_args(argv)
    if len(args) < 2:
        parser.print_usage()
        return None, None

    # First positional argument is the prefix to re-map against
    config.prefix = args[0]

    return config, args[1:]
+
+
def main(argv):
    """Entry point for 'bam_pipeline remap'.

    Writes the reads of existing targets back to FASTQ files (under
    --destination) and generates a new makefile referring to them, for
    re-mapping against the prefix named on the command-line.  Returns an
    exit code.
    """
    config, args = parse_options(argv)
    if config is None:
        return 1

    # Get default options for bam_pipeline
    bam_config, _ = bam_cfg.parse_config(args, "bam")
    makefiles = bam_pipeline.read_makefiles(bam_config, args)
    # Build .fai files for reference .fasta files
    bam_pipeline.index_references(bam_config, makefiles)

    for makefile in makefiles:
        mkfile_fname = makefile["Statistics"]["Filename"]
        bam_config.destination = os.path.dirname(mkfile_fname)
        # Targets are returned as parts.Target objects, not nodes
        tasks = bam_pipeline.build_pipeline_full(bam_config, makefile,
                                                 return_nodes=False)

        make_dirs(config.destination)
        makefile_name = add_postfix(makefile["Statistics"]["Filename"],
                                    config.postfix)
        makefile_path = reroot_path(config.destination, makefile_name)
        # Refuse to clobber the makefile the reads were described by
        if samefile(makefile["Statistics"]["Filename"], makefile_path):
            sys.stderr.write("ERROR: Would overwrite source makefile at %r\n" % (makefile_path,))
            sys.stderr.write(" Please set --destination and/or --output-name-postfix\n")
            sys.stderr.write(" before continuing.\n")
            return 1

        print("Writing makefile", makefile_path)

        found_prefix = False
        for prefix in makefile["Prefixes"]:
            if prefix != config.prefix:
                print("%sSkipping %s" % (_INDENTATION, prefix))
            else:
                found_prefix = True

        if not found_prefix:
            sys.stderr.write("\nERROR:\n")
            sys.stderr.write("Could not find prefix %r in %r! Aborting ...\n"
                             % (config.prefix, mkfile_fname))
            return 1

        with open(makefile_path, "w") as makefile_handle:
            # Start from the makefile template, minus the sample section
            template = bam_mkfile.build_makefile(add_sample_tmpl=False)
            makefile_handle.write(template)
            makefile_handle.write("\n" * 3)

            for target in tasks:
                target_name = add_postfix(target.name, config.postfix)
                print("%sTarget: %s -> %s" % (_INDENTATION,
                                              target.name,
                                              target_name))

                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 0,
                                                     target_name))
                for prefix in target.prefixes:
                    if prefix.name != config.prefix:
                        continue

                    for sample in prefix.samples:
                        print("%sSample: %s" % (_INDENTATION * 2, sample.name))

                        makefile_handle.write('%s"%s":\n' % (_INDENTATION * 1,
                                                             sample.name))

                        for library in sample.libraries:
                            print("%sLibrary: %s" % (_INDENTATION * 3,
                                                     library.name))
                            makefile_handle.write('%s"%s":\n'
                                                  % (_INDENTATION * 2,
                                                     library.name))

                            # Sinks are keyed by (lane-name, reads-type);
                            # set_in nests these as lane -> type -> sink
                            sink_cache = {}
                            destination = os.path.join(target_name,
                                                       "reads",
                                                       sample.name,
                                                       library.name)

                            for lane in library.lanes:
                                convert_reads(config, destination, lane, sink_cache)
                            ReadSink.close_all_sinks()

                            for lane_name in sorted(sink_cache):
                                makefile_handle.write('%s"%s":\n' % (_INDENTATION * 3, lane_name))
                                for (reads_type, sink) in sorted(sink_cache[lane_name].items()):
                                    makefile_handle.write('%s%s "%s"\n'
                                                          % (_INDENTATION * 4,
                                                             ("%s:" % (reads_type,)).ljust(20),
                                                             sink.filename))
                                makefile_handle.write("\n")
        print("\tDone ...")
    print()

    return 0
+
+
# Allow the module to be invoked directly as a script
if __name__ == '__main__':
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/bam_pipeline/trim_pipeline.py b/paleomix/tools/bam_pipeline/trim_pipeline.py
new file mode 100644
index 0000000..320136a
--- /dev/null
+++ b/paleomix/tools/bam_pipeline/trim_pipeline.py
@@ -0,0 +1,28 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import paleomix.tools.bam_pipeline.pipeline as pipeline
+
+
def main(argv):
    """Wrapper used by paleomix.main to invoke the trimming pipeline."""
    # The trimming pipeline is the BAM pipeline restricted to read trimming.
    return pipeline.main(argv, pipeline="trim")
diff --git a/paleomix/tools/bam_stats/__init__.py b/paleomix/tools/bam_stats/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/paleomix/tools/bam_stats/common.py b/paleomix/tools/bam_stats/common.py
new file mode 100644
index 0000000..cc9dc0e
--- /dev/null
+++ b/paleomix/tools/bam_stats/common.py
@@ -0,0 +1,183 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import argparse
+import collections
+
+import pysam
+
+from paleomix.ui import \
+ print_err, \
+ print_msg, \
+ print_warn
+
+from paleomix.common.fileutils import \
+ swap_ext
+from paleomix.common.bedtools import \
+ sort_bed_by_bamfile, \
+ read_bed_file
+
+
class BAMStatsError(RuntimeError):
    """Error raised by the bam_stats tools (e.g. on inconsistent input)."""
+
+
def collect_readgroups(args, handle):
    """Map read-group IDs in the BAM header to {"SM": ..., "LB": ...} dicts.

    The None key maps to "<NA>" placeholders, and is used for records
    without read-group information; with --ignore-readgroups, only that
    placeholder entry is returned."""
    readgroups = {None: {"SM": "<NA>", "LB": "<NA>"}}
    if args.ignore_readgroups:
        return readgroups

    for readgroup in handle.header.get("RG", ()):
        readgroups[readgroup["ID"]] = {"SM": readgroup["SM"],
                                       "LB": readgroup["LB"]}

    return readgroups
+
+
def collect_references(args, handle):
    """Return a dict mapping reference / region names to total lengths.

    Regions from --regions-file take precedence; otherwise the contig table
    of the BAM is used, unless it exceeds --max-contigs, in which case all
    contigs are aggregated into a single '<Genome>' pseudo-contig."""
    if args.regions:
        totals = collections.defaultdict(int)
        for region in args.regions:
            totals[region.name] += region.end - region.start
        return dict(totals)

    if handle.nreferences <= args.max_contigs:
        return dict(zip(handle.references, handle.lengths))

    return {"<Genome>": sum(handle.lengths)}
+
+
def collect_bed_regions(filename):
    """Read BED records from 'filename'; records lacking a name column are
    named after their contig, with a star appended (e.g. "chr1*")."""
    regions = []
    for record in read_bed_file(filename):
        if len(record) < 4:
            record.name = "%s*" % (record.contig,)

        regions.append(record)

    return regions
+
+
def parse_arguments(argv, ext):
    """Parse the command-line arguments shared by the bam_stats tools.

    'ext' is the default output extension (e.g. ".coverage"), and is also
    used to derive the tool name shown in usage / help texts.  Returns the
    parsed argparse namespace, with 'outfile', 'target_name', and
    'get_readgroup_func' filled in.
    """
    prog = "paleomix %s" % (ext.strip("."),)
    usage = "%s [options] sorted.bam [out%s]" % (prog, ext)
    parser = argparse.ArgumentParser(prog=prog, usage=usage)

    parser.add_argument("infile", metavar="BAM",
                        help="Filename of a sorted BAM file. If set to '-' "
                             "the file is read from STDIN.")
    parser.add_argument("outfile", metavar="OUTPUT", nargs='?',
                        help="Filename of output table; defaults to name of "
                             "the input BAM with a '%s' extension. If "
                             "set to '-' the table is printed to STDOUT."
                             % (ext,))
    parser.add_argument("--target-name", default=None, metavar="NAME",
                        help="Name used for 'Target' column; defaults to the "
                             "filename of the BAM file.")
    parser.add_argument("--regions-file", default=None, dest="regions_fpath",
                        help="BED file containing regions of interest; %s "
                             "is calculated only for these grouping by the "
                             "name used in the BED file, or the contig name "
                             "if no name has been specified for a record."
                             % (ext.strip("."),))
    parser.add_argument('--max-contigs', default=100, type=int,
                        help="The maximum number of contigs allowed in a BAM "
                             "file. If this number is exceeded, the entire "
                             "set of contigs is aggregated into one pseudo-"
                             "contig named '<Genome>'. This is done to "
                             "limit table sizes [default: %(default)s]")
    parser.add_argument('--ignore-readgroups',
                        default=False, action="store_true",
                        help="Ignore readgroup information in reads, and only "
                             "provide aggregated statistics; this is required "
                             "if readgroup information is missing or partial "
                             "[default: %(default)s]")
    parser.add_argument('--overwrite-output',
                        default=False, action="store_true",
                        help="Overwrite output file if it it exists; by "
                             "default, the script will terminate if the file "
                             "already exists.")

    args = parser.parse_args(argv)
    if not args.outfile:
        # Default output: the input filename with the tool-specific extension
        args.outfile = swap_ext(args.infile, ext)

    if args.ignore_readgroups:
        args.get_readgroup_func = _get_readgroup_ignored
    else:
        args.get_readgroup_func = _get_readgroup

    if not args.target_name:
        if args.infile == "-":
            args.target_name = "<STDIN>"
        else:
            args.target_name = os.path.basename(args.infile)

    # Refuse to overwrite an existing output file unless explicitly allowed
    if os.path.exists(args.outfile) and not args.overwrite_output:
        parser.error("Destination filename already exists (%r); use option "
                     "--overwrite-output to allow overwriting of this file."
                     % (args.outfile,))

    return args
+
+
def main_wrapper(process_func, argv, ext):
    """Shared entry point for the coverage / depth tools.

    Parses arguments, reads the optional BED regions, opens the BAM file,
    validates its sort-order, and finally invokes
    'process_func(handle, args)'; returns an exit code.
    """
    args = parse_arguments(argv, ext)
    args.regions = None
    if args.regions_fpath:
        try:
            args.regions = collect_bed_regions(args.regions_fpath)
        except ValueError, error:
            print_err("ERROR: Failed to parse BED file %r:\n%s"
                      % (args.regions_fpath, error))
            return 1

    print_msg("Opening %r" % (args.infile,))
    with pysam.Samfile(args.infile) as handle:
        sort_order = handle.header.get('HD', {}).get('SO')
        if sort_order is None:
            # Missing sort-order flag is only a warning; the file may still
            # be coordinate-sorted in practice
            print_warn("WARNING: BAM file %r is not marked as sorted!"
                       % (args.infile,))
        elif sort_order != 'coordinate':
            print_err("ERROR: BAM file %r is %s-sorted, but only "
                      "coordinate-sorted BAMs are supported!"
                      % (args.infile, sort_order))
            return 1

        sort_bed_by_bamfile(handle, args.regions)
        return process_func(handle, args)
+
+
+def _get_readgroup(record):
+ try:
+ return record.opt("RG")
+ except KeyError:
+ return None
+
+
+def _get_readgroup_ignored(_):
+ return None
diff --git a/paleomix/tools/bam_stats/coverage.py b/paleomix/tools/bam_stats/coverage.py
new file mode 100644
index 0000000..6536974
--- /dev/null
+++ b/paleomix/tools/bam_stats/coverage.py
@@ -0,0 +1,191 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import sys
+import datetime
+import collections
+
+from paleomix.common.utilities import \
+ get_in, \
+ set_in
+from paleomix.common.text import \
+ padded_table, \
+ parse_padded_table
+
+from paleomix.tools.bam_stats.common import \
+ BAMStatsError
+
+
+##############################################################################
+##############################################################################
+##
+
# Per read-group counters: hit counts by read type (SE, PE mates, Collapsed),
# CIGAR base totals (M, I, D), and the size of the region covered
READGROUP_TEMPLATE = {"SE": 0, "PE_1": 0, "PE_2": 0, "Collapsed": 0,
                      "Hits": 0, "M": 0, "I": 0, "D": 0, "Size": 0}
+
+
# Header prepended to output tables; the '%s' is replaced with a timestamp
TABLE_HEADER = """# Timestamp: %s
#
# Columns:
# Contig: Contig, chromosome, or feature for which a depth histogram was
# created. Unnamed features are named after the chromosome or
# contig on which they are located, with a star appended. For
# example "chr1*".
# Size: The total size of the region. Multiple features with the same
# name are combined into one row, with the size representing to
# total of these. Note that overlapping bases are counted 2 (or
# more) times.
# Hits: Sum of SE, PE_1, and PE_2 hits. Note that supplementary
# alignments, duplicates, reads that failed QC, secondary
# alignments, and unmapped reads are ignored.
# SE, PE_*: Number of Single Ended, and Pair Ended (mate 1 and 2) hits
# overlapping the current contig or intervals. Note that a hit
# may be counted multiple times if it overlaps multiple intervals
# Collapsed: Number of hits for PE pair collapsed into a single read.
# M, I, D: Number of aligned (M), inserted (I) and deleted (D) bases
# relative to references.
# Coverage: Average number of bases covering each position in the
# contig(s)/intervals(s).
"""
+
+
def calculate_totals(table):
    """Adds "*" totals rows to a nested coverage table in-place.

    First verifies that every contig has a consistent size across all
    read-groups (raising BAMStatsError otherwise), then inserts
    per-library, per-sample, and per-target totals under the key "*";
    returns the (modified) table.
    """
    lengths = {}
    for samples in table.itervalues():
        for libraries in samples.values():
            for contigs in libraries.values():
                for (name, contig) in contigs.iteritems():
                    size = lengths.get(name)
                    if (size is not None) and (size != contig["Size"]):
                        # Same contig reported with two different sizes
                        raise BAMStatsError(name)
                    lengths[name] = contig["Size"]

    for (name, samples) in sorted(table.items()):
        for (sample, libraries) in sorted(samples.items()):
            for (library, contigs) in sorted(libraries.items()):
                totals = _calculate_totals_in(contigs, lengths)
                set_in(table, (name, sample, library), totals)

            totals = _calculate_totals_in(libraries, lengths)
            set_in(table, (name, sample, "*"), totals)

        # NOTE(review): aggregates across the entire table rather than just
        # the current 'name' subtree -- verify that this is intentional
        set_in(table, (name, "*", "*"), _calculate_totals_in(table, lengths))
    return table
+
+
def build_rows(table):
    """Flatten a nested (name/sample/library/contig) coverage table into a
    list of rows for padded_table, with '#' markers separating sections;
    trailing markers are stripped."""
    header = ("Name", "Sample", "Library", "Contig", "Size", "Hits", "SE",
              "PE_1", "PE_2", "Collapsed", "M", "I", "D", "Coverage")
    rows = [header]

    for name, samples in sorted(table.items()):
        for sample, libraries in sorted(samples.items()):
            for library, contigs in sorted(libraries.items()):
                for contig, counts in sorted(contigs.items()):
                    hits = (counts["SE"] + counts["PE_1"]
                            + counts["PE_2"] + counts["Collapsed"])
                    rows.append([name,
                                 sample,
                                 library,
                                 contig,
                                 counts["Size"],
                                 hits,
                                 counts["SE"],
                                 counts["PE_1"],
                                 counts["PE_2"],
                                 counts["Collapsed"],
                                 counts["M"],
                                 counts["I"],
                                 counts["D"],
                                 float(counts["M"]) / counts["Size"]])
                rows.append("#")
                rows.append("#")

    # Drop separator markers trailing the final section
    while rows[-1] == "#":
        rows.pop()
    return rows
+
+
def read_table(table, filename):
    """Merge per-contig counters from an existing coverage table file into
    'table', keyed by (Name, Sample, Library, Contig); totals rows (any key
    component equal to "*") are skipped."""
    with open(filename) as table_file:
        for record in parse_padded_table(table_file):
            key = (record["Name"], record["Sample"],
                   record["Library"], record["Contig"])
            if "*" in key:
                continue

            subtable = get_in(table, key)
            if subtable is None:
                # First time this contig is seen; start from a blank template
                subtable = dict(READGROUP_TEMPLATE)
                subtable["Size"] = int(record["Size"])
                set_in(table, key, subtable)

            assert int(subtable["Size"]) == int(record["Size"])
            for field in READGROUP_TEMPLATE:
                if field != "Size":
                    subtable[field] += int(record.get(field, 0))
+
+
def write_table(table, filename):
    """Write the coverage table (with "*" totals added) as a padded table
    to 'filename'; a filename of '-' writes to STDOUT instead."""
    table = calculate_totals(table)
    rows = build_rows(table)

    handle = sys.stdout if filename == "-" else open(filename, "w")
    try:
        handle.write(TABLE_HEADER % datetime.datetime.now().isoformat())
        for line in padded_table(rows):
            handle.write(line)
            handle.write("\n")
    finally:
        # Never close STDOUT; it is shared with the rest of the program
        if handle is not sys.stdout:
            handle.close()
+
+
def _calculate_totals_in(tables, lengths):
    """Aggregate read-group counters from an arbitrarily nested dict of
    dicts; returns {key: counters} including a "*" entry totalling every
    leaf table found."""
    def _defaults():
        return dict(READGROUP_TEMPLATE)

    totals = collections.defaultdict(_defaults)
    total_size = sum(lengths.itervalues())

    # Worklist of (key, subtable) pairs; leaf tables are recognized by the
    # presence of the "SE" counter
    subtables = tables.items()
    while subtables:
        subtable_key, subtable = subtables.pop()
        if subtable_key == "*":
            # Previously-computed totals rows are not aggregated again
            totals[subtable_key]["Size"] = total_size
        elif "SE" in subtable:
            for key in READGROUP_TEMPLATE:
                if key != "Size":
                    totals[subtable_key][key] += subtable[key]
                    totals["*"][key] += subtable[key]
                else:
                    # key == "Size": sizes come from the 'lengths' mapping
                    totals[subtable_key][key] = lengths[subtable_key]
                    totals["*"][key] = total_size
        else:
            # Intermediate level; queue its children for processing
            subtables.extend(subtable.items())

    return dict(totals)
diff --git a/paleomix/tools/cat.py b/paleomix/tools/cat.py
new file mode 100755
index 0000000..78e349b
--- /dev/null
+++ b/paleomix/tools/cat.py
@@ -0,0 +1,104 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""
+Wrapper around cat / zcat / bzcat, which selects the appropriate command
+based on the files specified on the command-line. Input files may be a mix
+of different types of compressed / uncompressed files.
+"""
+import os
+import sys
+import argparse
+import itertools
+import subprocess
+
+
+def _select_output(filename):
+ """Returns a file-handle for 'filename'; if filename is '-' is stdout."""
+ if filename in (None, '-'):
+ return sys.stdout
+
+ return open(filename, 'wb')
+
+
+def _select_cat(filename):
+ """Identifies the compression scheme of a given file (if any) and return a
+ tuple with the appropriate cat command to decompress it.
+ """
+ with open(filename) as source:
+ header = source.read(2)
+ # The command "gzip -cd" is used instead of "zcat" because
+ # OSX ships a broken zcat command (only accepts *.Z files).
+ if header == "\x1f\x8b":
+ return ("gzip", "-cd")
+ elif header == "BZ":
+ return ("bzip2", "-cd")
+ return ("cat",)
+
+
def _call(input_files, output_file):
    """Call an appropriate cat on each input file, writing the contents to the
    file specified by 'output_file'; if the latter is '-', STDOUT is used.
    Returns 0 on success; raises CalledProcessError if any cat fails.
    """
    out_handle = _select_output(output_file)
    try:
        # Consecutive files sharing a compression scheme are handled by a
        # single invocation of the corresponding cat command
        for (command, filenames) in itertools.groupby(input_files,
                                                      _select_cat):
            command = list(command)
            command.extend(filenames)

            subprocess.check_call(command,
                                  stdout=out_handle,
                                  preexec_fn=os.setsid,
                                  close_fds=True)
        return 0
    finally:
        # The previous 'with' form closed sys.stdout when writing to STDOUT,
        # preventing any further output; only close handles we opened.
        if out_handle is not sys.stdout:
            out_handle.close()
+
+
def parse_args(argv):
    """Returns the parsed command-line arguments for 'paleomix cat'."""
    parser = argparse.ArgumentParser(prog="paleomix cat")
    parser.add_argument(
        "file", nargs="+",
        help="One or more input files; these may be "
             "uncompressed, compressed using gzip, or "
             "compressed using bzip2.")
    parser.add_argument(
        "--output", default=None,
        help="Write output to this file; by default, output "
             "is written to STDOUT.")

    return parser.parse_args(argv)
+
+
def main(argv):
    """Main function; takes a list of arguments but excluding sys.argv[0].

    Returns 0 on success, and 1 if concatenation failed for any reason.
    """
    args = parse_args(argv)

    try:
        return _call(input_files=args.file,
                     output_file=args.output)
    except Exception as error:
        # Catch-all so that failures (missing files, failed decompression)
        # are reported along with the command-line that produced them.
        sys.stderr.write("Error running 'paleomix cat':\n %s\n\n" % error)
        sys.stderr.write("Command = %s\n" % (" ".join(sys.argv),))
        return 1
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/cleanup.py b/paleomix/tools/cleanup.py
new file mode 100755
index 0000000..c654456
--- /dev/null
+++ b/paleomix/tools/cleanup.py
@@ -0,0 +1,386 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""
+Reads SAM or BAM from STDIN and outputs cleaned and sorted BAM to STDOUT.
+
+The cleanup involves setting header-flags, fixing mate information, clearing
+various fields of unmapped reads (due to BWA leaving CIGARs in unmapped reads),
+sorting the input, and updating MD and NM tags.
+
+This script also solves a problem with parsing record-less files by SAMTools,
+which fails with a parse error if the input SAM file only contains a header.
+This is mainly a problem when aligning ancient DNA lanes against mt genomes,
+as there may not be any hits in an entire lane. For example, the following
+command will not work:
+$ samtools view -H INPUT.BAM | samtools view -Sbu -
+
+"""
+import sys
+import copy
+import argparse
+
+import pysam
+
+import paleomix.tools.factory
+
+import paleomix.common.procs as processes
+import paleomix.common.versions as versions
+
+from paleomix.nodes.samtools import SAMTOOLS_VERSION
+
+
+# Mask to select flags that are relevant to SE reads; this excludes flags where
+# no assumptions can be made if 0x1 is not set, per the SAM specification
+_SE_FLAGS_MASK = ~(0x2 | 0x8 | 0x20 | 0x40 | 0x80)
+
+
+def _set_sort_order(header):
+ """Updates a BAM header to indicate coordinate sorting."""
+ hd_dict = header.setdefault("HD", {"GO": "none", "VN": "1.0"})
+ hd_dict["SO"] = "coordinate"
+
+
+def _set_pg_tags(header, tags):
+ """Updates PG tags in a BAM header, taking a sequence of ID:TAG:VALUEs."""
+ for tag in tags:
+ pg_id, pg_field, pg_value = tag.split(":")
+
+ for pg_dict in header.setdefault("PG", []):
+ if pg_dict.get("ID") == pg_id:
+ pg_dict[pg_field] = pg_value
+ break
+ else:
+ header["PG"].append({"ID": pg_id, pg_field: pg_value})
+
+
+def _set_rg_tags(header, rg_id, rg_tags):
+ """Updates RG tags in a BAM header, taking a sequence of TAG:VALUEs."""
+ readgroup = {"ID": rg_id}
+ for tag in rg_tags:
+ rg_field, rg_value = tag.split(":")
+ readgroup[rg_field] = rg_value
+ header["RG"] = [readgroup]
+
+
def _pipe_to_bam():
    """Simply pipes a BAM/SAM file to stdout; this is required to handle SAM
    files that do not contain records (i.e. only a header), which are not
    properly handled by "samtools view -S -", resulting in a parse failure.
    Always returns 0.
    """
    with pysam.Samfile("-", "r") as input_handle:
        # 'wbu' selects uncompressed BAM; later pipeline stages (re)compress
        with pysam.Samfile("-", "wbu", template=input_handle) as output_handle:
            for record in input_handle:
                output_handle.write(record)

    return 0
+
+
def _cleanup_record(record):
    """Cleans up the properties of a BAM record, ensuring that only appropriate
    flags and values are set, such that the record follows section 1.4 of the
    SAM specification (https://samtools.github.io/hts-specs/SAMv1.pdf). The
    record itself (or a new record) is returned.
    """
    if not record.is_paired:
        # Unset 0x2 (properly aligned), 0x8 (next mate unmapped),
        # 0x20 (next mate reverse), 0x40 (first mate), and 0x80 (last mate).
        record.flag = record.flag & (~0xEA)
        # -1 represents an unset mate reference / position
        record.rnext = -1
        record.pnext = -1
        record.tlen = 0

    if record.is_unmapped:
        record.mapq = 0
        record.cigar = None
        # Unset 0x2 (properly aligned), 0x100 (secondary), and 0x800 (chimeric)
        record.flag = record.flag & (~0x902)

        if record.mate_is_unmapped:
            record.rnext = -1
            record.pnext = -1

        # Per the spec, unmapped reads should be placed with their mate
        record.tid = record.rnext
        record.pos = record.pnext
        record.tlen = 0
    elif record.mate_is_unmapped:
        # Mapped read with an unmapped mate; the mate is placed at this read
        record.rnext = record.tid
        record.pnext = record.pos
        record.tlen = 0

    return record
+
+
def _filter_record(args, record):
    """Returns True if the record should be filtered (excluded), based on the
    --exclude-flags and --require-flags options. Certain flags are ignored
    when filtering SE reads, namely those not included in _SE_FLAGS_MASK.
    """
    exclude_flags = args.exclude_flags
    require_flags = args.require_flags
    if not record.is_paired:
        # Paired-end-only flags carry no meaning for SE reads
        exclude_flags &= _SE_FLAGS_MASK
        require_flags &= _SE_FLAGS_MASK

    if record.flag & exclude_flags:
        return True

    # True if at least one of the required flags is not set on the record
    return (record.flag & require_flags) != require_flags
+
+
def _cleanup_unmapped(args, cleanup_sam):
    """Reads a BAM (or SAM, if cleanup_sam is True) file from STDIN, and
    filters reads according to the filters specified in the commandline
    arguments 'args'. The resulting records are written to STDOUT in
    uncompressed BAM format. The output BAM is marked as sorted (under the
    assumption that 'samtools sort' is to be run on the output) and PG tags are
    updated if specified in the args.
    """

    filter_by_flag = bool(args.exclude_flags or args.require_flags)
    spec = "r" if cleanup_sam else "rb"
    with pysam.Samfile("-", spec) as input_handle:
        # Work on a copy, leaving the input handle's header untouched
        header = copy.deepcopy(input_handle.header)
        _set_sort_order(header)
        _set_pg_tags(header, args.update_pg_tag)
        if args.rg_id is not None:
            _set_rg_tags(header, args.rg_id, args.rg)

        with pysam.Samfile("-", "wbu", header=header) as output_handle:
            for record in input_handle:
                # Ensure that the properties make sense before filtering
                record = _cleanup_record(record)

                # The quality filter applies to mapped reads only
                if not record.is_unmapped and (record.mapq < args.min_quality):
                    continue
                elif filter_by_flag and _filter_record(args, record):
                    continue

                if args.rg_id is not None:
                    # Ensure that only one RG tag is set
                    tags = [(key, value) for (key, value) in record.tags
                            if key != "RG"]
                    tags.append(("RG", args.rg_id))
                    record.tags = tags

                output_handle.write(record)

    return 0
+
+
def _setup_single_ended_pipeline(procs, bam_cleanup):
    """Sets up the pipeline for SE reads: SAM on STDIN is converted to BAM
    and cleaned / filtered in a single 'cleanup-sam' sub-process. The
    process is recorded in 'procs' and returned; its STDOUT carries the
    resulting (uncompressed) BAM."""
    # Convert input to BAM and cleanup / filter reads
    procs["pipe"] = processes.open_proc(bam_cleanup + ['cleanup-sam'],
                                        stdin=sys.stdin,
                                        stdout=processes.PIPE)
    # STDIN is inherited by the child process; close our copy of the handle
    sys.stdin.close()

    return procs["pipe"]
+
+
def _setup_paired_ended_pipeline(args, procs, bam_cleanup):
    """Sets up the pipeline for PE reads: SAM on STDIN is converted to BAM,
    mate information is repaired by 'samtools fixmate', and records are
    then cleaned / filtered. Each process is recorded in 'procs'; the last
    process is returned, with cleaned BAM on its STDOUT."""
    # Convert input to (uncompressed) BAM
    procs["pipe"] = processes.open_proc(bam_cleanup + ["pipe"],
                                        stdin=sys.stdin,
                                        stdout=processes.PIPE)
    # STDIN is inherited by the child process; close our copy of the handle
    sys.stdin.close()

    # Fix mate information for PE reads
    call_fixmate = ['samtools', 'fixmate']
    if args.samtools1x == "yes":
        # samtools v1.x requires the output format to be given explicitly
        call_fixmate.extend(("-O", "bam"))

    procs["fixmate"] = processes.open_proc(call_fixmate + ['-', '-'],
                                           stdin=procs["pipe"].stdout,
                                           stdout=processes.PIPE)
    procs["pipe"].stdout.close()

    # Cleanup / filter reads. Must be done after 'fixmate', as BWA may produce
    # hits where the mate-unmapped flag is incorrect, which 'fixmate' fixes.
    procs["cleanup"] = processes.open_proc(bam_cleanup + ['cleanup'],
                                           stdin=procs["fixmate"].stdout,
                                           stdout=processes.PIPE)
    procs["fixmate"].stdout.close()

    return procs["cleanup"]
+
+
def _build_wrapper_command(args):
    """Builds the 'paleomix cleanup' command used to invoke the hidden
    sub-commands ('pipe', 'cleanup', 'cleanup-sam'), forwarding the options
    from 'args' relevant to those steps. Returns the command as a list."""
    bam_cleanup = paleomix.tools.factory.new("cleanup")
    if args.fasta is not None:
        bam_cleanup.set_option('--fasta', args.fasta)
    bam_cleanup.set_option('--temp-prefix', args.temp_prefix)
    bam_cleanup.set_option('--min-quality', str(args.min_quality))
    bam_cleanup.set_option('--exclude-flags', hex(args.exclude_flags))
    # FIX: --require-flags was accepted by parse_args but never forwarded,
    # so '-f' was silently ignored when running the full pipeline
    bam_cleanup.set_option('--require-flags', hex(args.require_flags))
    bam_cleanup.set_option('--samtools1x', args.samtools1x)

    for value in args.update_pg_tag:
        bam_cleanup.add_option('--update-pg-tag', value)

    if args.rg_id is not None:
        bam_cleanup.set_option('--rg-id', args.rg_id)
        for value in args.rg:
            bam_cleanup.add_option('--rg', value)

    return bam_cleanup.call
+
+
def _run_cleanup_pipeline(args):
    """Runs the full cleanup pipeline: SAM/BAM conversion and filtering,
    'samtools sort', and (when a reference FASTA is given) 'samtools calmd'.
    Returns 0 if all processes exited successfully, and 1 otherwise."""
    bam_cleanup = _build_wrapper_command(args)
    procs = {}
    try:
        # Update 'procs' and get the last process in the pipeline
        if args.paired_end:
            last_proc = _setup_paired_ended_pipeline(args, procs, bam_cleanup)
        else:
            last_proc = _setup_single_ended_pipeline(procs, bam_cleanup)

        # '-l 0' selects uncompressed output from 'sort'
        call_sort = ['samtools', 'sort', '-l', '0']
        if args.samtools1x == "yes":
            call_sort.extend(('-O', 'bam', '-T', args.temp_prefix))
        else:
            # Sort, output to stdout (-o)
            call_sort.extend(('-o', '-', args.temp_prefix))

        sort_stdout = None if args.fasta is None else processes.PIPE
        procs["sort"] = processes.open_proc(call_sort,
                                            stdin=last_proc.stdout,
                                            stdout=sort_stdout)
        last_proc.stdout.close()

        # Update NM and MD tags; output BAM (-b) to stdout
        if args.fasta is not None:
            call_calmd = ['samtools', 'calmd', '-b', '-', args.fasta]
            procs["calmd"] = processes.open_proc(call_calmd,
                                                 stdin=procs["sort"].stdout)
            procs["sort"].stdout.close()

        if any(processes.join_procs(procs.values())):
            return 1
        return 0
    except:
        # Terminate any still-running children before propagating the error
        for proc in procs.itervalues():
            proc.terminate()
        raise
+
+
def parse_args(argv):
    """Parses and returns the command-line arguments for 'paleomix cleanup'.

    The optional positional 'command' selects one of the hidden
    sub-commands used internally by the pipeline; when omitted, the full
    cleanup pipeline is run (see main).
    """
    prog = "paleomix cleanup"
    usage = "%s --temp-prefix prefix --fasta reference.fasta < in.sam" \
        % (prog,)

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    # "Hidden" commands, invoking the various sub-parts of this script
    parser.add_argument('command', choices=('pipe', 'cleanup', 'cleanup-sam'),
                        nargs="?", help=argparse.SUPPRESS)
    # Specifies if the 'cleanup' step should expect SAM input
    parser.add_argument('--cleanup-sam', default=False, action="store_true",
                        help=argparse.SUPPRESS)

    parser.add_argument('--fasta', default=None,
                        help="Reference FASTA sequence; if set, the calmd "
                             "command is used to re-calculate MD tags.")
    parser.add_argument('--temp-prefix', required=True,
                        help="REQUIRED: Prefix for temp files")
    parser.add_argument("-q", "--min-quality", type=int, default=0,
                        help="Exclude aligned reads with a mapping quality "
                             "below this value; note that this filter ONLY "
                             "applies to aligned reads [Default: %(default)s]")
    parser.add_argument("-f", "--require-flags", default=0,
                        type=lambda value: int(value, 0),  # Handle hex, etc.
                        help="Only include reads with all of these flags set; "
                             "note that flags only valid for paired-end reads "
                             "(0x2, 0x8, 0x20, 0x40, 0x80) are ignored when "
                             "processing single-end reads "
                             "[Default: %(default)s].")
    parser.add_argument("-F", "--exclude-flags", default=0,
                        type=lambda value: int(value, 0),  # Handle hex, etc.
                        help="Exclude reads with any of these flags set; "
                             "note that flags only valid for paired-end reads "
                             "(0x2, 0x8, 0x20, 0x40, 0x80) are ignored when "
                             "processing single-end reads "
                             "[Default: %(default)s].")
    parser.add_argument('--paired-end', default=False, action="store_true",
                        help='If enabled, additional processing of PE reads '
                             'is carried out, including updating of mate '
                             'information [Default: off]')
    # TODO: Remove alias added for backwards compatibility:
    parser.add_argument('--paired-ended', dest='paired_end',
                        action="store_true", help=argparse.SUPPRESS)

    # FIX: help text read "Update one PG tags"
    parser.add_argument("--update-pg-tag", default=[], action="append",
                        help="Update one PG tag with the given values, "
                             "creating the tag if it does not already exist. "
                             "Takes arguments in the form \"PGID:TAG:VALUE\".")
    parser.add_argument('--rg-id', default=None,
                        help="If set, the read-group is overwritten based "
                             "on tags set using the --rg option, using the "
                             "id specified using --rg-id.")
    parser.add_argument('--rg', default=[], action="append",
                        help="Create readgroup values 'ID:TAG:VALUE' "
                             "represented using a string as shown.")

    # Option to select between incompatible parameters for SAMTools v0.1.x and
    # for samtools v1.x; this is needed for samtools 'sort' and 'fixmate'.
    parser.add_argument('--samtools1x', choices=('yes', 'no'),
                        help=argparse.SUPPRESS)

    return parser.parse_args(argv)
+
+
def main(argv):
    """Entry point for 'paleomix cleanup'. Determines the SAMTools version
    (if not given via --samtools1x), then either runs one of the internal
    sub-commands ('pipe', 'cleanup', 'cleanup-sam') or the full cleanup
    pipeline. Returns 0 on success, and 1 on error."""
    args = parse_args(argv)

    # The required call signatures for 'sort' / 'fixmate' differ between
    # SAMTools v0.1.x and v1.x; detect the version unless specified
    if args.samtools1x is None:
        sys.stderr.write("Determining SAMTools version ... ")
        sys.stderr.flush()

        try:
            sys.stderr.write("v%i.%i.%i found\n" % SAMTOOLS_VERSION.version)

            if SAMTOOLS_VERSION.version >= (1, 0):
                args.samtools1x = "yes"
            elif SAMTOOLS_VERSION.version == (0, 1, 19):
                args.samtools1x = "no"
            else:
                sys.stderr.write("ERROR: Only SAMTools versions v0.1.19 and "
                                 "v1.0+ are supported; please upgrade / "
                                 "replace the installed copy of SAMTools!\n")
                return 1
        except versions.VersionRequirementError, error:
            sys.stderr.write("ERROR: Could not determine SAMTools version: "
                             "%s\n" % (error,))
            return 1

    if args.command == "pipe":
        return _pipe_to_bam()
    elif args.command == "cleanup":
        return _cleanup_unmapped(args, cleanup_sam=False)
    elif args.command == "cleanup-sam":
        return _cleanup_unmapped(args, cleanup_sam=True)

    sys.stderr.write("Reading SAM file from STDIN ...\n")
    return _run_cleanup_pipeline(args)
diff --git a/paleomix/tools/coverage.py b/paleomix/tools/coverage.py
new file mode 100755
index 0000000..b28b1ef
--- /dev/null
+++ b/paleomix/tools/coverage.py
@@ -0,0 +1,193 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import sys
+import copy
+
+from paleomix.common.utilities import \
+ get_in, \
+ set_in
+from paleomix.common.timer import \
+ BAMTimer
+from paleomix.common.bamfiles import \
+ BAMRegionsIter
+
+from paleomix.tools.bam_stats.common import \
+ collect_readgroups, \
+ collect_references, \
+ main_wrapper
+from paleomix.tools.bam_stats.coverage import \
+ READGROUP_TEMPLATE, \
+ write_table
+
+
+##############################################################################
+##############################################################################
+
def build_region_template(args, handle):
    """Returns a dict mapping each read-group key found in 'handle' to a
    fresh copy of READGROUP_TEMPLATE; used as the per-region counter
    table."""
    return dict((key, dict(READGROUP_TEMPLATE))
                for key in collect_readgroups(args, handle))
+
+
def get_region_table(counts, region, template):
    """Returns the counter table registered for 'region' in 'counts',
    creating and registering a deep copy of 'template' on first access."""
    existing = counts.get(region)
    if existing is not None:
        return existing

    fresh = copy.deepcopy(template)
    counts[region] = fresh
    return fresh
+
+
+##############################################################################
+##############################################################################
+
def create_or_get_subtable(table, subtable_key, size):
    """Returns the statistics sub-table stored at 'subtable_key' in the
    nested 'table', creating it from READGROUP_TEMPLATE (with the given
    contig 'size') if not already present."""
    subtable = get_in(table, subtable_key)
    if subtable is None:
        subtable = dict(READGROUP_TEMPLATE)
        subtable["Size"] = size
        set_in(table, subtable_key, subtable)
    return subtable
+
+
def build_table(args, handle, counts):
    """Transforms the raw per-region 'counts' into the nested output table
    keyed by (target, sample, library, contig), using contig sizes from
    the BAM header."""
    references = collect_references(args, handle)

    table = {}
    for (key, readgroup) in collect_readgroups(args, handle).iteritems():
        sample = readgroup["SM"]
        library = readgroup["LB"]

        # Exclude counts for reads with no read-groups, if none such were seen
        if key is None and not args.ignore_readgroups:
            for reference in references:
                if any(counts.get(reference, {}).get(key, {}).itervalues()):
                    break
            else:
                continue

        for (reference, size) in references.iteritems():
            subtable_key = (args.target_name, sample, library, reference)
            subtable = create_or_get_subtable(table, subtable_key, size)

            statistics = counts.get(reference, {}).get(key, {})
            for (stat, value) in statistics.iteritems():
                subtable[stat] += value

    return table
+
+
def print_table(args, handle, counts):
    """Builds the final coverage table from the raw counts and writes it to
    args.outfile ('-' selects STDOUT)."""
    table = build_table(args, handle, counts)
    write_table(table, args.outfile)
+
+
+##############################################################################
+##############################################################################
+
# Maps CIGAR operations to table columns; sequence match / mismatch
# operations ('=' and 'X') are counted as alignment matches ('M').
_CIGAR_COLUMNS = {0: "M", 1: "I", 2: "D", 7: "M", 8: "M"}


def process_record(subtable, record, flags, region):
    """Accumulates the read-type count and per-CIGAR-operation base counts
    of a single aligned record into 'subtable'; only bases falling inside
    'region' (a half-open interval given by .start / .end) are counted.
    """
    # Reads merged by the trimming step are (presumably) renamed with an
    # M_/MT_ prefix -- TODO confirm against the trimming pipeline
    qname = record.qname
    if qname.startswith("M_") or qname.startswith("MT_"):
        subtable["Collapsed"] += 1
    elif flags & 0x40:  # first of pair
        subtable["PE_1"] += 1
    elif flags & 0x80:  # second of pair
        subtable["PE_2"] += 1
    else:  # Singleton
        subtable["SE"] += 1

    position = record.pos
    start = region.start
    end = region.end

    for (cigar, num) in record.cigar:
        # Clamp the span of this operation to the region of interest
        left = min(max(position, start), end)
        right = min(max(position + num, start), end)
        bases_in_region = right - left
        assert 0 <= bases_in_region <= num

        # 0 = 'M', 1 = 'I', 2 = 'D', 7 = '=', 8 = 'X'
        if cigar in _CIGAR_COLUMNS:
            # FIX: the original 'MID MM'[cigar] lookup raised IndexError for
            # CIGAR ops 7 ('=') and 8 ('X'), the string being too short
            if bases_in_region:
                subtable[_CIGAR_COLUMNS[cigar]] += bases_in_region

            if cigar != 1:  # Everything but insertions
                position += num
        elif cigar == 3:  # N
            position += num
+
+
def process_file(handle, args):
    """Iterates over the regions of a coordinate-sorted BAM file,
    accumulating per-readgroup coverage statistics, and writes the final
    table. Returns 0 on success, or 1 if the input proves to be unsorted."""
    timer = BAMTimer(handle, step=1000000)

    counts = {}
    last_tid = 0
    region_template = build_region_template(args, handle)
    for region in BAMRegionsIter(handle, args.regions):
        if region.name is None:
            # Trailing unmapped reads
            break

        # Collapse contigs into a single pseudo-contig when there are too
        # many to report individually (and no explicit regions were given)
        name = region.name
        if not args.regions and (handle.nreferences > args.max_contigs):
            name = '<Genome>'

        last_pos = 0
        region_table = get_region_table(counts, name, region_template)
        for (position, records) in region:
            for record in records:
                readgroup = args.get_readgroup_func(record)
                readgroup_table = region_table.get(readgroup)
                if readgroup_table is None:
                    # Unknown readgroups are treated as missing readgroups
                    readgroup_table = region_table[None]

                process_record(readgroup_table, record, record.flag, region)
                timer.increment(read=record)

            # Coverage counting assumes sorted input; bail out otherwise
            if (region.tid, position) < (last_tid, last_pos):
                sys.stderr.write("ERROR: Input BAM file is unsorted\n")
                return 1

            last_pos = position
            last_tid = region.tid

    timer.finalize()

    print_table(args, handle, counts)

    return 0
+
+
def main(argv):
    """Command-line entry point; argument handling and the BAM file itself
    are managed by bam_stats.common.main_wrapper."""
    return main_wrapper(process_file, argv, ".coverage")
+
+
+##############################################################################
+##############################################################################
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/depths.py b/paleomix/tools/depths.py
new file mode 100755
index 0000000..31e3b63
--- /dev/null
+++ b/paleomix/tools/depths.py
@@ -0,0 +1,404 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import sys
+import datetime
+import itertools
+import collections
+
+from paleomix.common.timer import \
+ BAMTimer
+from paleomix.common.text import \
+ padded_table
+from paleomix.common.bamfiles import \
+ BAMRegionsIter
+
+from paleomix.tools.bam_stats.common import \
+ collect_references, \
+ collect_readgroups, \
+ main_wrapper
+
+
+##############################################################################
+##############################################################################
+##
+
+# Maximum depth to record, and hence the number of columns in output table
+_MAX_DEPTH = 200
+# Maximum number of count patterns (numbers of bases per library for a given
+# site) to cache for bulk processing; see MappingsToTotals for implementation
+_MAX_CACHE_SIZE = 10000
+
+
+# Header prepended to output tables
+_HEADER = """# Timestamp: %s
+#
+# Columns:
+# Contig: Contig, chromosome, or feature for which a depth histogram was
+# created. Unnamed features are named after the chromosome or
+# contig on which they are located, with a star appended. For
+# example "chr1*". If the maximum number of contigs was exceeded,
+# these are collapsed into one meta-contig named "<Genome>".
+# Size: The total size of the region. Multiple features with the same
+# name are combined into one row, with the size representing the
+# total for these. Note that overlapping bases are counted 2 (or
+# more) times.
+# MaxDepth: Maximum depth to use when calling SNPs, in order to exclude
+# (at least) the 0.5%% most extreme sites based on read depth,
+# not including sites with depth 0.
+# MD_*: Fraction of sites with a minimum depth of 1-200.
+#"""
+
+
+##############################################################################
+##############################################################################
+
class MappingToTotals(object):
    """Maps per-(sample, library) base counts observed at individual
    positions into the depth-histogram totals for a single region.

    Identical count patterns are cached and applied in bulk (see
    process_counts / finalize), since pileups tend to repeat the same
    pattern across many consecutive positions.
    """

    def __init__(self, totals, region, smlbid_to_smlb):
        self._region = region
        self._totals = totals
        self._map_by_smlbid, self._totals_src_and_dst \
            = self._build_mappings(totals, region.name, smlbid_to_smlb)
        # Maps count patterns (tuples) to the number of times each was seen
        self._cache = collections.defaultdict(int)

    def process_counts(self, counts, last_pos, cur_pos):
        """Consumes counters from the deque 'counts' for the positions in
        the half-open range [last_pos, cur_pos), caching counters that fall
        inside the region for later bulk processing."""
        start = self._region.start
        end = self._region.end

        # Pileups tends to contain identical stretches, so
        # try to avoid repeated lookups by aggregating these
        repeats = 1
        last_count = None
        while counts and (last_pos < cur_pos):
            count = counts.popleft()
            if start <= last_pos < end:
                if count == last_count:
                    repeats += 1
                else:
                    if last_count is not None:
                        self._cache[tuple(last_count)] += repeats
                    last_count = count
                    repeats = 1
            last_pos += 1

        if last_count is not None:
            self._cache[tuple(last_count)] += repeats

        # Bound memory usage by flushing the cache when it grows too large
        if len(self._cache) > _MAX_CACHE_SIZE:
            self.finalize()

    def finalize(self):
        """Process cached counts."""
        for (count, multiplier) in self._cache.iteritems():
            self._update_totals(count, multiplier)
        self._cache.clear()

    def _update_totals(self, count, multiplier=1):
        # Accumulate each sample/library's count into its shared
        # accumulators, then fold the accumulated depths into the
        # destination histograms and reset the accumulators
        for (smlbid, count) in enumerate(count):
            if count:
                for lst in self._map_by_smlbid[smlbid]:
                    lst[0] += count

        for (dst_counts, src_count) in self._totals_src_and_dst:
            if src_count[0]:
                dst_counts[src_count[0]] += multiplier
                src_count[0] = 0

    @classmethod
    def _build_mappings(cls, totals, name, smlbid_to_smlb):
        # Accumulators mapped by sample+library IDs
        totals_by_smlbid = [None] * len(smlbid_to_smlb)
        # Accumulators mapped by the corresponding table keys
        totals_by_table_key = {}

        for (smlbid, (sm_key, lb_key)) in enumerate(smlbid_to_smlb):
            # Each base contributes to five levels of aggregation, from the
            # grand total ('*', '*', '*') down to the per-contig counts
            keys = [('*', '*', '*'),
                    (sm_key, '*', '*'),
                    (sm_key, '*', name),
                    (sm_key, lb_key, '*'),
                    (sm_key, lb_key, name)]

            mappings = cls._nonoverlapping_mappings(keys, totals,
                                                    totals_by_table_key)
            totals_by_smlbid[smlbid] = mappings

        totals_src_and_dst = []
        for (key, dst) in totals_by_table_key.iteritems():
            totals_src_and_dst.append((totals[key], dst))

        return totals_by_smlbid, totals_src_and_dst

    @classmethod
    def _nonoverlapping_mappings(cls, keys, totals, totals_by_table_key):
        """Returns a tuple of accumulators for a given set of table keys. As
        multiple table keys may share the same accumulator (e.g. if there is
        only one sample, then sample "*" and that sample will be identical),
        the tuple of accumulators may contain fewer items than keys."""

        mapping = []
        totals_used = set()
        for key in keys:
            # Check that accumulator is not already included
            totals_id = id(totals[key])
            if totals_id not in totals_used:
                totals_used.add(totals_id)
                accumulator = totals_by_table_key.setdefault(key, [0])
                mapping.append(accumulator)
        return tuple(mapping)
+
+
+##############################################################################
+##############################################################################
+
def calc_max_depth(counts):
    """Returns the depth cut-off that excludes (at least) the 0.5% most
    extreme sites based on read depth, ignoring sites with depth 0;
    returns "NA" when no cut-off can be established."""
    histogram = dict(counts)
    histogram.pop(0, None)

    total = sum(histogram.values())
    if not total:
        return "NA"

    remaining = total
    for (depth, count) in sorted(histogram.items()):
        # Stop when less than the 0.5% most extreme values are included;
        # the max is inclusive, so return the depth just before this one
        if remaining / float(total) < 0.005:
            return depth - 1
        remaining -= count

    return "NA"
+
+
def print_table(handle, args, totals):
    """Writes the depth-histogram table to args.outfile; '-' selects
    STDOUT."""
    lengths = collect_references(args, handle)

    if args.outfile == "-":
        output_handle = sys.stdout
    else:
        output_handle = open(args.outfile, "w")

    try:
        rows = build_table(args.target_name, totals, lengths)
        output_handle.write(_HEADER % datetime.datetime.now().isoformat())
        output_handle.write("\n")
        for line in padded_table(rows):
            output_handle.write(line)
            output_handle.write("\n")
    finally:
        # FIX: 'with output_handle:' closed sys.stdout when writing to
        # STDOUT; guard the close as done by write_table in coverage stats
        if output_handle is not sys.stdout:
            output_handle.close()
+
+
def calculate_depth_pc(counts, length):
    """Yields, for each depth from 1 to _MAX_DEPTH, the fraction of the
    'length' sites covered at that depth or greater, formatted as a string;
    depths above _MAX_DEPTH are binned at _MAX_DEPTH. Assumes sites with
    zero depth are not included in 'counts' -- TODO confirm with callers."""
    final_counts = [0] * (_MAX_DEPTH + 1)
    for (depth, count) in counts.iteritems():
        final_counts[min(_MAX_DEPTH, depth)] += count

    running_total = sum(final_counts)
    total = float(length)
    for count in final_counts[1:]:
        yield "%.4f" % (running_total / total,)
        running_total -= count
+
+
def build_table(name, totals, lengths):
    """Builds the rows of the depth table (header, separators, and data);
    'name' is the target name, 'totals' maps (sample, library, contig)
    tuples to depth histograms, and 'lengths' maps contigs to sizes."""
    header = ["Name", "Sample", "Library", "Contig", "Size", "MaxDepth"]
    for index in xrange(1, _MAX_DEPTH + 1):
        header.append("MD_%03i" % (index,))

    rows = [header]
    last_sm = last_lb = None
    for ((sm_key, lb_key, ct_key), counts) in sorted(totals.items()):
        # Separate samples by two '#' rows ("##" extends with two strings),
        # and libraries within a sample by a single '#' row
        if (sm_key != last_sm) and (last_sm is not None):
            rows.extend("##")
        elif (lb_key != last_lb) and (last_lb is not None):
            rows.append("#")
        last_sm, last_lb = sm_key, lb_key

        if ct_key == "*":
            length = sum(lengths.itervalues())
        else:
            length = lengths[ct_key]

        row = [name, sm_key, lb_key, ct_key, str(length),
               str(calc_max_depth(counts))]
        row.extend(calculate_depth_pc(counts, length))

        rows.append(row)

    return rows
+
+
+##############################################################################
+##############################################################################
+
def build_key_struct(args, handle):
    """Returns a dict mapping each sample name to the set of library names
    observed among the read-groups of 'handle'."""
    structure = collections.defaultdict(set)
    for readgroup in collect_readgroups(args, handle).itervalues():
        lb_key = readgroup["LB"]
        sm_key = readgroup["SM"]
        structure[sm_key].add(lb_key)

    return structure
+
+
def build_new_dicts(totals, dst_sm, dst_lb, references):
    """Registers fresh per-contig depth counters (plus a '*' total) in
    'totals' for the given sample / library pair."""
    for contig in ('*',) + tuple(references):
        totals[(dst_sm, dst_lb, contig)] = collections.defaultdict(int)
+
+
def reuse_dicts(totals, dst_sm, dst_lb, src_sm, src_lb, references):
    """Aliases the counters of (src_sm, src_lb) under (dst_sm, dst_lb);
    the destination keys share (do not copy) the source dictionaries, so
    updates through either key are visible through both."""
    for contig in ('*',) + tuple(references):
        totals[(dst_sm, dst_lb, contig)] = totals[(src_sm, src_lb, contig)]
+
+
def build_totals_dict(args, handle):
    """Builds the dict of depth counters keyed by (sample, library,
    contig), including '*' wildcard keys for per-sample, per-library and
    grand totals. Wildcard keys alias the underlying dict whenever there
    is only a single contig / library / sample, so that those totals are
    accumulated for free."""
    references = tuple(collect_references(args, handle))
    structure = build_key_struct(args, handle)

    totals = {}
    for (sm_key, libraries) in structure.items():
        for lb_key in libraries:
            if len(references) == 1:
                # A single contig: genome-wide ('*') and per-contig
                # counts are necessarily identical, so share one counter.
                shared = collections.defaultdict(int)
                totals[(sm_key, lb_key, references[0])] = shared
                totals[(sm_key, lb_key, '*')] = shared
            else:
                build_new_dicts(totals, sm_key, lb_key, references)

        if len(libraries) == 1:
            # Only one library: per-sample totals equal that library's
            only_lb = next(iter(libraries))
            reuse_dicts(totals, sm_key, '*', sm_key, only_lb, references)
        else:
            build_new_dicts(totals, sm_key, '*', references)

    if len(structure) == 1:
        # Only one sample: grand totals equal that sample's totals
        only_sm = next(iter(structure))
        reuse_dicts(totals, '*', '*', only_sm, '*', references)
    else:
        build_new_dicts(totals, '*', '*', references)

    return totals
+
+
def count_bases(args, counts, record, rg_to_smlbid, template):
    """Adds the aligned bases of 'record' to the per-position counters.

    'counts' is a sequence (deque) of per-position lists with one slot
    per (sample, library) index; it is grown as needed to cover the full
    alignment length of the record. 'template' is the zeroed list copied
    for each new position; 'rg_to_smlbid' maps read-group IDs to slot
    indices, with the None entry used for unknown read-groups.
    """
    # Grow the window so it covers every position this record spans;
    # a while-loop is used instead of 'for _ in xrange(...)' so that the
    # code is Python 2/3 compatible (and a no-op for negative deltas).
    while len(counts) < record.alen:
        counts.append(list(template))

    key = rg_to_smlbid.get(args.get_readgroup_func(record))
    if key is None:
        # Unknown readgroups are treated as missing readgroups
        key = rg_to_smlbid[None]

    index = 0
    for (cigar, count) in record.cigar:
        if cigar in (0, 7, 8):
            # M / = / X: consumes reference positions and counts as depth
            for counter in itertools.islice(counts, index, index + count):
                counter[key] += 1
            index += count
        elif cigar in (2, 3, 6):
            # D / N / P: advance the reference position without counting
            index += count
+
+
def build_rg_to_smlbid_keys(args, handle):
    """Returns a dictionary which maps a readgroup ID to an index value,
    as well as a list containing a tuple (sample, library) corresponding
    to each index. Typically, this list will be shorter than the map of
    read-groups, as multiple read-groups may map to the same sample /
    library pair."""
    rg_to_id = {}
    pair_to_id = {}
    id_to_pair = []
    for (rg_key, readgroup) in collect_readgroups(args, handle).items():
        pair = (readgroup["SM"], readgroup["LB"])
        if pair not in pair_to_id:
            # First occurrence of this sample / library combination
            pair_to_id[pair] = len(pair_to_id)
            id_to_pair.append(pair)

        rg_to_id[rg_key] = pair_to_id[pair]

    return rg_to_id, id_to_pair
+
+
def process_file(handle, args):
    """Accumulates per-position depths for the BAM file in 'handle' and
    prints the resulting depth-histogram table via print_table.

    Returns 0 on success, or 1 if the input BAM turns out to be unsorted.
    """
    # Progress reporting, updated once per processed read
    timer = BAMTimer(handle, step=1000000)

    last_tid = 0
    totals = build_totals_dict(args, handle)
    rg_to_smlbid, smlbid_to_smlb = build_rg_to_smlbid_keys(args, handle)
    # Zeroed per-position counter list; one slot per (sample, library)
    template = [0] * len(smlbid_to_smlb)

    for region in BAMRegionsIter(handle, args.regions):
        if region.name is None:
            # Trailing unmapped reads
            break
        elif not args.regions and (handle.nreferences > args.max_contigs):
            # Too many contigs to report individually; lump into one
            region.name = '<Genome>'

        last_pos = 0
        # Sliding window of per-position counters for the current region
        counts = collections.deque()
        mapping = MappingToTotals(totals, region, smlbid_to_smlb)
        for (position, records) in region:
            # Flush counters for positions before the new column
            mapping.process_counts(counts, last_pos, position)

            for record in records:
                timer.increment(read=record)
                count_bases(args, counts, record, rg_to_smlbid, template)

            # Positions must be non-decreasing within and across contigs
            if (region.tid, position) < (last_tid, last_pos):
                sys.stderr.write("ERROR: Input BAM file is unsorted\n")
                return 1

            last_pos = position
            last_tid = region.tid

        # Process columns in region after last read
        mapping.process_counts(counts, last_pos, float("inf"))
        mapping.finalize()
    timer.finalize()

    if not args.ignore_readgroups:
        # Exclude counts for reads with no read-groups, if none such were seen
        # (for-else: the 'else' runs only when no '<NA>' counts were found)
        for (key, _, _), value in totals.iteritems():
            if key == '<NA>' and value:
                break
        else:
            for key in totals.keys():
                if key[0] == '<NA>':
                    totals.pop(key)

    print_table(handle, args, totals)

    return 0
+
+
def main(argv):
    """Entry point; delegates argument handling and file setup to
    main_wrapper, which invokes process_file on the opened BAM.
    The ".depths" string is presumably the default output file
    extension — confirm in main_wrapper."""
    return main_wrapper(process_file, argv, ".depths")
+
+##############################################################################
+##############################################################################
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/duphist.py b/paleomix/tools/duphist.py
new file mode 100755
index 0000000..42d7dff
--- /dev/null
+++ b/paleomix/tools/duphist.py
@@ -0,0 +1,107 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Tool for building histogram of PCR duplicates to be used with 'preseq'.
+
+This allows estimation of the library complexity, and potential further gains
+from sequencing the library. Unlike the tools included in 'preseq', this tool
+handles collapsed reads.
+
+Preseq is located at http://smithlabresearch.org/software/preseq/
+"""
+import sys
+import random
+import argparse
+import collections
+
+import pysam
+
+import paleomix.common.bamfiles as bamfiles
+
+
def get_template_length(record):
    """Returns the template length of the given record for paired or
    collapsed reads (qname prefixed with "M_"); for other single-ended
    reads, None is returned."""
    if record.is_paired:
        return record.tlen

    if record.qname.startswith("M_"):
        # Collapsed read: the alignment length is the insert size
        return record.alen

    return None
+
+
def process_records(records, counts):
    """Processes a set of records aligned to the same position; duplicates
    are inferred from the records' template lengths (see
    get_template_length). Single-ended reads (template length None) are
    assumed to be a random sample of reads with known insert sizes and
    are distributed randomly across the observed sizes.
    """
    # Histogram of template lengths at this position
    by_length = collections.defaultdict(int)
    for record in records:
        # Count each proper pair exactly once (via read 1)
        if record.is_paired and ((not record.is_proper_pair)
                                 or record.is_read2):
            continue

        by_length[get_template_length(record)] += 1

    if (None in by_length) and len(by_length) > 1:
        unknown = by_length.pop(None)

        # Distribute the unknown-length (SE) reads randomly across the
        # observed insert sizes to approximate a random sampling
        choices = tuple(by_length)
        for _ in range(unknown):
            by_length[random.choice(choices)] += 1

    for count in by_length.values():
        counts[count] += 1
+
+
def parse_args(argv):
    """Parses the command-line for 'paleomix duphist'; the only argument
    is the (sorted) BAM file to process."""
    name = "paleomix duphist"
    parser = argparse.ArgumentParser(
        prog=name,
        usage="%s sorted.bam > out.histogram" % (name,))
    parser.add_argument("bamfile", help="Sorted BAM file.")

    return parser.parse_args(argv)
+
+
def main(argv):
    """Main function; takes a list of arguments equivalent to sys.argv[1:]."""
    args = parse_args(argv)

    # Default filters, except that PCR duplicates are retained, as these
    # are exactly what is being counted here
    flags = bamfiles.EXCLUDED_FLAGS & ~bamfiles.BAM_PCR_DUPLICATE

    counts = collections.defaultdict(int)
    with pysam.Samfile(args.bamfile) as handle:
        for region in bamfiles.BAMRegionsIter(handle, exclude_flags=flags):
            for (_, records) in region:
                process_records(records, counts)

    # One line per duplication level: "<count>\t<occurrences>"
    for (key, count) in sorted(counts.items()):
        sys.stdout.write("%i\t%i\n" % (key, count))
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/ena.py b/paleomix/tools/ena.py
new file mode 100755
index 0000000..5bad8e8
--- /dev/null
+++ b/paleomix/tools/ena.py
@@ -0,0 +1,499 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""
+This is a tool to ease the process of submitting data previously processed.
+using the BAM Pipeline to ENA. This accomplished in roughly 5 steps:
+
+1. Create the project on ENA and create each of the samples that are to be
+ uploaded for that project.
+
+2. Run 'paleomix ena build-table' on one or more makefiles; this generates a
+ basic template for the lanes to be uploaded. This table is edited to
+ reflect the desired layout of uploaded data (except the MAKEFILE_PATH
+ column, which MUST NOT be edited).
+
+3. Run 'paleomix ena collect-files' on the table built in step 2; this collects
+ all FASTQ files into a single folder, merging lanes into one (for SE) or two
+ (for PE) files and calculates MD5 sums. The folder containing the merged
+ FASTQ files is uploaded to ENA, using the same path (e.g. upload to
+ /ena_submission/fastq/ by default).
+
+4. Run 'paleomix ena finalize-table' on the table, with the 'se' and 'pe'
+ options. This is required since the two types of tables differ in layout.
+
+5. Uploaded the resulting 'se' and 'pe' tables to ENA.
+"""
+import argparse
+import glob
+import os
+import string
+import sys
+
+import paleomix.yaml
+
+import paleomix.ui as ui
+
+import paleomix.tools.factory as factory
+
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+
+from paleomix.atomiccmd.sets import \
+ ParallelCmds
+
+from paleomix.node import \
+ CommandNode
+
+from paleomix.pipeline import \
+ Pypeline
+
+import paleomix.common.fileutils as fileutils
+
+
+###############################################################################
+
class ENAError(RuntimeError):
    """Fatal, user-facing error during ENA submission preparation;
    caught in main() and reported without a traceback."""
    pass
+
+
+###############################################################################
+
class CatFilesNode(CommandNode):
    """Runs the equivalent of 'paleomix cat $@ | gzip > ${DST}'."""

    def __init__(self, input_files, destination, dependencies=()):
        # 'paleomix cat' concatenates the input files to STDOUT
        # (presumably decompressing gz/bz2 inputs, given the executables
        # configured in factory._build_cat_command — confirm)
        cat_cmd = factory.new("cat")
        cat_cmd.add_multiple_values(input_files)
        cat_cmd.set_kwargs(OUT_STDOUT=AtomicCmd.PIPE)
        cat_cmd = cat_cmd.finalize()

        # Compress the merged stream and write it to 'destination'
        zip_cmd = AtomicCmd("gzip",
                            IN_STDIN=cat_cmd,
                            OUT_STDOUT=destination)

        description = "<Cat %s -> %s>" \
            % (fileutils.describe_files(input_files), destination)

        # The two commands form a pipe and must run in parallel
        CommandNode.__init__(self,
                             description=description,
                             command=ParallelCmds((cat_cmd, zip_cmd)),
                             dependencies=dependencies)
+
+
class MD5FilesNode(CommandNode):
    """Runs the equivalent of 'md5sum $1 > $2'."""

    def __init__(self, input_file, destination, dependencies=()):
        # The checksum is captured via stdout redirection to 'destination'
        command = AtomicCmd(("md5sum", "%(IN_FILE)s"),
                            IN_FILE=input_file,
                            OUT_STDOUT=destination)

        CommandNode.__init__(
            self,
            description="<MD5Sum %s -> %s>" % (input_file, destination),
            command=command,
            dependencies=dependencies)
+
+
+###############################################################################
+
+def _is_paired_end(template):
+ """Returns true if a path from a makefile is for paired-end data."""
+ return (template.format(Pair=1) != template)
+
+
+def _sorted_glob(tmpl):
+ return list(sorted(glob.glob(tmpl))) or [tmpl]
+
+
def _collect_files(template):
    """Returns a dict mapping a mate postfix to the sorted list of files
    for that mate: {"": [...]} for single-end data, and {"_R1": [...],
    "_R2": [...]} for paired-end data."""
    if not _is_paired_end(template):
        return {"": _sorted_glob(template)}

    files = {"_R1": _sorted_glob(template.format(Pair=1)),
             "_R2": _sorted_glob(template.format(Pair=2))}

    # Mates must come in matching numbers of files
    assert len(files["_R1"]) == len(files["_R2"]), template

    return files
+
+
+def _build_filename(args, row, extensions, postfix=""):
+ return os.path.join(args.destination,
+ extensions.split(".")[0],
+ row["MAKEFILE_TARGET"],
+ "%s__%s__%s%s.%s"
+ % (row["MAKEFILE_SAMPLE"],
+ row["MAKEFILE_LIBRARY"],
+ row["MAKEFILE_LANE"],
+ postfix,
+ extensions))
+
+
+###############################################################################
+
+
def _main_collect_files(args):
    """Main function for collecting FASTQ files and calculating MD5 sums.

    Builds and runs a small pipeline of cat/gzip and md5sum nodes; returns
    0 on success and 1 on failure."""
    # Keep temporary files together with the final output
    args.temp_root = os.path.join(args.destination, "temp")

    nodes = _build_collect_nodes(args)
    # NOTE(review): _build_collect_nodes appears to always return a list
    # (errors are raised as ENAError), so this check looks defensive/dead
    # — confirm before relying on it.
    if nodes is None:
        return 1

    pipeline = Pypeline(args)
    pipeline.add_nodes(nodes)

    if args.list_output_files:
        pipeline.print_output_files()
    elif args.list_executables:
        pipeline.print_required_executables()
    elif not pipeline.run(max_threads=args.max_threads,
                          dry_run=args.dry_run):
        # One or more tasks failed
        return 1

    return 0
+
+
def _build_collect_nodes(args):
    """Builds a set of nodes for merging and gzip-compressing the reads
    from each lane, as well as calculating the MD5 sums for each
    resulting file. By default, the resulting files are located in the
    folders
      - ena_submission/fastq
      - ena_submission/md5
    """
    nodes = []
    for row in _read_table(args.table):
        template = row["MAKEFILE_PATH"]

        for postfix, filenames in sorted(_collect_files(template).items()):
            fastq_path = _build_filename(args, row, "fastq.gz", postfix)
            md5_path = _build_filename(args, row, "md5", postfix)

            # Fail early on missing or invalid input files
            for filename in filenames:
                if not os.path.exists(filename):
                    _build_collect_nodes_error(row, "File not found",
                                               filename)
                elif not os.path.isfile(filename):
                    _build_collect_nodes_error(row, "Not a file", filename)

            merge_node = CatFilesNode(input_files=filenames,
                                      destination=fastq_path)

            # The MD5 sum is calculated from the merged FASTQ file
            nodes.append(MD5FilesNode(input_file=fastq_path,
                                      destination=md5_path,
                                      dependencies=(merge_node,)))

    return nodes
+
+
def _build_collect_nodes_error(row, msg, filename):
    """Raises an ENAError identifying the problematic lane ('row') and
    input file ('filename'), prefixed with the reason in 'msg'."""
    details = dict(row)
    details["PATH"] = filename
    details["MSG"] = msg

    raise ENAError(("%(MSG)s; "
                    "target = %(MAKEFILE_TARGET)s; "
                    "sample = %(MAKEFILE_SAMPLE)s; "
                    "library = %(MAKEFILE_LIBRARY)s; "
                    "lane = %(MAKEFILE_LANE)s; "
                    "path = %(PATH)s") % details)
+
+
+###############################################################################
+
def _main_build_table(args):
    """Prints a template submission table (tab-separated, one row per
    lane, sorted) for the lanes found in the supplied makefiles."""
    keys = ("MAKEFILE_TARGET", "MAKEFILE_SAMPLE", "MAKEFILE_LIBRARY",
            "MAKEFILE_LANE", "sample_alias", "instrument_model",
            "library_source", "library_selection",
            "library_strategy", "design_description",
            "library_construction_protocol", "insert_size",
            "MAKEFILE_PATH")

    table = [[row[key] for key in keys]
             for row in _build_table_rows(args)]

    sys.stdout.write("\t".join(keys) + "\n")
    for row in sorted(table):
        sys.stdout.write("\t".join(row) + "\n")
+
+
def _build_table_rows(args):
    """Returns one row (dict) per lane across all supplied makefiles,
    pre-filled with default values for the user-editable ENA columns.
    Lanes containing pre-processed data are skipped with a warning, as
    the raw FASTQ files cannot be located for these."""
    defaults = {"sample_alias": "*",
                "instrument_model": "*",
                "library_source": "GENOMIC",
                "library_selection": "RANDOM",
                "library_strategy": "WGS",
                "design_description": "",
                "library_construction_protocol": "",
                "insert_size": "0"}

    rows = []
    for filename in args.makefile:
        for (target, sample, library, lane, path) in _parse_makefile(filename):
            if isinstance(path, dict):
                # Pre-processed lanes describe trimmed reads in a dict
                ui.print_err("WARNING: Found pre-processed data "
                             "at %s:%s:%s:%s; cannot collect raw "
                             "FASTQ data."
                             % (target, sample, library, lane))
                continue

            row = dict(defaults)
            row["MAKEFILE_TARGET"] = target
            row["MAKEFILE_SAMPLE"] = sample
            row["MAKEFILE_LIBRARY"] = library
            row["MAKEFILE_LANE"] = lane
            row["MAKEFILE_PATH"] = path
            rows.append(row)

    return rows
+
+
def _parse_makefile(filename):
    """Yields (target, sample, library, lane, path) tuples for every lane
    in the given BAM Pipeline makefile, in sorted order."""
    with open(filename) as handle:
        mkfile = paleomix.yaml.safe_load(handle.read())

        # 'Options' / 'Prefixes' keys may occur at several levels of the
        # makefile, but do not describe lanes and are therefore dropped.
        mkfile.pop("Options", None)
        mkfile.pop("Prefixes", None)

        for target, samples in sorted(mkfile.items()):
            samples.pop("Options", None)

            for sample, libraries in sorted(samples.items()):
                libraries.pop("Options", None)

                for library, lanes in sorted(libraries.items()):
                    lanes.pop("Options", None)

                    for lane, path in sorted(lanes.items()):
                        yield (target, sample, library, lane, path)
+
+
+###############################################################################
+
def _main_finalize_table(args):
    """Main function for printing the final SE or PE submission table.

    Fills in file names / MD5 sums for rows matching the requested mode
    ('se' or 'pe'), prints the table to STDOUT, and warns if rows for the
    other mode were present but omitted. Returns 0."""
    table = _read_table(args.table)
    # Fill in file name / MD5 columns for rows matching the chosen mode
    for row in table:
        if _is_paired_end(row["MAKEFILE_PATH"]):
            if args.mode == "pe":
                row['forward_file_name'] = _build_filename(args, row,
                                                           "fastq.gz", "_R1")
                row['forward_file_md5'] = read_md5(args, row, "_R1")
                row['reverse_file_name'] = _build_filename(args, row,
                                                           "fastq.gz", "_R2")
                row['reverse_file_md5'] = read_md5(args, row, "_R2")
        elif args.mode == "se":
            row["file_name"] = _build_filename(args, row, "fastq.gz")
            row["file_md5"] = read_md5(args, row)

        # Default the library name to the one used in the makefile
        row.setdefault("library_name", row["MAKEFILE_LIBRARY"])

    keys = ("sample_alias", "instrument_model", "library_name",
            "library_source", "library_selection", "library_strategy",
            "design_description", "library_construction_protocol")

    # The SE and PE layouts differ only in the trailing file columns
    if args.mode == "pe":
        keys += ('insert_size',
                 'forward_file_name', 'forward_file_md5',
                 'reverse_file_name', 'reverse_file_md5')
    else:
        keys += ('file_name', 'file_md5')

    results = []
    do_warn = False
    for row in table:
        # Only rows matching the requested mode are included
        if _is_paired_end(row["MAKEFILE_PATH"]) == (args.mode == "pe"):
            results.append([row[key] for key in keys])
        else:
            do_warn = True

    print "\t".join(keys)
    for row in sorted(results):
        print "\t".join(row)

    if do_warn:
        current, other = "single-end", "paired-end"
        if args.mode == "pe":
            other, current = current, other

        ui.print_warn("WARNING: Table for {current} has been printed, but\n"
                      "         the table also contains data for {other}\n"
                      "         reads. Remember to construct a table for\n"
                      "         {other} reads as well!"
                      .format(current=current, other=other))

    return 0
+
+
def read_md5(args, row, postfix=""):
    """Reads the md5-sum generated for a given lane; arguments correspond
    to '_build_filename', excepting that the extension is already defined.

    Raises ENAError if the file does not look like 'md5sum' output."""
    filename = _build_filename(args, row, "md5", postfix)

    # All failure modes share the same message prefix / remedy
    prefix = "MD5-sum file (%r) does not match expected format; " \
        % (filename,)
    suffix = ("please remove this file and re-run the "
              "'collect-files' command!")

    with open(filename) as handle:
        lines = handle.readlines()

    if len(lines) != 1:
        raise ENAError(prefix
                       + "exactly one line expected, but found %i line(s); "
                       % (len(lines),)
                       + suffix)

    fields = lines[0].split()
    if len(fields) != 2:
        raise ENAError(prefix
                       + "exactly two columns expected, but found %i; "
                       % (len(fields),)
                       + suffix)

    digest = fields[0]
    if len(digest) != 32 or (set(digest) - set(string.hexdigits)):
        raise ENAError(prefix
                       + "32 digit hexadecimal expected, but found %r; "
                       % (digest,)
                       + suffix)

    return digest
+
+
+def _read_table(filename):
+ """Reads table generated using 'build-table'."""
+ rows = []
+ with open(filename) as handle:
+ header = None
+ for line in handle:
+ line = line.rstrip("\r\n")
+ if line.startswith("#"):
+ continue
+
+ fields = line.split("\t")
+ if header is None:
+ header = fields
+ elif fields and line.strip():
+ if len(fields) == len(header):
+ rows.append(dict(zip(header, fields)))
+ else:
+ assert False
+
+ return rows
+
+
+###############################################################################
+
def parse_args(argv):
    """Parses the result of sys.argv[1:] or equivalent.

    The resulting object contains a 'function' value, corresponding to the
    command supplied by the user. This function should simply be called with
    the parse_args result as its only parameter.
    """

    description = "This is a tool to ease the process of submitting data " \
        "previously processed using the BAM Pipeline to ENA. Please see the " \
        "documentation for instructions on how to use."

    parser = argparse.ArgumentParser(prog="paleomix ena",
                                     description=description)

    # Each sub-command selects its handler via set_defaults(function=...)
    subparsers = parser.add_subparsers()

    ###########################################################################
    # Parser for the 'build-table' command
    parser_build_table = subparsers.add_parser('build-table')
    parser_build_table.add_argument('makefile', nargs="+",
                                    help="One or more BAM Pipeline makefiles.")
    parser_build_table.set_defaults(function=_main_build_table)

    ###########################################################################
    # Parser for the 'collect-files' command
    parser_collect = subparsers.add_parser('collect-files')
    parser_collect.add_argument('table',
                                help="Table generated using the 'build-table' "
                                     "command.")
    parser_collect.add_argument('--destination', default="ena_submission",
                                metavar="FOLDER",
                                help="Destination folder for temporary files, "
                                     "merged FASTQ files, and MD5-sum files "
                                     "[default: %(default)s].")
    parser_collect.add_argument('--max-threads', default=2, type=int,
                                metavar="N",
                                help="Maximum number of simultanous steps "
                                     "to execute [default: %(default)s].")
    parser_collect.add_argument('--list-output-files', default=False,
                                action="store_true",
                                help="List the (status of) files (to be) "
                                     "generated by this command.")
    parser_collect.add_argument('--list-executables', default=False,
                                action="store_true",
                                help="List all executables required by the "
                                     "pipeline, with version requirements "
                                     "(if any).")
    parser_collect.add_argument("--dry-run",
                                action="store_true", default=False,
                                help="If passed, only a dry-run in performed, "
                                     "the dependency tree is printed, and no "
                                     "tasks are executed.")
    parser_collect.set_defaults(function=_main_collect_files)

    ###########################################################################
    # Parser for the 'finalize-table' command
    parser_finalize_table = subparsers.add_parser('finalize-table')
    parser_finalize_table.add_argument('mode', choices=("se", "pe"),
                                       help="Output table containing either "
                                            "the single-end ('se') or paired"
                                            "-end ('pe') reads in the input "
                                            "table.")
    parser_finalize_table.add_argument('table',
                                       help="Table generated using the "
                                            "'build-table' command.")
    parser_finalize_table.add_argument('--destination',
                                       default="ena_submission",
                                       metavar="FOLDER",
                                       help="Destination folder for temporary "
                                            "files, merged FASTQ files, and "
                                            "MD5-sum files "
                                            "[default: %(default)s/].")
    parser_finalize_table.set_defaults(function=_main_finalize_table)

    return parser.parse_args(argv)
+
+
def main(argv):
    """Main function; takes a list of arguments but excluding sys.argv[0]."""
    args = parse_args(argv)

    try:
        return args.function(args)
    except ENAError as error:
        # Expected, user-facing failures are reported without a traceback
        ui.print_err("FATAL ERROR:\n %s" % (error,))
        return 1
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/factory.py b/paleomix/tools/factory.py
new file mode 100644
index 0000000..b5df772
--- /dev/null
+++ b/paleomix/tools/factory.py
@@ -0,0 +1,67 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+"""Factory for AtomicCmdBuilders for the various PALEOMIX commands.
+
+Ensures that the version called corresponds to the running version, in case
+multiple versions are present in the users' PATH, or that the current version
+is not available from the users' PATH.
+"""
+import sys
+
+import paleomix.main
+
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder
+
+
def new(command, *args, **kwargs):
    """Returns AtomicCmdBuilder setup to call the tools accessible through
    the 'paleomix' command-line tool. This builder adds executable /
    version checks for the specified command, but does not add any
    arguments. Thus, calling new with the argument "cat" produces the
    equivalent of ["paleomix", "cat"]."""
    try:
        # A few commands require extra setup (see _SPECIAL_COMMANDS)
        special_builder = _SPECIAL_COMMANDS[command]
    except KeyError:
        return _build_paleomix_command(command, *args, **kwargs)

    return special_builder(*args, **kwargs)
+
+
def _build_cat_command():
    """Returns an AtomicCmdBuilder for the 'paleomix cat' command."""
    # Executables required by 'cat' to handle (compressed) input files
    executables = {"EXEC_GZIP": "gzip",
                   "EXEC_BZIP": "bzip2",
                   "EXEC_CAT": "cat"}

    return _build_paleomix_command("cat", **executables)
+
+
def _build_paleomix_command(*args, **kwargs):
    """Returns an AtomicCmdBuilder for a regular 'paleomix ...' command,
    invoked via the currently running interpreter and the installed
    paleomix.main script, so the running version is always used."""
    script = paleomix.main.__file__
    command = (sys.executable, script) + args

    return AtomicCmdBuilder(command,
                            AUX_PALEOMIX=script,
                            **kwargs)
+
+
# Commands that need extra setup beyond the plain 'paleomix <command>'
# invocation built by _build_paleomix_command; maps command name to a
# zero-or-more-argument builder function.
_SPECIAL_COMMANDS = {
    "cat": _build_cat_command,
}
diff --git a/paleomix/tools/genotype.py b/paleomix/tools/genotype.py
new file mode 100755
index 0000000..b62af67
--- /dev/null
+++ b/paleomix/tools/genotype.py
@@ -0,0 +1,524 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Wrapper around "samtools mpileup | bcftools view", to improve performance
+when genotyping sparse regions (e.g. sets of exons / RNAs / or similar), and
+allow transparent multi-threading. Alternatively, only "samtools mpileup" may
+be called, in order to generate pileups for a set of regions.
+
+There are 3 main motivations for this wrapper:
+ 1. Current versions of SAMTools read the full contents of the input BAM,
+ when a BED file of regions is specified, even if these regions cover
+ only a fraction of sites. This can be somewhat mitigated by ALSO
+ specifying a region using -r, which fetches just that region, but this
+ does not scale well for thousands of individual regions.
+ 2. It provides transparent parallelization, allowing any set of bed regions
+ to be split and processed in parallel.
+"""
+import os
+import sys
+import shutil
+import signal
+import argparse
+import traceback
+import multiprocessing
+
+import pysam
+
+from paleomix.common.bedtools import \
+ read_bed_file, \
+ sort_bed_by_bamfile
+
+import paleomix.tools.factory as factory
+import paleomix.common.procs as processes
+
+
class BatchError(RuntimeError):
    """Error raised when setting up or running a genotyping / pileup
    batch fails."""
    pass
+
+
# Size of smallest block in (linear) BAM index (= 2 ** 14)
_BAM_BLOCK_SIZE = 16384
+
+
+###############################################################################
+###############################################################################
+# CLI functions
+
def build_call(call, args, positional, new_args):
    """Builds a command-line call as a list of strings.

    'call' is the base command (e.g. ("samtools", "mpileup")); 'args' is
    a dict of default options, where a None value means the option takes
    no value; 'new_args' are user-supplied overrides, each either "key"
    or "key=value"; and 'positional' arguments are appended last.
    Options are emitted in sorted order.
    """
    call = list(call)
    args = dict(args)

    # User-supplied arguments override the defaults
    for new_arg in new_args:
        key, value = new_arg, None
        if "=" in new_arg:
            key, value = new_arg.split("=", 1)
        args[key] = value

    # 'items' rather than 'iteritems', for Python 2/3 compatibility
    for (key, value) in sorted(args.items()):
        call.append(key)
        if value is not None:
            call.append(value)

    call.extend(positional)

    return call
+
+
+###############################################################################
+###############################################################################
+# BAM filtering mode
+
def filter_bam(bamfile, bedfile):
    """Streams reads from 'bamfile' overlapping the regions in 'bedfile'
    to STDOUT as uncompressed BAM ("-" / "wbu" mode); used as a fast
    pre-filter ahead of 'samtools mpileup'. Always returns 0.
    """
    with pysam.Samfile(bamfile) as bam_handle_in:
        regions = collect_regions(bedfile, bam_handle_in)
        # Regions are consumed via pop() from the tail, so reverse them
        # to process in their original (sorted) order
        regions.reverse()

        with pysam.Samfile("-", "wbu",
                           template=bam_handle_in) as bam_handle_out:
            while regions:
                region_aend = 0
                contig, start, end = regions[-1]
                # Fetch from 'start' onwards; nearby regions on the same
                # contig are served by the same fetch (see below)
                for record in bam_handle_in.fetch(contig, start):
                    current_aend = record.aend
                    region_aend = max(region_aend, current_aend)
                    if record.pos > end:
                        # Passed the current region; advance to the next
                        last_contig, _, _ = regions.pop()
                        if not regions:
                            break

                        contig, start, end = regions[-1]
                        # Restart with a new fetch if the next region is
                        # further away than one BAM index block, or lies
                        # on a different contig
                        if (region_aend + _BAM_BLOCK_SIZE < start) \
                                or (contig != last_contig):
                            break

                    if current_aend >= start:
                        bam_handle_out.write(record)
                else:  # Reached the end of this contig
                    # No more reads here; drop remaining regions on it
                    while regions and (regions[-1][0] == contig):
                        regions.pop()

    return 0
+
+
+###############################################################################
+###############################################################################
+# Common functions
+
def cleanup_batch(setup):
    """Closes open handles, terminates still-running child processes, and
    removes the temporary files recorded in the given batch 'setup' dict
    (see setup_basic_batch)."""
    sys.stderr.write("Cleaning up batch ...\n")
    for handle in setup["handles"].values():
        handle.close()

    for proc in setup["procs"].values():
        # poll() returns None while the child is still running
        if proc.poll() is None:
            proc.terminate()
            proc.wait()

    for filename in setup["temp_files"].values():
        sys.stderr.write("Removing temporary file %r\n" % (filename,))
        os.remove(filename)
+
+
def write_bed_file(prefix, regions):
    """Writes the (contig, start, end) tuples in 'regions' to the file
    '<prefix>.bed' and returns the resulting path."""
    fpath = prefix + ".bed"
    with open(fpath, "w") as bed_handle:
        bed_handle.writelines("%s\t%i\t%i\n" % region
                              for region in regions)
        bed_handle.flush()
    return fpath
+
+
def setup_basic_batch(args, regions, prefix, func, first_batch=True):
    """Sets up the common process pipeline for one batch of regions:

        'paleomix genotype --filter-only' -> func(...) -> [grep -v '^#']
            -> bgzip -> <prefix>

    'func' is called with the partially built 'setup' dict, must start
    the batch-specific command(s) reading from the filter process, and
    must return the stdout handle of its final command. For every batch
    but the first, lines starting with '#' are stripped, so that
    concatenated batches contain only a single header.

    Returns a dict with 'files', 'temp_files', 'procs' and 'handles';
    on any error, everything started so far is cleaned up and the
    exception re-raised.
    """
    setup = {"files": {},
             "temp_files": {},
             "procs": {},
             "handles": {}}

    try:
        # The regions for this batch are passed via a temporary BED file
        setup["files"]["bed"] = write_bed_file(prefix, regions)
        setup["temp_files"]["bed"] = setup["files"]["bed"]

        # Pre-filter the BAM down to reads overlapping the batch regions
        filter_builder = factory.new("genotype")
        filter_builder.set_option("--filter-only")
        filter_builder.set_option("--bedfile", setup["files"]["bed"])
        filter_builder.add_option(args.bamfile)
        filter_builder.add_option(args.destination)

        setup["procs"]["filter"] \
            = processes.open_proc(filter_builder.call,
                                  stdout=processes.PIPE)

        call_stdout = func(setup)
        if not first_batch:
            # Strip header lines ('#...') from all but the first batch
            setup["procs"]["grep"] = processes.open_proc(('grep', '-v', '^#'),
                                                         stdin=call_stdout,
                                                         stdout=processes.PIPE)
            call_stdout = setup["procs"]["grep"].stdout

        setup["handles"]["outfile"] = open(prefix, "w")
        # Compress the final output with bgzip into the 'prefix' file
        zip_proc = processes.open_proc(["bgzip"],
                                       stdin=call_stdout,
                                       stdout=setup["handles"]["outfile"])

        setup["procs"]["gzip"] = zip_proc

        return setup
    except:
        sys.stderr.write(traceback.format_exc() + "\n")
        cleanup_batch(setup)
        raise
+
+
+###############################################################################
+###############################################################################
+# Pileup batch generation
+
def setup_mpileup_batch(args, regions, prefix, first_batch=True):
    """Sets up a batch running 'samtools mpileup' alone (text pileup output);
    see 'setup_basic_batch' for the overall pipeline layout."""
    def _start_mpileup(setup):
        # Restrict the pileup to the regions covered by this batch
        call = build_call(call=("samtools", "mpileup"),
                          args={"-l": setup["files"]["bed"]},
                          new_args=args.mpileup_argument,
                          positional=("-",))

        sys.stderr.write("Running 'samtools mpileup': %s\n" % (" ".join(call)))
        setup["procs"]["mpileup"] = \
            processes.open_proc(call,
                                stdin=setup["procs"]["filter"].stdout,
                                stdout=processes.PIPE)

        return setup["procs"]["mpileup"].stdout

    return setup_basic_batch(args, regions, prefix, _start_mpileup,
                             first_batch=first_batch)
+
+
+###############################################################################
+###############################################################################
+# Genotyping batch generation
+
def setup_genotyping_batch(args, regions, prefix, first_batch=True):
    """Sets up a batch running 'samtools mpileup | bcftools view' to produce
    genotype calls (VCF); see 'setup_basic_batch' for the pipeline layout.
    """
    def _create_genotyping_proc(setup):
        # -u requests uncompressed BCF, suitable for piping into bcftools
        mpileup_args = {"-u": None,
                        "-l": setup["files"]["bed"]}
        mpileup_call = build_call(call=("samtools", "mpileup"),
                                  args=mpileup_args,
                                  new_args=args.mpileup_argument,
                                  positional=("-",))

        sys.stderr.write("Running 'samtools mpileup': %s\n"
                         % (" ".join(mpileup_call)))

        procs = setup["procs"]
        procs["mpileup"] \
            = processes.open_proc(mpileup_call,
                                  stdin=procs["filter"].stdout,
                                  stdout=processes.PIPE)

        bcftools_call = build_call(call=("bcftools", "view"),
                                   args={},
                                   new_args=args.bcftools_argument,
                                   positional=("-",))

        # BUGFIX: this message previously claimed "bcftools call", but the
        # command actually invoked is 'bcftools view' (see above).
        sys.stderr.write("Running 'bcftools view': %s\n"
                         % (" ".join(bcftools_call)))

        procs["bcftools"] \
            = processes.open_proc(bcftools_call,
                                  stdin=procs["mpileup"].stdout,
                                  stdout=processes.PIPE)

        return procs["bcftools"].stdout

    return setup_basic_batch(args, regions, prefix, _create_genotyping_proc,
                             first_batch=first_batch)
+
+
+###############################################################################
+###############################################################################
+
def setup_batch(args, regions, filename, first_batch):
    """Setup a batch writing its results to 'filename'; a pileup-only batch
    if 'args.pileup_only' is set, and a full genotyping batch otherwise.
    """
    batch_func = setup_mpileup_batch if args.pileup_only \
        else setup_genotyping_batch

    return batch_func(args, regions, filename, first_batch)
+
+
+def run_batch((args, regions, filename, first_batch)):
+ setup = setup_batch(args, regions, filename, first_batch)
+ try:
+ if any(processes.join_procs(setup["procs"].values())):
+ return None
+
+ return filename
+ except:
+ # Re-wrap exception with full-traceback; otherwise this information
+ # is lost when the exception is retrieved in the main process.
+ raise BatchError(traceback.format_exc())
+ finally:
+ cleanup_batch(setup)
+
+
+###############################################################################
+###############################################################################
+
def init_worker_thread():
    """Init function for subprocesses created by multiprocessing.Pool: Ensures
    that KeyboardInterrupts only occur in the main process, allowing us to do
    proper cleanup.
    """
    # Workers inherit this disposition; the parent keeps the default handler
    signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+
+###############################################################################
+###############################################################################
+
def merge_bed_regions(regions):
    """Collapses overlapping records in 'regions', a sequence of BED records
    assumed to be sorted by contig and coordinates; returns a list of
    (contig, start, end) tuples in which overlapping records are merged."""
    merged = []
    open_contig = open_start = open_end = None
    for record in regions:
        extends_open = (record.contig == open_contig) \
            and (record.start <= open_end)

        if extends_open:
            # Overlap (or containment): grow the currently open region
            open_start = min(open_start or 0, record.start)
            open_end = max(open_end, record.end)
        else:
            if open_contig is not None:
                merged.append((open_contig, open_start, open_end))
            open_contig = record.contig
            open_start = record.start
            open_end = record.end

    if open_contig is not None:
        merged.append((open_contig, open_start, open_end))

    return merged
+
+
def create_batches(args, regions):
    """Yields (args, regions, filename, first) tuples suitable for the
    'run_batch' function, splitting the total set of 'regions' into roughly
    'args.nbatches' equally sized portions. The first batch is written
    directly to 'args.destination'; later batches get numbered temporary
    filenames derived from it."""
    name_template = "{0}.batch_%03i".format(args.destination)

    def _batch_filename(index):
        """Destination filename for batch number 'index'."""
        return name_template % (index,) if index else args.destination

    span_total = sum(end - start for (_, start, end) in regions)
    # '+ 5' keeps the size positive and absorbs rounding, ensuring that at
    # most args.nbatches batches are generated
    span_per_batch = span_total // args.nbatches + 5

    index = 0
    pending = []
    pending_size = 0
    for (contig, start, end) in regions:
        # Regions larger than the remaining batch capacity are split up
        while pending_size + (end - start) > span_per_batch:
            split_at = start + (span_per_batch - pending_size)
            pending.append((contig, start, split_at))
            start = split_at
            yield args, pending, _batch_filename(index), not index
            pending = []
            pending_size = 0
            index += 1

        pending.append((contig, start, end))
        pending_size += end - start

    if pending:
        yield args, pending, _batch_filename(index), not index
+
+
def merge_batch_results(filenames_iter):
    """Merges the output files of completed batches into the first filename
    yielded by 'filenames_iter' (a multiprocessing.imap iterator of bgzipped
    VCF or mpileup filenames). Returns True on success -- including the
    trivial case where no batches were produced -- and False if any batch
    failed (signalled by a None filename; see 'run_batch').
    """
    while True:
        try:
            # A timeout allows interruption by the user, which is not the
            # case otherwise. The value is arbitrary.
            target_filename = filenames_iter.next(60)
            # None signals error in subprocess; see 'run_batch'
            if target_filename is None:
                return False

            sys.stderr.write("Merging into file: %r\n" % (target_filename,))
            break
        except multiprocessing.TimeoutError:
            pass
        except StopIteration:
            # No batches at all; trivially successful. BUGFIX: previously
            # a bare 'return' (None), which callers treated as failure.
            return True

    with open(target_filename, "r+") as target_handle:
        while True:
            try:
                filename = filenames_iter.next(60)
                # BUGFIX: a failed batch (None) was previously passed to
                # open(), crashing with a TypeError instead of reporting
                # the failure.
                if filename is None:
                    return False

                # BUGFIX: the progress message lacked a trailing newline
                sys.stderr.write(" - Processing batch: %r\n" % (filename,))

                # BGZip is terminated by 28b empty block (cf. ref)
                # While the standard implies that these should be ignored
                # if not actually at the end of the file, the tabix tool
                # stops processing at the first such block it encounters;
                # therefore each batch is appended on top of the target's
                # trailing EOF block.
                target_handle.seek(-28, 2)
                with open(filename) as input_handle:
                    shutil.copyfileobj(input_handle, target_handle)
                os.remove(filename)
            except multiprocessing.TimeoutError:
                pass
            except StopIteration:
                break

    return True
+
+
def collect_regions(bedfile, bam_input_handle):
    """Returns the regions to be genotyped / pileup'd as a list of
    (contig, start, end) tuples, where start is zero-based and end is open.
    If 'bedfile' is given, its merged records are used; otherwise every
    reference sequence in the BAM is included in full."""
    if bedfile is None:
        # No BED file; process each contig in the BAM from end to end
        return [(name, 0, length)
                for (name, length) in zip(bam_input_handle.references,
                                          bam_input_handle.lengths)]

    regions = list(read_bed_file(bedfile))
    sort_bed_by_bamfile(bam_input_handle, regions)
    return merge_bed_regions(regions)
+
+
def process_batches(args, batches):
    """Runs the given batches in a worker pool and merges the resulting
    output files if more than one batch is included; returns 0 on success
    and 1 if any batch failed."""
    worker_count = min(args.nbatches, len(batches))
    pool = multiprocessing.Pool(worker_count, init_worker_thread)

    try:
        filenames = pool.imap(run_batch, batches, 1)
        if not merge_batch_results(filenames):
            pool.terminate()
            pool.join()
            return 1

        pool.close()
        pool.join()
        return 0
    except:
        # Includes KeyboardInterrupt; stop the workers before re-raising
        pool.terminate()
        pool.join()
        raise
+
+
def create_empty_bgz(destination):
    """Writes an empty BGZip file to the given destination; this file
    contains a single, empty BGZip block (28 bytes), which doubles as the
    BGZF end-of-file marker.

    BUGFIX: the file is opened in binary mode ('wb') and written as bytes;
    it was previously opened in text mode, which corrupts binary data on
    platforms with newline translation and is incorrect under Python 3.
    """
    with open(destination, "wb") as output:
        # Empty BGZip block (BGZF EOF marker)
        output.write(b"\x1f\x8b\x08\x04\x00\x00\x00\x00\x00\xff\x06\x00\x42")
        output.write(b"\x43\x02\x00\x1b\x00\x03\x00\x00\x00\x00\x00\x00\x00")
        output.write(b"\x00\x00")
+
+
def parse_args(argv):
    """Parses command-line arguments for 'paleomix genotype' and returns an
    argparse.Namespace. The hidden --filter-only option is used internally,
    when the tool re-invokes itself to filter the input BAM."""
    prog = "paleomix genotype"
    arg_parser = argparse.ArgumentParser(
        prog=prog,
        usage="%s [options] sorted.bam out.vcf.bgz" % (prog,))

    # Positional arguments
    arg_parser.add_argument("bamfile", metavar='INPUT',
                            help="Sorted and indexed BAM file.")
    arg_parser.add_argument("destination", metavar='OUTPUT',
                            help="BGZip compressed VCF or pileup. Also used "
                                 "as prefix for temporary files.")

    # Options controlling what is genotyped, and how
    arg_parser.add_argument('--bedfile', default=None, metavar="BED",
                            help="Optional bedfile, specifying regions to "
                                 "pileup or genotype [Default: %(default)s].")
    arg_parser.add_argument('--mpileup-argument', default=[], action="append",
                            help="Pass argument to 'samtools mpileup'; must "
                                 "be used as follows: "
                                 "--mpileup-argument=-argument for arguments "
                                 "without values, and "
                                 "--mpileup-argument=-argument=value for "
                                 "arguments with values.")
    arg_parser.add_argument('--bcftools-argument', default=[], action="append",
                            help="Pass argument to 'bcftools view'; see the "
                                 "--mpileup-argument command description.")
    arg_parser.add_argument('--pileup-only', default=False,
                            action="store_true",
                            help="Only run 'samtools mpileup', generating a "
                                 "text pileup instead of a VCF file "
                                 "[Default: off].")
    arg_parser.add_argument('--nbatches', metavar="N", default=1, type=int,
                            help="Split the BED into N number of batches, "
                                 "which are run in parallel "
                                 "[Default: %(default)s].")
    arg_parser.add_argument('--overwrite', default=False,
                            action="store_true",
                            help="Overwrite output if it already exists "
                                 "[Default: no].")

    # When set, the --bedfile argument is read and used to filter the BAM
    # specified for the 'bamfile' parameter; all other parameters are ignored.
    arg_parser.add_argument('--filter-only', default=False,
                            action="store_true",
                            help=argparse.SUPPRESS)

    return arg_parser.parse_args(argv)
+
+
def main(argv):
    """Entry point for 'paleomix genotype'; returns 0 on success, 1 on
    error. In --filter-only mode (internal; used by the batches themselves)
    the input BAM is instead filtered against the BED file.
    """
    args = parse_args(argv)
    if args.filter_only:
        if not args.bedfile:
            sys.stderr.write("--filter-only requires --bedfile; terminating\n")
            return 1

        return filter_bam(args.bamfile, args.bedfile)

    if os.path.exists(args.destination) and not args.overwrite:
        sys.stderr.write("Output already exists; use --overwrite to allow "
                         "overwriting of this file.\n")
        return 1

    with pysam.Samfile(args.bamfile) as bam_input_handle:
        regions = collect_regions(args.bedfile, bam_input_handle)
        batches = list(create_batches(args, regions))
        if not batches:
            # Nothing to genotype; still emit a valid (empty) BGZip file
            create_empty_bgz(args.destination)
            return 0

        try:
            return process_batches(args, batches)
        except BatchError, error:
            # BatchError wraps the formatted traceback from the subprocess
            sys.stderr.write("ERROR while processing BAM:\n")
            sys.stderr.write(" %s\n"
                             % ("\n ".join(str(error).split("\n"),)))
            return 1

    # Not reached: every branch above returns
    return 0
diff --git a/paleomix/tools/gtf_to_bed.py b/paleomix/tools/gtf_to_bed.py
new file mode 100755
index 0000000..cb4daf1
--- /dev/null
+++ b/paleomix/tools/gtf_to_bed.py
@@ -0,0 +1,325 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+#
+# Converts a GTF file to a set of BED6 files, one for each
+# feature in the GTF file (CDS, Exon, ...). Also generates a list
+# of introns, and UTRs based on sequences in the GTF.
+#
+from __future__ import with_statement
+from __future__ import print_function
+
+import sys
+from argparse import ArgumentParser
+
+import pysam
+
+from paleomix.common.fileutils import open_ro
+from paleomix.common.utilities import set_in, get_in
+
+import paleomix.common.text as text
+
+
+###############################################################################
+###############################################################################
+# Functions used for GTF parsing and filtering
+
def filter_gtf_record(gtf):
    """Returns True if the GTF record should be skipped; only 'exon' and
    'CDS' features are retained for further processing."""
    keep = gtf.feature in ("exon", "CDS")
    return not keep
+
+
def update_gtf_table(table, gtf, scaffolds, contig_prefix):
    """Adds a single GTF record to 'table', nested under the keys (gene
    biotype, gene id, transcript id, exon number, feature); coordinates of
    records on contigs listed in 'scaffolds' are shifted onto the
    corresponding chromosome."""
    # Workaround for bug in Pysam, which mis-parses individual properties
    # (e.g. exon_number) if these are not quoted. This does not apply to
    # asDict, which uses a different parsing implementation (v0.7.8).
    attributes = gtf.asDict()

    biotype = attributes.get("gene_biotype")
    if biotype is None:
        biotype = attributes.get("gene_type", "unknown_genetype")

    keys = (biotype,
            attributes["gene_id"],
            attributes["transcript_id"],
            int(attributes["exon_number"]),
            gtf.feature)

    record = {"contig": contig_prefix + gtf.contig,
              "start": gtf.start,
              # In pysam, 'end' equals the past-the-end position
              "end": gtf.end - 1,
              "strand": gtf.strand,
              "feature": gtf.feature,
              "transcript": attributes["transcript_id"]}

    scaffold = scaffolds.get(record["contig"])
    if scaffold is not None:
        # Lift from scaffold coordinates onto chromosome coordinates
        offset = int(scaffold["chromStart"])
        record["contig"] = scaffold["chrom"]
        record["start"] += offset
        record["end"] += offset

    assert not get_in(table, keys), keys
    set_in(table, keys, record)
+
+
def read_gtf(lines, scaffolds, contig_prefix):
    """Parses GTF 'lines' into a nested table of records, keyed as
    gene_biotype -> gene_id -> transcript_id -> exon_number -> feature."""
    table = {}
    for record in text.parse_lines(lines, pysam.asGTF()):
        # Only 'exon' and 'CDS' features are of interest
        if filter_gtf_record(record):
            continue

        update_gtf_table(table, record, scaffolds, contig_prefix)

    return table
+
+
def read_scaffolds(filename):
    """Reads a tab-separated table of scaffold positions; the first line is
    a '#'-prefixed header naming the columns (which must include 'contig'),
    and the result maps each contig name to its row as a dict."""
    result = {}
    with open(filename) as handle:
        columns = handle.readline().strip("#\n\r").split("\t")
        for line in handle:
            fields = line.rstrip("\r\n").split("\t")
            row = dict(zip(columns, fields))
            result[row["contig"]] = row

    return result
+
+
+###############################################################################
+###############################################################################
+
def get_introns(exons):
    """Infers intron records from a transcript's exons; 'exons' maps exon
    numbers to {"exon": record} dicts. Returns a list of intron records
    covering the gaps between consecutive exons, in ascending coordinate
    order."""
    ordered = [exons[number]["exon"] for number in sorted(exons)]
    if ordered[0]["strand"] == "-":
        # Exon numbers follow the read direction, and the coordinates are
        # therefore descending for regions on the negative strand. Below we
        # assume that the coordinates are ascending, so reorder the list.
        ordered.reverse()

    introns = []
    for (left, right) in zip(ordered, ordered[1:]):
        if left["end"] == right["start"] - 1:
            # Adjacent exons; the intron has been lost?
            continue

        intron = dict(left)
        intron.update(feature="intron",
                      start=left["end"] + 1,
                      end=right["start"] - 1)
        assert intron["start"] <= intron["end"], ordered

        introns.append(intron)

    return introns
+
+
def split_exon(exon, cds):
    """Takes an exon and a CDS, and returns a map of regions for each
    feature (UTR5/3, CDS) that may be inferred from the arguments.
    Note that the CDS is simply returned as is, to simplify
    downstream handling of these features."""
    forward_strand = (exon["strand"] == "+")
    pieces = [cds]

    if exon["start"] < cds["start"]:
        # Exon sequence before the CDS: 5' UTR on '+', 3' UTR on '-'
        head = dict(exon)
        head.update(end=cds["start"] - 1,
                    feature="UTR5" if forward_strand else "UTR3")
        pieces.append(head)

    if exon["end"] > cds["end"]:
        # Exon sequence after the CDS: 3' UTR on '+', 5' UTR on '-'
        tail = dict(exon)
        tail.update(start=cds["end"] + 1,
                    feature="UTR3" if forward_strand else "UTR5")
        pieces.append(tail)

    return pieces
+
+
def split_exons(exons, func):
    """Invokes 'func' with the list of features (CDS/UTR5/UTR3) derived from
    each exon of a transcript; 'exons' maps exon numbers to dicts of records
    keyed by feature type. Exons without a CDS are classified as UTR5 until
    the first CDS-carrying exon has been seen, and as UTR3 thereafter.
    """
    # By looping over the list sorted by exon-number, we can easily
    # determine whether or not we are dealing with a 5' or 3' UTR.
    #
    # NOTE: 'items()' rather than 'iteritems()': sorted() materializes the
    # pairs regardless, so the iterator gained nothing, and 'items()' keeps
    # the code portable to Python 3 (where 'iteritems' no longer exists).
    seen_cds = False
    for (_, exon) in sorted(exons.items()):
        if "CDS" in exon:
            seen_cds = True
            cds, exon = exon["CDS"], exon["exon"]

            func(split_exon(exon, cds))
        else:
            utr = dict(exon["exon"])
            utr.update(feature=(seen_cds and "UTR3" or "UTR5"))

            func([utr])
+
+
def select_transcripts(options, transcripts, protein_coding):
    """Returns the transcripts of a gene to retain: by default only the
    largest transcript (by total exon length), preferring well formed
    transcripts (len(CDS) % 3 == 0) if the gene is protein coding. The
    --keep-all-transcripts / --keep-malformed-proteins options relax this
    filtering (see 'parse_arguments').

    NOTE: 'values()' is used instead of 'itervalues()' throughout, for
    Python 2/3 compatibility; the dicts involved are small.
    """
    if options.keep_all_transcripts and options.keep_malformed_proteins:
        # Nothing to filter; return every transcript unchanged
        return transcripts.values()

    selection = []
    for transcript in transcripts.values():
        well_formed = True
        exon_len = cds_len = 0
        for records in transcript.values():
            exon_record = records["exon"]
            exon_len += exon_record["end"] - exon_record["start"] + 1

            if protein_coding and ("CDS" in records):
                cds_record = records["CDS"]
                cds_len += cds_record["end"] - cds_record["start"] + 1

        if protein_coding:
            # A CDS must exist and encode a whole number of codons
            well_formed = (cds_len and (cds_len % 3 == 0))

        if well_formed or options.keep_malformed_proteins:
            selection.append(((well_formed, exon_len), transcript))

    if options.keep_all_transcripts or not selection:
        return [item[-1] for item in selection]

    # Well-formed transcripts sort above malformed ones (True > False/0),
    # and longer transcripts above shorter ones
    return [max(selection, key=lambda item: item[0])[-1]]
+
+
def _do_build_feature_table(options, table, features, protein_coding):
    """Generator driving feature-table construction: for each retained
    transcript, appends its introns to 'features' and yields the tuple
    (exons, add_records), where add_records appends records to 'features'
    keyed by their "feature" field. Prints filter statistics on completion.
    """
    def add_records(records):
        for record in records:
            features[record["feature"]].append(record)

    retained = read = 0
    # 'values()' rather than 'itervalues()' for Python 2/3 compatibility
    for transcripts in table.values():
        read += len(transcripts)
        for exons in select_transcripts(options, transcripts, protein_coding):
            retained += 1
            add_records(get_introns(exons))
            yield (exons, add_records)

    # BUGFIX: guard against ZeroDivisionError when 'table' is empty
    filtered_pct = (100.0 * (read - retained)) / read if read else 0.0
    print("\t- Processed %i transcripts, filtered %i (%.1f%%) ..."
          % (read, read - retained, filtered_pct))
+
+
def build_coding_seqs_table(options, table):
    """Takes a table generated from a GTF file, and constructs a table for
    each feature (UTR5, UTR3, CDS, intron), inferring introns and UTRs from
    the exons and CDSs of the input table."""
    print("Building table of features for coding sequences ...")
    features = dict((key, []) for key in ("UTR5", "UTR3", "CDS", "intron"))

    for (exons, add_records) in _do_build_feature_table(options, table,
                                                        features, True):
        split_exons(exons, add_records)

    return features
+
+
def build_noncoding_seqs_table(options, table):
    """Builds a table of 'exon' and 'intron' features for genes that are
    not protein coding; every exon of each retained transcript is kept."""
    print("Building table of features for non-coding sequences ...")
    features = dict((key, []) for key in ("exon", "intron"))

    for (exons, add_records) in _do_build_feature_table(options, table,
                                                        features, False):
        add_records(record["exon"] for record in exons.itervalues())

    return features
+
+
def write_bed(table, target):
    """Writes the records in 'table' to 'target' as a coordinate-sorted BED6
    file; does nothing if 'table' is empty. Record "end" values are
    inclusive and are converted to the past-the-end coordinates that BED
    requires."""
    if not table:
        return

    def _position_key(record):
        return (record["contig"], record["start"], record["end"])

    # Field layout as described on http://genome.ucsc.edu/FAQ/FAQformat
    template = "%s\t%i\t%i\t%s\t%i\t%s\n"
    with open(target, "w") as handle:
        for record in sorted(table, key=_position_key):
            handle.write(template % (record["contig"],      # chrom
                                     record["start"],       # chromStart
                                     record["end"] + 1,     # chromEnd
                                     record["transcript"],  # name
                                     0,                     # score
                                     record["strand"]))     # strand
+
+
+###############################################################################
+###############################################################################
+
def parse_arguments(argv):
    """Parses command-line arguments for 'paleomix gtf_to_bed'; returns an
    argparse.Namespace."""
    prog = "paleomix gtf_to_bed"
    arg_parser = ArgumentParser(
        prog=prog,
        usage="%s [options] in.gtf out_prefix [in.scaffolds]" % (prog,))

    arg_parser.add_argument('infile', metavar="INPUT.gtf",
                            help="Input file in GTF format.")
    arg_parser.add_argument('output_prefix', metavar="OUTPUT_PREFIX",
                            help="Prefix of output files.")
    arg_parser.add_argument('scaffolds', metavar="SCAFFOLDS", nargs="?",
                            help="Mapping of scaffolds to contig positions; "
                                 "e.g. mapping individual Un* scaffolds "
                                 "onto chrUn.")
    arg_parser.add_argument("--keep-all-transcripts",
                            action="store_true", default=False,
                            help="Include all transcripts in the output BED "
                                 "files, not just the longest transcript "
                                 "of each gene [default: off]")
    arg_parser.add_argument("--keep-malformed-proteins",
                            action="store_true", default=False,
                            help="Include transcripts of protein-coding in "
                                 "the output, even if the the length of "
                                 "the CDS is not divisible by 3 "
                                 "[default: off]")
    arg_parser.add_argument('--contig-prefix', default="",
                            help="Add prefix to contig names (e.g. 'chr') "
                                 "[default: no prefix].")

    return arg_parser.parse_args(argv)
+
+
def main(argv):
    """Entry point for 'paleomix gtf_to_bed'; converts a GTF file into one
    BED6 file per (gene biotype, feature type) combination."""
    args = parse_arguments(argv)

    scaffolds = {}
    if args.scaffolds:
        print("Reading scaffolds information from %r" % (args.scaffolds,))
        scaffolds = read_scaffolds(args.scaffolds)

    with open_ro(args.infile) as gtf_file:
        print("Reading GTF from %r" % (args.infile,))
        src_table = read_gtf(gtf_file, scaffolds, args.contig_prefix)

    for (source, table) in src_table.iteritems():
        print("Writing tables for '%s' ..." % source)

        # Protein-coding genes get CDS/UTR features; others plain exons
        if source.startswith("protein"):
            features = build_coding_seqs_table(args, table)
        else:
            features = build_noncoding_seqs_table(args, table)

        for (feature, records) in features.items():
            fpath = "%s.%s.%s.bed" % (args.output_prefix, source, feature)

            print("\tWriting %ss to '%s' ..." % (feature, fpath))
            write_bed(records, fpath)

    return 0
+
+
# Allow direct execution of the module, in addition to invocation through
# the 'paleomix' front-end; the process exit code is main's return value.
if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/phylo_pipeline/__init__.py b/paleomix/tools/phylo_pipeline/__init__.py
new file mode 100644
index 0000000..90e5529
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/paleomix/tools/phylo_pipeline/config.py b/paleomix/tools/phylo_pipeline/config.py
new file mode 100644
index 0000000..1e11016
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/config.py
@@ -0,0 +1,167 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import optparse
+
+import paleomix
+
+import paleomix.tools.phylo_pipeline.parts.genotype as genotype
+import paleomix.tools.phylo_pipeline.parts.msa as msa
+import paleomix.tools.phylo_pipeline.parts.paml as paml
+import paleomix.tools.phylo_pipeline.parts.phylo as phylo
+import paleomix.common.console as console
+
+from paleomix.config import \
+ ConfigError, \
+ PerHostValue, \
+ PerHostConfig, \
+ migrate_config
+
+
# Command overview printed by 'parse_config' when too few arguments are
# given; optparse substitutes '%prog' with the program name.
_DESCRIPTION = \
    "Commands:\n" \
    " -- %prog help -- Display this message.\n" \
    " -- %prog example [...] -- Copy example project to folder.\n" \
    " -- %prog makefile -- Print makefile template.\n" \
    " -- %prog genotype [...] -- Carry out genotyping according to makefile.\n" \
    " -- %prog msa [...] -- Carry out multiple sequence alignment.\n" \
    " -- %prog phylogeny [...] -- Carry out phylogenetic inference.\n"


# Maps analysis-step names to the functions that build their node chains;
# 'True' marks steps handled directly by the front-end (printing a
# makefile template) rather than by a chain function.
_COMMANDS = {
    "mkfile" : True,
    "makefile" : True,
    "genotype" : genotype.chain,
    "genotyping" : genotype.chain,
    "msa" : msa.chain,
    "paml:codeml" : paml.chain_codeml,
    "phylogeny:examl" : phylo.chain_examl,
    }
+
+
class CustomHelpFormatter(optparse.IndentedHelpFormatter):
    """Help formatter which prints the parser description verbatim, instead
    of re-wrapping it as the default optparse formatter does."""

    def format_description(self, description):
        if description:
            return description
        return ""
+
+
def select_commands(chain):
    """Parses a '+'-separated chain of analysis steps (e.g. 'genotype+msa')
    into a list of (name, function) tuples; unknown steps yield a None
    function. Steps may be abbreviated to a prefix of at least 3 characters.
    """
    commands = []
    for command in chain.split("+"):
        command_key = command.strip().lower()
        command_func = None

        # BUGFIX: the exact-match and prefix checks previously used the raw
        # 'command' string, so step names with surrounding whitespace or
        # upper-case letters were never matched; use the normalized key.
        if command_key in _COMMANDS:
            command_func = _COMMANDS[command_key]
        elif len(command_key) >= 3:
            for (key, value) in _COMMANDS.iteritems():
                if key.startswith(command_key):
                    command_key = key
                    command_func = value
                    break

        commands.append((command_key, command_func))

    return commands
+
+
def _run_config_parser(argv):
    """Builds the optparse parser for the phylogenetic pipeline and parses
    'argv' through the per-host configuration layer, which supplies
    host-specific default values (PerHostValue); returns (options, args).
    """
    per_host_cfg = PerHostConfig("phylo_pipeline")
    usage_str = "paleomix phylo_pipeline <command> [options] [makefiles]"
    version_str = "paleomix phylo_pipeline v%s" % (paleomix.__version__,)
    parser = optparse.OptionParser(usage=usage_str,
                                   version=version_str)

    # Print the command overview verbatim (see CustomHelpFormatter)
    parser.formatter = CustomHelpFormatter()
    parser.formatter.set_parser(parser)
    parser.description = _DESCRIPTION

    paleomix.ui.add_optiongroup(parser,
                                ui_default=PerHostValue("running"),
                                color_default=PerHostValue("on"))
    paleomix.logger.add_optiongroup(parser, default = PerHostValue("warning"))

    group = optparse.OptionGroup(parser, "Scheduling")
    group.add_option("--samtools-max-threads", default = PerHostValue(1), type = int,
                     help = "Maximum number of threads to use when genotyping or building pileups [%default]")
    group.add_option("--examl-max-threads", default = PerHostValue(1), type = int,
                     help = "Maximum number of threads to use for each instance of ExaML [%default]")
    group.add_option("--max-threads", default = per_host_cfg.max_threads, type = int,
                     help = "Maximum number of threads to use in total [%default]")
    group.add_option("--dry-run", default = False, action="store_true",
                     help = "If passed, only a dry-run in performed, the dependency tree is printed, "
                            "and no tasks are executed.")
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Required paths")
    group.add_option("--temp-root", default = per_host_cfg.temp_root,
                     help = "Location for temporary files and folders [%default]")
    group.add_option("--samples-root", default = PerHostValue("./data/samples", is_path = True),
                     help = "Location of BAM files for each sample [%default]")
    group.add_option("--regions-root", default = PerHostValue("./data/regions", is_path = True),
                     help = "Location of BED files containing regions of interest [%default]")
    group.add_option("--prefix-root", default = PerHostValue("./data/prefixes", is_path = True),
                     help = "Location of prefixes (FASTAs) [%default]")
    group.add_option("--refseq-root", default = PerHostValue("./data/refseqs", is_path = True),
                     help = "Location of reference sequences (FASTAs) [%default]")
    group.add_option("--destination", default = "./results",
                     help = "The destination folder for result files [%default]")
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Files and executables")
    group.add_option("--list-input-files", action="store_true", default=False,
                     help="List all input files used by pipeline for the "
                          "makefile(s), excluding any generated by the "
                          "pipeline itself.")
    group.add_option("--list-output-files", action="store_true", default=False,
                     help="List all output files generated by pipeline for "
                          "the makefile(s).")
    group.add_option("--list-executables", action="store_true", default=False,
                     help="List all executables required by the pipeline, "
                          "with version requirements (if any).")
    parser.add_option_group(group)

    parser.add_option("--to-dot-file", dest="dot_file",
                      help="Write dependency tree to the specified dot-file.")

    # The per-host layer fills in host-specific defaults before parsing
    return per_host_cfg.parse_args(parser, argv)
+
+
def parse_config(argv):
    """Parses command-line options for the phylogenetic pipeline, after
    migrating any old configuration files. Returns (options, args); prints
    the command overview when too few arguments are given, and raises
    ConfigError if the command chain contains unknown analysis steps."""
    migrate_config()

    options, args = _run_config_parser(argv)
    paleomix.ui.set_ui_colors(options.ui_colors)

    if args and args[0] in ("example", "examples"):
        return options, args

    if (len(args) < 2) and (args != ["mkfile"] and args != ["makefile"]):
        # Too few arguments; print usage instructions instead
        description = _DESCRIPTION.replace("%prog", "phylo_pipeline").strip()
        console.print_info("Phylogeny Pipeline v%s\n" % (paleomix.__version__,))
        console.print_info(description)
        return options, args

    commands = select_commands(args[0] if args else ())
    unknown = [repr(key) for (key, func) in commands if func is None]
    if unknown:
        raise ConfigError("Unknown analysis step(s): %s"
                          % (", ".join(unknown),))

    return options, args
diff --git a/paleomix/tools/phylo_pipeline/makefile.py b/paleomix/tools/phylo_pipeline/makefile.py
new file mode 100644
index 0000000..b8da9d0
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/makefile.py
@@ -0,0 +1,757 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import pysam
+import types
+
+import paleomix.common.makefile
+from paleomix.common.makefile import \
+ MakefileError, \
+ REQUIRED_VALUE, \
+ IsDictOf, \
+ IsListOf, \
+ IsInt, \
+ IsStr, \
+ StringIn, \
+ IsFloat, \
+ IsUnsignedInt, \
+ IsBoolean, \
+ IsNone, \
+ ValueIn, \
+ ValuesSubsetOf, \
+ StringStartsWith, \
+ StringEndsWith, \
+ CLI_PARAMETERS, \
+ And, \
+ Or, \
+ Not
+
+from paleomix.common.fileutils import \
+ swap_ext, \
+ add_postfix
+from paleomix.common.utilities import \
+ fill_dict
+from paleomix.common.console import \
+ print_info, \
+ print_warn
+from paleomix.common.text import \
+ parse_padded_table
+from paleomix.common.bedtools import \
+ read_bed_file, \
+ BEDError
+from paleomix.common.formats.fasta import \
+ FASTA
+
+
def read_makefiles(options, filenames, commands):
    """Read, validate, and post-process one makefile per filename.

    'commands' is a sequence of (step-name, function) pairs; only the step
    names are used here, to decide which expensive checks to perform.
    Returns a list of processed makefile dictionaries.
    """
    print_info("Reading makefile(s):")
    steps = frozenset(step for (step, _) in commands)

    results = []
    for fname in filenames:
        raw = paleomix.common.makefile.read_makefile(fname, _VALIDATION)
        results.append(_mangle_makefile(options, raw["Makefile"], steps))

    return results
+
+
def _mangle_makefile(options, mkfile, steps):
    """Post-process a validated makefile dictionary in-place.

    The individual steps below depend on each other's side effects (e.g.
    samples must be collapsed before regions are updated, and regions must
    exist before subsets/genotyping/MSA defaults are resolved), so the
    call order must not be changed.

    'steps' is the set of analysis step names selected on the command
    line; some checks are skipped when not relevant to those steps.
    Returns the modified makefile.
    """
    _collapse_samples(mkfile)
    _update_regions(options, mkfile)
    _update_subsets(mkfile, steps)
    _update_filtering(mkfile)
    _update_sample_sets(mkfile)
    _update_genotyping(mkfile)
    _update_msa(mkfile)
    _update_homozygous_contigs(mkfile)
    _check_bam_sequences(options, mkfile, steps)
    _check_sexes(mkfile)
    _update_and_check_max_read_depth(options, mkfile)
    _check_indels_and_msa(mkfile)
    # Placeholder; extra nodes may be attached by individual pipeline parts
    mkfile["Nodes"] = ()

    return mkfile
+
+
+def _collapse_samples(mkfile):
+ groups, samples = {}, set()
+ def _collect_samples(samples_dict, path = ()):
+ current_samples = {}
+ for (key, subdd) in samples_dict.iteritems():
+ if key.startswith("<") and key.endswith(">"):
+ key = key.lstrip("<").rstrip(">")
+ current_samples.update(_collect_samples(subdd, path + (key,)))
+ elif key not in samples:
+ samples.add(key)
+ subdd["Name"] = key
+ current_samples[key] = subdd
+ else:
+ raise MakefileError("Duplicate sample-name: %r" % (key,))
+
+ groups[path] = current_samples
+ return current_samples
+
+ _collect_samples(mkfile["Project"]["Samples"])
+ mkfile["Project"]["Samples"] = groups.pop(())
+ mkfile["Project"]["Groups"] = groups
+
+
+def _select_samples(select, groups, samples, path):
+ selection = set()
+ for group in select:
+ if group.startswith("<") and group.endswith(">"):
+ key = tuple(group[1:-1].split("/"))
+ if key not in groups:
+ raise MakefileError("Unknown group specifed for filtering %r: %r" % (path, key))
+ selection.update(groups[key])
+ elif group in samples:
+ selection.add(group)
+ else:
+ raise MakefileError("Unknown/Invalid group specifed for filtering %r: %r" % (path, group))
+ return selection
+
+
def _update_regions(options, mkfile):
    """Validate and normalize the 'RegionsOfInterest' section.

    Renames the section to 'Regions'; for each set of regions, resolves
    the BED and FASTA paths (relative to --regions-root / --prefix-root),
    verifies both files exist, collects the named sequences from the BED
    file, and precomputes per-sample genotype FASTA output paths.

    Raises MakefileError if no regions were specified, a region lacks a
    'Prefix', a required file is missing, or the BED file is invalid.
    """
    print_info("  - Validating regions of interest ...")
    mkfile["Project"]["Regions"] = mkfile["Project"].pop("RegionsOfInterest")

    if not mkfile["Project"]["Regions"]:
        raise MakefileError('No regions of interest have been specified; '
                            'no analyses will be performed.')

    for (name, subdd) in mkfile["Project"]["Regions"].iteritems():
        if "Prefix" not in subdd:
            raise MakefileError("No genome specified for regions %r" % (name,))

        subdd["Name"] = name
        # E.g. "hg19.genes"; used as the basename of derived files
        subdd["Desc"] = "{Prefix}.{Name}".format(**subdd)
        subdd["BED"] = os.path.join(options.regions_root, subdd["Desc"] + ".bed")
        subdd["FASTA"] = os.path.join(options.prefix_root, subdd["Prefix"] + ".fasta")

        required_files = (
            ("Regions file", subdd["BED"]),
            ("Reference sequence", subdd["FASTA"]),
        )

        for (desc, path) in required_files:
            if not os.path.isfile(path):
                raise MakefileError("%s does not exist for %r:\n Path = %r"
                                    % (desc, name, path))

        # Collects seq. names / validate regions
        try:
            sequences = _collect_sequence_names(bed_file=subdd["BED"],
                                                fasta_file=subdd["FASTA"])
        except (IOError, BEDError), error:
            raise MakefileError("Error reading regions-of-interest %r:\n%s"
                                % (name, error))

        # Sequences/SubsetFiles are keyed by subset name; None = no subset
        subdd["Sequences"] = {None: sequences}
        subdd["SubsetFiles"] = {None: ()}
        # Map of sample-name -> path of genotype FASTA generated for it
        sampledd = subdd["Genotypes"] = {}
        for sample_name in mkfile["Project"]["Samples"]:
            fasta_file = ".".join((sample_name, subdd["Desc"], "fasta"))
            sampledd[sample_name] = os.path.join(options.destination,
                                                 mkfile["Project"]["Title"],
                                                 "genotypes",
                                                 fasta_file)
+
+
+def _collect_fasta_contigs(filename, cache={}):
+ if filename in cache:
+ return cache[filename]
+
+ if not os.path.exists(filename + ".fai"):
+ print_info(" - Index does not exist for %r; this may "
+ "take a while ..." % (filename,))
+
+ cache[filename] = contigs = dict(FASTA.index_and_collect_contigs(filename))
+ return contigs
+
+
def _collect_sequence_names(bed_file, fasta_file, min_columns=6):
    """Return a frozenset of the region names defined in 'bed_file'.

    Validates that all BED records sharing a name lie on the same contig
    and the same strand, since downstream steps merge equally-named
    regions into a single consecutive sequence.

    BUG FIX: 'min_columns' was previously ignored -- the literal 6 was
    passed to read_bed_file regardless of the parameter value; it is now
    forwarded.
    """
    contigs = _collect_fasta_contigs(fasta_file)
    sequences = {}

    for record in read_bed_file(bed_file, min_columns=min_columns,
                                contigs=contigs):
        current = (record.contig, record.strand)
        # First occurrence of a name becomes the reference to compare to
        reference = sequences.setdefault(record.name, current)

        if current[0] != reference[0]:
            # BUG FIX: added missing space ("regions tocontinue")
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different contigs (%r and "
                                "%r); note that PALEOMIX assumes that "
                                "regions with the same name constitute "
                                "parts of a single consecutive sequence, "
                                "which must therefore be located on one "
                                "strand of a single sequence. Please "
                                "rename one or more of these regions to "
                                "continue." % (bed_file, record.name,
                                               current[0], reference[0]))
        elif current[1] != reference[1]:
            raise MakefileError("Regions in %r with the same name (%r) "
                                "are located on different strands; note "
                                "that PALEOMIX assumes that regions with "
                                "the same name constitute parts of a "
                                "single consecutive sequence, and that "
                                "these must therefore be located on the "
                                "same strand." % (bed_file, record.name,))

    return frozenset(sequences)
+
+
def _update_subsets(mkfile, steps):
    """Resolve 'SubsetRegions' references for the ExaML and codeml steps.

    A subset is a plain-text file of sequence names (one per line; '#'
    starts a comment) located next to the regions' BED file as
    <bed-name>.<subset>.names. The named sequences must be a subset of
    the sequences already collected for those regions; the results are
    recorded in the regions' 'SubsetFiles' and 'Sequences' tables.

    Only validates subsets for the analysis steps actually selected.
    """
    subsets_by_regions = mkfile["Project"]["Regions"]

    def _collect_subsets(roi, subset, path):
        if roi not in subsets_by_regions:
            raise MakefileError("Subset of unknown region (%r) requested at %r"
                                % (roi, path))

        # Subset files live alongside the BED file for the regions
        roi_fname = swap_ext(subsets_by_regions[roi]["BED"], subset + ".names")
        if not os.path.isfile(roi_fname):
            raise MakefileError("Subset file does not exist for Regions Of "
                                "Interest:\n Region = %r\n Subset = %r\n"
                                " Path = %r"
                                % (roi, subset, roi_fname))

        sequences = set()
        with open(roi_fname) as handle:
            for line in handle:
                line = line.strip()
                if line and not line.startswith("#"):
                    sequences.add(line)

        # Key None holds the full (unsubsetted) set of sequence names
        known_seqs = subsets_by_regions[roi]["Sequences"][None]
        unknown_seqs = sequences - known_seqs
        if unknown_seqs:
            message = ("Unknown sequences in subset file:\n"
                       " File = %r\n Region = %r\n Subset = %r\n"
                       " Unknown sequence names =") \
                       % (roi_fname, roi, subset)
            unknown_seqs = list(sorted(unknown_seqs))
            if len(unknown_seqs) > 5:
                # Truncate long lists to keep the error message readable
                unknown_seqs = unknown_seqs[:5] + ["..."]
            message = "\n - ".join([message] + unknown_seqs)
            raise MakefileError(message)

        subsets_by_regions[roi]["SubsetFiles"][subset] = (roi_fname,)
        subsets_by_regions[roi]["Sequences"][subset] = frozenset(sequences)

    if "phylogeny:examl" in steps:
        for (key, subdd) in mkfile["PhylogeneticInference"].iteritems():
            for (subkey, roidd) in subdd["RegionsOfInterest"].iteritems():
                if subkey not in subsets_by_regions:
                    message = \
                        "Unknown regions name in phylogenetic inference:\n" \
                        "\tPath = PhylogeneticInference:%s:RegionsOfInterest" \
                        "\n\tName = %s"
                    raise MakefileError(message % (key, subkey))

                roidd["Name"] = subkey

                if roidd.get("SubsetRegions") is not None:
                    path = "PhylogeneticInference:%s:RegionsOfInterest:%s" % (key, subkey)
                    _collect_subsets(subkey, roidd["SubsetRegions"], path)

    if "paml:codeml" in steps:
        for (roi, subset) in mkfile["PAML"]["codeml"]["SubsetRegions"].iteritems():
            _collect_subsets(roi, subset, "PAML:codeml:SubsetRegions")
+
+
+def _update_filtering(mkfile):
+ samples = mkfile["Project"]["Samples"]
+ groups = mkfile["Project"]["Groups"]
+
+ filtering = {}
+ for (target, filter_by) in mkfile["Project"]["FilterSingletons"].iteritems():
+ if target.startswith("<") and target.endswith(">"):
+ raise MakefileError("Singleton-filtering must be specified per "
+ "sample, not by groups: %r" % (target,))
+ elif target not in samples:
+ raise MakefileError("Unknown/Invalid sample specifed for singleton filtering: %r" % (target,))
+ elif target in filter_by:
+ raise MakefileError("Attempting to filter singleton in sample using itself as comparison: %r" % (target,))
+
+ path = "Project:FilterSingletons:%s" % (target,)
+ filtering[target] = _select_samples(filter_by, groups, samples, path)
+
+ # Implicit inclusion is allowed, since that is useful in some cases,
+ # where we want to filter a sample based on the group it is a member of
+ if target in filtering[target]:
+ # The target itself must be excluded, as including it is invalid
+ filtering[target] = filtering[target] - set((target,))
+ print_warn("Warning: Sample %r is singleton-filtered using a "
+ "group it is also a member of; this may be by mistake."
+ % (target,))
+
+ if not filtering[target]:
+ raise MakefileError("No samples specified by which to "
+ "singleton-filter by for %r" % (target,))
+
+ mkfile["Project"]["FilterSingletons"] = filtering
+
+
+def _update_homozygous_contigs(mkfile):
+ """Treat unspecified values for HomozygousContigs as an empty list, in
+ order that the user does not need to specify "[]" for empty lists.
+ """
+ for regions in mkfile["Project"]["Regions"].itervalues():
+ hcontigs = regions["HomozygousContigs"]
+
+ for key, contigs in hcontigs.items():
+ if contigs is None:
+ hcontigs[key] = []
+
+
def _check_bam_sequences(options, mkfile, steps):
    """Check that the BAM files contains the reference sequences found in the
    FASTA file, matched by name and length; extra sequences are permitted. This
    check is only done if genotyping is to be carried out, to reduce the
    overhead of reading the BAM file headers.

    Raises MakefileError if a FASTA contig is missing from a BAM, or its
    length differs (both suggest the BAM was aligned against a different
    prefix).
    """
    if ("genotype" not in steps) and ("genotyping" not in steps):
        return

    print_info("  - Validating BAM files ...")
    # Map of BAM filename -> expected {contig: length} from the FASTA index
    bam_files = {}
    for regions in mkfile["Project"]["Regions"].itervalues():
        for sample in mkfile["Project"]["Samples"].itervalues():
            filename = os.path.join(options.samples_root, "%s.%s.bam"
                                    % (sample["Name"], regions["Prefix"]))
            if regions["Realigned"]:
                filename = add_postfix(filename, ".realigned")

            # Missing BAMs are not reported here; later steps give clearer
            # errors if a file is actually required
            if os.path.exists(filename):
                bam_files[filename] = _collect_fasta_contigs(regions["FASTA"])

    for (filename, contigs) in bam_files.iteritems():
        with pysam.Samfile(filename) as handle:
            bam_contigs = dict(zip(handle.references, handle.lengths))

        for (contig, length) in contigs.iteritems():
            bam_length = bam_contigs.get(contig)

            if bam_length is None:
                message = ("Reference sequence missing from BAM file; "
                           "BAM file aligned against different prefix?\n"
                           " BAM file = %s\n Sequence name = %s") \
                           % (filename, contig)
                raise MakefileError(message)
            elif bam_length != length:
                message = ("Length of reference sequence in FASTA differs "
                           "from length of sequence in BAM file; BAM file "
                           "aligned against different prefix?\n"
                           " BAM file = %s\n"
                           " Length in FASTA = %s\n"
                           " Length in BAM = %s") \
                           % (filename, length, bam_length)
                raise MakefileError(message)
+
+
def _check_sexes(mkfile):
    """Validate sample sexes against the sexes listed in each region's
    'HomozygousContigs' table.

    Ensures that every set of regions defines the same set of sexes, that
    every sample has a known sex (accepting the deprecated 'Gender' key as
    an alias for 'Sex'), and warns about homozygous contigs not found in
    any reference FASTA. Raises MakefileError on missing or inconsistent
    sex information.
    """
    all_contigs = set()
    contigs_sexes = set()
    regions_sexes = set()
    for regions in mkfile["Project"]["Regions"].values():
        all_contigs.update(_collect_fasta_contigs(regions["FASTA"]))

        for contigs in regions["HomozygousContigs"].values():
            contigs_sexes.update(contigs)

        current_sexes = set(regions["HomozygousContigs"])
        if not regions_sexes:
            regions_sexes = current_sexes
        elif regions_sexes != current_sexes:
            raise MakefileError("List of sexes for regions %r does not "
                                "match other regions" % (regions["Name"],))

    if not regions_sexes:
        # FIX: corrected "assosiated" typo in user-facing message
        raise MakefileError("No sexes have been specified in makefile; "
                            "please list all sample sexes and associated "
                            "homozygous contigs (if any).")

    for sample in mkfile["Project"]["Samples"].values():
        if sample.get("Sex") is None:
            if sample.get("Gender") is None:
                raise MakefileError("Please specify a sex for sample %r, or "
                                    "'NA' if not applicable."
                                    % (sample["Name"],))

            # 'Gender' is a deprecated alias for 'Sex'
            sample["Sex"] = sample.pop("Gender")
        elif sample.get("Gender") is not None:
            # FIX: message previously read "specified sample"; added "for"
            raise MakefileError("Both a Sex and a Gender has been specified "
                                "for sample %r; the Gender field is "
                                "deprecated, please only use the Sex field."
                                % (sample["Name"],))

        if sample["Sex"] not in regions_sexes:
            sexes = ", ".join(map(repr, regions_sexes))
            message = "Sample %r has unknown sex %r; known sexes are %s" \
                % (sample["Name"], sample["Sex"], sexes)
            raise MakefileError(message)

    unknown_contigs = contigs_sexes - all_contigs
    if unknown_contigs:
        print_warn("WARNING: Unknown contig(s) in 'HomozygousContigs':\n"
                   " - " + "\n - ".join(unknown_contigs))
        print_warn("Please verify that the list(s) of contigs is correct!")
+
+
def _update_and_check_max_read_depth(options, mkfile):
    """Normalize the per-region 'MaxReadDepth' setting to a dict of
    sample-name -> int.

    Accepted input forms: a dict of sample -> depth (extra keys allowed),
    the string "auto" (values read from the BAM pipeline's .depths files),
    or a single value applied to every sample. Only samples genotyped via
    samtools require a value.
    """
    if any(subdd["VCF_Filter"]["MaxReadDepth"] == "auto"
           for subdd in mkfile["Genotyping"].itervalues()):
        # NOTE(review): "Determinining" is a typo in this runtime message
        print_info("  - Determinining max-depth from depth-histograms ...")

    for (key, settings) in mkfile["Genotyping"].iteritems():
        # Only samtools-genotyped samples pass through the VCF filter
        required_keys = set()
        for sample in mkfile["Project"]["Samples"].itervalues():
            if sample["GenotypingMethod"].lower() == "samtools":
                required_keys.add(sample["Name"])

        max_depths = settings["VCF_Filter"]["MaxReadDepth"]
        if isinstance(max_depths, types.DictType):
            # Extra keys are allowed, to make it easier
            # to temporarily disable a sample
            missing_keys = required_keys - set(max_depths)
            if missing_keys:
                missing_keys = "\n - ".join(sorted(missing_keys))
                message = "MaxReadDepth not specified for the following " \
                          "samples for %r:\n - %s" % (key, missing_keys)
                raise MakefileError(message)

        elif isinstance(max_depths, types.StringTypes):
            # Validation guarantees the only permitted string is "auto"
            assert max_depths.lower() == "auto", max_depths
            prefix = mkfile["Project"]["Regions"][key]["Prefix"]

            settings["VCF_Filter"]["MaxReadDepth"] \
                = _read_max_depths(options, prefix, required_keys)
        else:
            # A single value; applied uniformly to all required samples
            max_depths = dict.fromkeys(required_keys, max_depths)
            settings["VCF_Filter"]["MaxReadDepth"] = max_depths
+
+
+def _read_max_depths(options, prefix, required_keys):
+ missing = []
+ max_depths = {}
+ for sample in required_keys:
+ fname = "%s.%s.depths" % (sample, prefix)
+ fpath = os.path.join(options.samples_root, fname)
+ max_depths[sample] = fpath
+
+ if not os.path.exists(fpath):
+ missing.append((sample, fpath))
+
+ if missing:
+ raise MakefileError("Could not determine 'MaxReadDepth' values "
+ "automatically; .depth files are missing for one "
+ "or more samples: \n - " +
+ "\n - ".join("%s: %s" % item for item in missing) +
+ "\n\nEnsure that the .depth files are available, "
+ "or specify a value for 'MaxReadDepth' manually.")
+
+ for sample, fpath in max_depths.iteritems():
+ max_depths[sample] = _read_max_depth(fpath, prefix, sample)
+
+ return max_depths
+
+
+def _read_max_depth(filename, prefix, sample):
+ if filename in _DEPTHS_CACHE:
+ return _DEPTHS_CACHE[filename]
+
+ max_depth = None
+ max_depths = {}
+ try:
+ with open(filename) as handle:
+ for row in parse_padded_table(handle):
+ if row["Name"] != "*" and \
+ row["Sample"] == "*" and \
+ row["Library"] == "*" and \
+ row["Contig"] == "*":
+
+ if row["Name"] in max_depths:
+ raise MakefileError("Depth histogram %r contains "
+ "multiple 'MaxDepth' records for "
+ "sample %r; please rebuild!"
+ % (filename, row["Name"]))
+
+ max_depths[row["Name"]] = row["MaxDepth"]
+ except (OSError, IOError), error:
+ raise MakefileError("Error reading depth-histogram (%s): %s"
+ % (filename, error))
+
+ if sample in max_depths:
+ max_depth = max_depths[sample]
+ else:
+ name_counts = {}
+ name_mapping = {}
+ for cand_sample, cand_max in max_depths.iteritems():
+ name = cand_sample.split('.', 1)[0]
+ name_mapping[name] = cand_sample
+ name_counts[name] = name_counts.get(name, 0) + 1
+
+ if name_mapping.get(sample) == 1:
+ # Sample name (with some extensions) found
+ # This is typical if 'paleomix depths' has been run manually.
+ max_depth = max_depths[name_mapping[sample]]
+ elif len(max_depths) == 1:
+ # Just one sampel in the depth histogram; even though it does not
+ # match, we assuem that this is the correct table. This is because
+ # manually generating files / renaming files would otherwise cause
+ # failure when using 'MaxDepth: auto'.
+ (cand_sample, max_depth), = max_depths.items()
+ print_warn(" - Name in depths file not as expected; "
+ "found %r, not %r:"
+ % (cand_sample, sample))
+
+ if max_depth is None:
+ raise MakefileError("MaxDepth for %r not found in depth-histogram: %r"
+ % (sample, filename))
+ elif max_depth == "NA":
+ raise MakefileError("MaxDepth is not calculated for sample %r; "
+ "cannot determine MaxDepth values automatically."
+ % (filename,))
+ elif not max_depth.isdigit():
+ raise MakefileError("MaxDepth is not a valid for sample %r in %r; "
+ "expected integer, found %r."
+ % (sample, filename, max_depth))
+
+ max_depth = int(max_depth)
+
+ print_info(" - %s.%s = %i" % (sample, prefix, max_depth))
+ _DEPTHS_CACHE[filename] = max_depth
+ return max_depth
+
+
+_DEPTHS_CACHE = {}
+
+
+def _check_indels_and_msa(mkfile):
+ msa = mkfile["MultipleSequenceAlignment"]
+ regions = mkfile["Project"]["Regions"]
+ for (name, subdd) in regions.iteritems():
+ msa_enabled = msa[name]["Enabled"]
+
+ if subdd["IncludeIndels"] and not msa_enabled:
+ raise MakefileError("Regions %r includes indels, but MSA is disabled!" % (name,))
+
+
def _update_sample_sets(mkfile):
    """Resolve sample/group selections for the phylogenetic inference and
    codeml sections into concrete sets of sample names.
    """
    samples = mkfile["Project"]["Samples"]
    groups = mkfile["Project"]["Groups"]

    for key, settings in mkfile["PhylogeneticInference"].items():
        prefix = "PhylogeneticInference:%s" % (key,)

        settings["ExcludeSamples"] = _select_samples(
            settings["ExcludeSamples"], groups, samples,
            prefix + ":ExcludeSamples")

        # None is treated as an empty selection to simplify downstream code
        settings["RootTreesOn"] = _select_samples(
            settings["RootTreesOn"] or (), groups, samples,
            prefix + ":RootTreesOn")

    codeml = mkfile["PAML"]["codeml"]
    codeml["ExcludeSamples"] = _select_samples(
        codeml["ExcludeSamples"], groups, samples,
        "PAML:codeml:ExcludeSamples")
+
+
+def _update_genotyping(mkfile):
+ genotyping = mkfile["Genotyping"]
+ defaults = genotyping.pop("Defaults")
+ defaults.setdefault("Padding", 5)
+ defaults["VCF_Filter"].setdefault("MaxReadDepth", 0)
+
+ for (key, subdd) in genotyping.iteritems():
+ if subdd.get("GenotypeEntirePrefix"):
+ message = "GenotypeEntirePrefix is only allowed for prefixes " \
+ "using default parameters, but is set for %r" % (key,)
+ raise MakefileError(message)
+
+ for key in mkfile["Project"]["Regions"]:
+ subdd = fill_dict(genotyping.get(key, {}), defaults)
+ subdd["Random"]["--padding"] = subdd["Padding"]
+ genotyping[key] = subdd
+
+ regions = set(genotyping)
+ unknown_regions = regions - set(mkfile["Project"]["Regions"])
+ if unknown_regions:
+ raise MakefileError("Unknown Regions of Interest in Genotyping: %s" \
+ % (", ".join(unknown_regions),))
+
+
+def _update_msa(mkfile):
+ msa = mkfile["MultipleSequenceAlignment"]
+ defaults = msa.pop("Defaults")
+ defaults.setdefault("Program", "MAFFT")
+ defaults["MAFFT"].setdefault("Algorithm", "MAFFT")
+
+ for key in mkfile["Project"]["Regions"]:
+ msa[key] = fill_dict(msa.get(key, {}), defaults)
+
+ unknown_regions = set(msa) - set(mkfile["Project"]["Regions"])
+ if unknown_regions:
+ raise MakefileError("Unknown Regions of Interest in Genotyping: %s" \
+ % (", ".join(unknown_regions),))
+
+
# Recursive definition of sample tree: the sample table contains named
# samples and/or "<group>" keys, which recursively contain more samples.
_VALIDATION_SUBSAMPLE_KEY = And(StringStartsWith("<"),
                                StringEndsWith(">"))
_VALIDATION_SAMPLES_KEY = And(IsStr, Not(_VALIDATION_SUBSAMPLE_KEY))
_VALIDATION_SAMPLES = {
    _VALIDATION_SAMPLES_KEY: {
        "GenotypingMethod": StringIn(("reference sequence",
                                      "random sampling",
                                      "samtools"),
                                     default="samtools"),
        "SpeciesName": IsStr,  # Not used; left for backwards compatibility
        "CommonName": IsStr,   # Not used; left for backwards compatibility
        "Sex": IsStr(),
        "Gender": IsStr(),  # Deprecated alias for "Sex"; see _check_sexes
    }
}
# Tie the knot: groups may be nested to arbitrary depth
_VALIDATION_SAMPLES[_VALIDATION_SUBSAMPLE_KEY] = _VALIDATION_SAMPLES

# Genotyping settings; note that explicit lists must not be used here, to allow
# proper inheritance of default values. Use IsListOf instead.
_VALIDATION_GENOTYPES = {
    "Padding": IsUnsignedInt,
    "GenotypeEntirePrefix": IsBoolean(default=False),
    "MPileup": {
        StringStartsWith("-"): Or(IsInt, IsStr, IsNone),
    },
    "BCFTools": {
        StringStartsWith("-"): Or(IsInt, IsStr, IsNone),
    },
    "Random": {
        "--min-distance-to-indels": IsUnsignedInt,
    },
    "VCF_Filter": {
        # Either one value for all samples, a per-sample table, or "auto"
        # (read from .depths files); see _update_and_check_max_read_depth
        "MaxReadDepth": Or(IsUnsignedInt, IsDictOf(IsStr, IsUnsignedInt),
                           StringIn(("auto",))),

        # NOTE(review): "ambigious" is misspelled, but presumably matches
        # the vcf_filter tool's actual option name -- confirm before fixing
        "--keep-ambigious-genotypes": IsNone,
        "--min-quality": IsUnsignedInt,
        "--min-allele-frequency": IsFloat,
        "--min-mapping-quality": IsUnsignedInt,
        "--min-read-depth": IsUnsignedInt,
        "--max-read-depth": IsUnsignedInt,
        "--min-num-alt-bases": IsUnsignedInt,
        "--min-distance-to-indels": IsUnsignedInt,
        "--min-distance-between-indels": IsUnsignedInt,
        "--min-strand-bias": IsFloat,
        "--min-baseq-bias": IsFloat,
        "--min-mapq-bias": IsFloat,
        "--min-end-distance-bias": IsFloat,
    },
}

_VALIDATION_MSA = {
    "Enabled": IsBoolean(default=True),
    "Program": StringIn(("mafft",)),  # TODO: Add support for other programs

    "MAFFT": {
        "Algorithm": StringIn(("mafft", "auto",
                               "FFT-NS-1", "FFT-NS-2", "FFT-NS-i",
                               "NW-INS-i", "L-INS-i", "E-INS-i", "G-INS-i")),
        StringStartsWith("-"): CLI_PARAMETERS,
    },
}


# Top-level schema for phylo-pipeline makefiles; see read_makefiles
_VALIDATION = {
    "Project": {
        "Title": IsStr(default="Untitled"),
        "Samples": _VALIDATION_SAMPLES,
        "RegionsOfInterest": {
            IsStr: {
                "Prefix": IsStr(default=REQUIRED_VALUE),
                "Realigned": IsBoolean(default=False),
                "ProteinCoding": IsBoolean(default=False),
                "IncludeIndels": IsBoolean(default=True),
                "HomozygousContigs": {
                    IsStr: Or(IsNone, IsListOf(IsStr)),

                    # The sex 'NA' defaults to no homozygous chromosomes
                    "NA": Or(IsNone, IsListOf(IsStr),
                             default=[]),
                },
            },
        },
        "FilterSingletons": {
            IsStr: [IsStr],
        },
    },
    "Genotyping": {
        "Defaults": _VALIDATION_GENOTYPES,
        IsStr: _VALIDATION_GENOTYPES,
    },
    "MultipleSequenceAlignment": {
        "Defaults": _VALIDATION_MSA,
        IsStr: _VALIDATION_MSA,
    },
    "PhylogeneticInference": {
        IsStr: {
            # Which program to use; TODO: Add support for other programs
            "Program": StringIn(("examl",), default="examl"),
            # Exclude one or more samples from the phylogeny
            "ExcludeSamples": [IsStr],
            # Which samples to root the final trees on / or midpoint rooting
            "RootTreesOn": [IsStr],
            # Create a tree per gene, for each region of interest,
            # or create a supermatrix tree from all regions specified.
            "PerGeneTrees": IsBoolean(default=False),
            # Selection of regions of interest / settings per region
            "RegionsOfInterest": {
                IsStr: {
                    "Partitions": Or(And(IsStr,
                                         ValuesSubsetOf("123456789X")),
                                     ValueIn([False]),
                                     default=REQUIRED_VALUE),
                    "SubsetRegions": Or(IsStr, IsNone, default=None),
                },
            },
            "SubsetRegions": {
                IsStr: IsStr,
            },
            "ExaML": {
                "Bootstraps": IsUnsignedInt(default=100),
                "Replicates": IsUnsignedInt(default=1),
                # NOTE(review): the default "gamma" differs in case from
                # the allowed values; presumably StringIn is
                # case-insensitive -- verify in paleomix.common.makefile
                "Model": StringIn(("GAMMA", "PSR"),
                                  default="gamma"),
            }
        }
    },
    "PAML": {
        "codeml": {
            "ExcludeSamples": [IsStr],
            "SubsetRegions": {
                IsStr: IsStr,
            },
            IsStr: {
                "ControlFile": IsStr(default=REQUIRED_VALUE),
                "TreeFile": IsStr(default=REQUIRED_VALUE),
            },
        },
    },
}
diff --git a/paleomix/tools/phylo_pipeline/mkfile.py b/paleomix/tools/phylo_pipeline/mkfile.py
new file mode 100755
index 0000000..bda2b91
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/mkfile.py
@@ -0,0 +1,233 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import sys
+
+
+_TEMPLATE = """# -*- mode: Yaml; -*-
+Project:
+ Title: PROJECT_NAME
+
+ # List of samples to be included in the analytical steps, which may be
+ # grouped using any arbitrary number of (levels of) groups. (Sub)groups
+ # are not required, but may be used instead of listing individual samples
+ # in 'ExcludeSamples' and 'FilterSingletons'.
+ Samples:
+ <GROUP>:
+ <SUBGROUP>:
+ SAMPLE_NAME:
+ # Sex of the sample; used to filter SNPs on homozygous
+ # contigs (see below). If not relevant, the value 'NA' may be used.
+ Sex: ...
+ # Method to use when genotyping samples (see 'Genotyping');
+ # defaults to 'SAMTools' if not explicitly specified.
+# Genotyping Method: ...
+
+ # Specifies a set of regions of interest, each representing one or more
+ # named regions in a reference sequence (e.g. genes) in BED format.
+ RegionsOfInterest:
+ NAME:
+ # Name of the prefix; is expected to correspond to the filename
+ # of the FASTA file without the extension / the name of the
+ # prefix used in the BAM pipeline.
+ Prefix: PREFIX_NAME
+ # If true, BAM files are expected to have the postfix ".realigned";
+ # allows easier interopterability with the BAM pipeline.
+ Realigned: yes
+ # Specifies whether or not the sequences are protein coding; if true
+ # indels are only included in the final sequence if the length is
+ # divisible by 3.
+ ProteinCoding: no
+ # Include indels in final sequence; note that indels are always called,
+ # and used to filter SNPs, even if not included in the resulting FASTA
+ # sequences. Requires that 'MultipleSequenceAlignment' is enabled
+ IncludeIndels: yes
+ # List of contigs for which heterozygous SNPs should be filtered
+ # (site set to 'N') based on sex; All sexes used in the 'Samples'
+ # section must be listed, except for 'NA' which defaults to no contigs.
+ HomozygousContigs:
+ Female:
+ - chrM
+ Male:
+ - chrX
+ - chrY
+ - chrM
+
+ # Filter sites in a sample, replacing any nucleotide not observed
+ # in the specified list of samples or groups with 'N'.
+# FilterSingletons:
+# NAME_OF_SAMPLE:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+
+Genotyping:
+ # Default settings for all regions of interest
+ Defaults:
+ # Regions of interest are expanded by this number of bases when calling
+ # SNPs, in order to ensure that adjacent indels can be used during filtering
+ # (VCF_filter --min-distance-to-indels and --min-distance-between-indels).
+ # The final sequences does not include the padding.
+ Padding: 10
+
+ # By default, each set of regions of interest are genotyped seperately,
+ # even if these overlap. By setting this option to true, the entire prefix
+ # is genotyped once, and all regions of interest are extracted from this.
+ # This can only be done for prefixes that only use genotyping defaults.
+ GenotypeEntirePrefix: no
+
+ # Settings for genotyping by random sampling of nucletoides at each site
+ Random:
+ # Min distance of variants to indels
+ --min-distance-to-indels: 2
+
+ MPileup:
+ -E: # extended BAQ for higher sensitivity but lower specificity
+ -A: # count anomalous read pairs
+
+ BCFTools:
+ -g: # Call genotypes at variant sites
+
+ VCF_Filter:
+ # Maximum coverage acceptable for genotyping calls; if set to zero, the
+ # default vcf_filter value is used; if set to 'auto', the MaxDepth value
+ # will be read from the depth histograms generated by the BAM pipeline.
+ MaxReadDepth: 0
+
+ # Minimum coverage acceptable for genotyping calls
+ --min-read-depth: 8
+ # Min RMS mapping quality
+ --min-mapping-quality: 10
+ # Min QUAL score (Phred) for genotyping calls
+ --min-quality: 30
+ # Min distance of variants to indels
+ --min-distance-to-indels: 2
+ # Min distance between indels
+ --min-distance-between-indels: 10
+ # Min P-value for strand bias (given PV4)
+ --min-strand-bias: 1.0e-4
+ # Min P-value for baseQ bias (given PV4)
+ --min-baseq-bias: 1.0e-4
+ # Min P-value for mapQ bias (given PV4)
+ --min-mapq-bias: 1.0e-4
+ # Min P-value for end distance bias (given PV4)
+ --min-end-distance-bias: 1.0e-4
+ # Max frequency of the major allele at heterozygous sites
+ --min-allele-frequency: 0.2
+ # Minimum number of alternative bases observed for variants
+ --min-num-alt-bases: 2
+
+# Add / overwrite default settings for a set of regions
+# NAME_OF_REGIONS:
+# ...
+
+
+MultipleSequenceAlignment:
+ # Default settings for all regions of interest
+ Defaults:
+ Enabled: yes
+
+ # Multiple sequence alignment using MAFFT
+ MAFFT:
+ # Select alignment algorithm; valid values are 'mafft', 'auto', 'fft-ns-1',
+ # 'fft-ns-2', 'fft-ns-i', 'nw-ns-i', 'l-ins-i', 'e-ins-i', and 'g-ins-i'.
+ Algorithm: G-INS-i
+
+ # Parameters for mafft algorithm; see above for example of how to specify
+ --maxiterate: 1000
+
+# Add / overwrite default settings for a set of regions
+# NAME_OF_REGIONS:
+# ...
+
+
+PhylogeneticInference:
+ PHYLOGENY_NAME:
+ # Exclude (groups of) samples from this analytical step
+# ExcludeSamples:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+ # Root the final tree(s) on one or more samples; if no samples
+ # are specified, the tree(s) will be rooted on the midpoint(s)
+# RootTreesOn:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+ # If 'yes', a tree is generated per named sequence in the areas of
+ # interest; otherwise a super-matrix is created from the combined set
+ # of regions specfied below.
+ PerGeneTrees: no
+
+ # Which Regions Of Interest to build the phylogeny from.
+ RegionsOfInterest:
+ REGIONS_NAME:
+ # Partitioning scheme for sequences: Numbers specify which group a
+ # position belongs to, while 'X' excludes the position from the final
+ # partioned sequence; thus "123" splits sequences by codon-positions,
+ # while "111" produces a single partition per gene. If set to 'no',
+ # a single partition is used for the entire set of regions.
+ Partitions: "111"
+ # Limit analysis to a subset of a RegionOfInterest; subsets are expected to be
+ # located at <genome root>/<prefix>.<region name>.<subset name>.names, and
+ # contain single name (corresponding to column 4 in the BED file) per line.
+# SubsetRegions: SUBSET_NAME
+
+ ExaML:
+ # Number of times to perform full phylogenetic inference
+ Replicates: 1
+ # Number of bootstraps to compute
+ Bootstraps: 100
+ # Model of rate heterogeneity (GAMMA or PSR)
+ Model: GAMMA
+"""
+
+_NOT_ENABLED = """
+PAML:
+ # Run codeml on each named sequence in the regions of interest
+ codeml:
+# Exclude (groups of) samples from this analytical step
+# ExcludeSamples:
+# - <NAME_OF_GROUP>
+# - NAME_OF_SAMPLE
+
+ # Limit analysis to a subset of a RegionOfInterest; subsets are expected to be
+ # located at <genome root>/<prefix>.<region name>.<subset name>.names, and
+ # contain single name (corresponding to column 4 in the BED file) per line.
+# SubsetRegions:
+# REGIONS_NAME: SUBSET_NAME
+
+ # One or more 'codeml' runs; name is used as a postfix for results.
+ RUN_NAME:
+ # Control file template; the values 'seqfile', 'treefile'
+ # automatically set to the approriate values.
+ ControlFile: PATH_TO_CODEML_CONTROL_FILE
+ # 'treefile' in the control-file is set to this value
+ TreeFile: PATH_TO_CODEML_TREEFILE
+"""
+
+
def main(_argv):
    """Print the makefile template to stdout; arguments are ignored.

    Returns None, resulting in exit code 0 via sys.exit below.
    """
    print _TEMPLATE

if __name__ == "__main__":
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/phylo_pipeline/parts/__init__.py b/paleomix/tools/phylo_pipeline/parts/__init__.py
new file mode 100644
index 0000000..d8a7467
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/parts/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/paleomix/tools/phylo_pipeline/parts/genotype.py b/paleomix/tools/phylo_pipeline/parts/genotype.py
new file mode 100644
index 0000000..554b9bc
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/parts/genotype.py
@@ -0,0 +1,388 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from copy import deepcopy
+
+from paleomix.atomiccmd.builder import \
+ apply_options
+from paleomix.nodes.samtools import \
+ TabixIndexNode, \
+ FastaIndexNode, \
+ BAMIndexNode
+from paleomix.nodes.bedtools import \
+ PaddedBedNode
+from paleomix.nodes.sequences import \
+ ExtractReferenceNode
+from paleomix.common.fileutils import \
+ swap_ext, \
+ add_postfix
+from paleomix.nodes.commands import \
+ VCFPileupNode, \
+ VCFFilterNode, \
+ BuildRegionsNode, \
+ SampleRegionsNode, \
+ GenotypeRegionsNode
+
+
def apply_samtools_options(builder, options, argument):
    """Transfers samtools/bcftools options from the makefile to a command
    builder, as repeated instances of 'argument'.

    Parameters:
      builder: An AtomicCmd builder (c.f. paleomix.atomiccmd.builder).
      options: A dict (or sequence of key/value pairs) of options; a value
          of None indicates that the option takes no argument.
      argument: The command-line argument (e.g. '--mpileup-argument') to
          which each formatted option is attached.
    """
    # Use .items() rather than .iteritems(); iterating lazily over a freshly
    # built temporary dict gains nothing, and .items() is Py2/Py3 compatible.
    for (key, value) in dict(options).items():
        sam_argument = key
        if value is not None:
            sam_argument = "%s=%s" % (key, value)

        builder.add_option(argument, sam_argument, sep="=")
+
+
+###############################################################################
+###############################################################################
+
# Caches for nodes shared between multiple tasks; each cache is keyed so that
# repeated requests for the same input return the same node, instead of
# multiple nodes clobbering each other's output files.
_BAI_CACHE = {}  # BAM filename -> BAM index node
_FAI_CACHE = {}  # FASTA filename -> FASTA index node
_BED_CACHE = {}  # padded BED filename -> padded-BED node
_VCF_CACHE = {}  # (BAM filename, output prefix) -> (VCF filename, node)
+
+
def build_bam_index_node(bamfile):
    """Returns a node generating a BAI index (using SAMTools) for a BAM file;
    results are cached, so that multiple requests for the same BAM yield the
    same node rather than clobbering each other's output files.
    """
    try:
        return _BAI_CACHE[bamfile]
    except KeyError:
        node = BAMIndexNode(infile=bamfile)
        _BAI_CACHE[bamfile] = node
        return node
+
+
def build_fasta_index_node(reference):
    """Returns a (cached) node generating a FAI index for a FASTA file;
    caching ensures one index node per reference file."""
    try:
        return _FAI_CACHE[reference]
    except KeyError:
        node = FastaIndexNode(infile=reference)
        _FAI_CACHE[reference] = node
        return node
+
+
def build_regions_nodes(regions, padding, dependencies=()):
    """Returns a (bedfile, nodes) tuple for the BED file of a set of regions
    of interest, padded by 'padding' basepairs.

    When no padding is requested, the original BED file is returned along
    with the (unchanged) dependencies; otherwise a cached node generating
    the padded BED file is returned. In both cases the second value is a
    tuple of the nodes that must complete before the BED file can be used.
    """
    if not padding:
        # No padding; the raw BED file can be used as-is. Normalize the
        # dependencies to a tuple, matching the padded branch below, so that
        # callers can safely concatenate the result with other node tuples.
        return regions["BED"], tuple(dependencies)

    destination = add_postfix(regions["BED"], ".padded_%ibp" % (padding,))

    if destination not in _BED_CACHE:
        # Padding requires contig lengths, i.e. a FASTA index
        dependencies = list(dependencies)
        dependencies.append(build_fasta_index_node(regions["FASTA"]))
        _BED_CACHE[destination] \
            = PaddedBedNode(fai_file=regions["FASTA"] + ".fai",
                            infile=regions["BED"],
                            outfile=destination,
                            amount=padding,
                            dependencies=dependencies)

    return destination, (_BED_CACHE[destination],)
+
+
def _apply_vcf_filter_options(vcffilter, genotyping, sample):
    """Applies the makefile 'VCF_Filter' settings to a VCFFilterNode builder
    and finalizes it; returns the resulting node."""
    settings = genotyping["VCF_Filter"]
    filter_cmd = vcffilter.commands["filter"]

    apply_options(filter_cmd, settings)

    # A falsy per-sample cutoff disables the max-depth filter entirely
    max_depth = settings["MaxReadDepth"][sample]
    if max_depth:
        filter_cmd.set_option("--max-read-depth", max_depth)

    return vcffilter.build_node()
+
+
def build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                   dependencies):
    """Locates the input BAM and the BED file (if any) used when genotyping
    'sample' for a set of regions, and sets up the required indexing nodes.

    Returns (prefix, bamfile, bedfile, dependencies): 'prefix' is the output
    prefix for genotyping results, 'bamfile' the input alignment, 'bedfile'
    is None when the entire prefix is genotyped, and 'dependencies' includes
    the node indexing the BAM.
    """
    # BAMs are expected at <samples_root>/<sample>.<prefix>.bam, with an
    # extra ".realigned" postfix when local realignment was enabled
    bamfile = "%s.%s.bam" % (sample, regions["Prefix"])
    bamfile = os.path.join(options.samples_root, bamfile)
    if regions["Realigned"]:
        bamfile = add_postfix(bamfile, ".realigned")

    prefix = regions["Genotypes"][sample]
    padding, bedfile = genotyping["Padding"], None
    if not genotyping["GenotypeEntirePrefix"]:
        # Genotype only the (optionally padded) regions of interest
        bedfile, nodes = \
            build_regions_nodes(regions, padding, dependencies)
        bai_node = build_bam_index_node(bamfile)
        # NOTE(review): assumes 'nodes' is a tuple; when padding is disabled,
        # build_regions_nodes returns the input dependencies unchanged --
        # confirm callers always pass a tuple.
        dependencies = nodes + (bai_node,)
    else:
        # The whole BAM is genotyped once and shared between ROIs, using a
        # single TEMP prefix placed next to the per-ROI genotype files
        prefix = os.path.join(os.path.dirname(prefix),
                              "%s.%s.TEMP" % (sample, regions["Prefix"]))

        dependencies += (build_bam_index_node(bamfile),)

    return prefix, bamfile, bedfile, dependencies
+
+
def build_genotyping_nodes_cached(options, genotyping, sample, regions,
                                  dependencies):
    """Carries out genotyping, filtering of calls, and indexing of files for a
    given sample and prefix. If the option 'GenotypeEntirePrefix' is enabled,
    the BAM is genotyped once, and each set of RegionsOfInterest simply extract
    the relevant regions during construction of the consensus sequence.

    Parameters:
      options: An options object (c.f. paleomix.tools.phylo_pipeline.config).
      genotyping: Genotyping options defined for a specific set of areas of
        interest, corresponding to Genotyping:NAME in the makefile.
      sample: The name of the sample to be genotyped.
      regions: A dictionary for a 'RegionsOfInterest' from the makefile.
      dependencies: Dependencies that must be met before genotyping starts.

    Returns a tuple containing the filename of the filtered and tabix-indexed
    VCF file, and the top-level node generating this file. Multiple calls for
    the same BAM and prefix will return the same VCF and nodes if the option
    for 'GenotypeEntirePrefix' is enabled, otherwise each ROI is genotyped
    individually.

    Output files are generated in ./results/PROJECT/genotyping. If the option
    for 'GenotypeEntirePrefix' is enabled, the following files are generated:
      SAMPLE.PREFIX.vcf.bgz: Unfiltered calls for variant/non-variant sites.
      SAMPLE.PREFIX.vcf.pileup.bgz: Pileup of sites containing SNPs.
      SAMPLE.PREFIX.vcf.pileup.bgz.tbi: Tabix index of the pileup.
      SAMPLE.PREFIX.filtered.vcf.bgz: Variant calls filtered with vcf_filter.
      SAMPLE.PREFIX.filtered.vcf.bgz.tbi: Tabix index for the filtered VCF.

    If 'GenotypeEntirePrefix' is not enabled for a given ROI, the following
    files are generated for that ROI (see descriptions above):
      SAMPLE.PREFIX.ROI.filtered.vcf.bgz
      SAMPLE.PREFIX.ROI.filtered.vcf.bgz.tbi
      SAMPLE.PREFIX.ROI.vcf.bgz
      SAMPLE.PREFIX.ROI.vcf.pileup.bgz
      SAMPLE.PREFIX.ROI.vcf.pileup.bgz.tbi

    In addition, the following files are generated for each set of
    RegionsOfInterest (ROI), regardless of the 'GenotypeEntirePrefix' option:
      SAMPLE.PREFIX.ROI.CDS.fasta: FASTA sequence of each feature in the ROI.
      SAMPLE.PREFIX.ROI.CDS.fasta.fai: FASTA index generated using SAMTools.

    """
    output_prefix, bamfile, bedfile, dependencies \
        = build_genotyping_bedfile_nodes(options, genotyping, sample, regions,
                                         dependencies)

    # Re-use the cached node graph when this BAM/prefix pair was already
    # genotyped (c.f. 'GenotypeEntirePrefix')
    if (bamfile, output_prefix) in _VCF_CACHE:
        return _VCF_CACHE[(bamfile, output_prefix)]

    calls = swap_ext(output_prefix, ".vcf.bgz")
    pileups = swap_ext(output_prefix, ".vcf.pileup.bgz")
    filtered = swap_ext(output_prefix, ".filtered.vcf.bgz")

    # 1. Call samtools mpileup | bcftools view on the bam
    genotype = GenotypeRegionsNode.customize(reference=regions["FASTA"],
                                             bedfile=bedfile,
                                             infile=bamfile,
                                             outfile=calls,
                                             nbatches=options.samtools_max_threads,
                                             dependencies=dependencies)

    apply_samtools_options(genotype.command, genotyping["MPileup"],
                           "--mpileup-argument")
    apply_samtools_options(genotype.command, genotyping["BCFTools"],
                           "--bcftools-argument")
    genotype = genotype.build_node()

    # 2. Collect pileups of sites with SNPs, to allow proper filtering by
    #    frequency of the minor allele, as only the major non-ref allele is
    #    counted in the VCF (c.f. field DP4).
    vcfpileup = VCFPileupNode.customize(reference=regions["FASTA"],
                                        infile_bam=bamfile,
                                        infile_vcf=calls,
                                        outfile=pileups,
                                        dependencies=genotype)
    apply_samtools_options(vcfpileup.command, genotyping["MPileup"],
                           "--mpileup-argument")
    vcfpileup = vcfpileup.build_node()

    vcf_tabix = TabixIndexNode(infile=pileups,
                               preset="pileup",
                               dependencies=vcfpileup)

    # 3. Filter all sites using the 'vcf_filter' command
    vcffilter = VCFFilterNode.customize(infile=calls,
                                        pileup=pileups,
                                        outfile=filtered,
                                        regions=regions,
                                        dependencies=vcf_tabix)
    vcffilter = _apply_vcf_filter_options(vcffilter, genotyping, sample)

    # 4. Tabix index. This allows random-access to the VCF file when building
    #    the consensus FASTA sequence later in the pipeline.
    tabix = TabixIndexNode(infile=filtered,
                           preset="vcf",
                           dependencies=vcffilter)

    _VCF_CACHE[(bamfile, output_prefix)] = (filtered, tabix)
    return filtered, tabix
+
+
def build_genotyping_nodes(options, genotyping, sample, regions, dependencies):
    """Builds the nodes required for genotyping a BAM, in part or in whole.

    By default, only the region of interest (including padding) will be
    genotyped. However, if option 'GenotypeEntirePrefix' is enabled, the entire
    genome is genotyped, and reused between different areas of interest.

    In addition to the files generated by 'build_genotyping_nodes_cached', this
    function generates the following files:
        SAMPLE.PREFIX.ROI.fasta: FASTA containing each named region.
        SAMPLE.PREFIX.ROI.fasta.fai: Index file built using "samtools faidx"

    The function returns a sequence of the top-level nodes generating the files.

    """
    # 1. Get path of the filtered VCF file, and the associated node
    filtered, node = build_genotyping_nodes_cached(options=options,
                                                   genotyping=genotyping,
                                                   sample=sample,
                                                   regions=regions,
                                                   dependencies=dependencies)

    # 2. Generate consensus sequence from filtered VCF
    output_fasta = regions["Genotypes"][sample]
    builder = BuildRegionsNode.customize(infile=filtered,
                                         bedfile=regions["BED"],
                                         outfile=output_fasta,
                                         padding=genotyping["Padding"],
                                         dependencies=node)
    if regions["ProteinCoding"]:
        # Avoid frame-shifts in protein-coding sequences
        builder.command.set_option("--whole-codon-indels-only")
    if not regions["IncludeIndels"]:
        builder.command.set_option("--ignore-indels")
    builder = builder.build_node()

    # 3. Index sequences to make retrieval easier for MSA
    faidx = FastaIndexNode(infile=output_fasta,
                           dependencies=builder)

    return (faidx,)
+
+
def build_sampling_nodes(options, genotyping, sample, regions, dependencies):
    """Builds nodes implementing the 'random sampling' genotyping method: a
    pileup is generated for the (optionally padded) regions of interest, and
    a sequence is constructed from it (c.f. SampleRegionsNode).

    Generates the per-sample FASTA file, its .fai index, and an intermediate
    tabix-indexed pileup; returns a tuple with the top-level (faidx) node.
    """
    fasta_file = regions["Genotypes"][sample]
    pileup_file = swap_ext(fasta_file, ".pileup.bgz")

    padding = genotyping["Padding"]
    slop, node = build_regions_nodes(regions, padding, dependencies)

    # BAM location mirrors build_genotyping_bedfile_nodes
    bam_file = "%s.%s.bam" % (sample, regions["Prefix"])
    bam_file = os.path.join(options.samples_root, bam_file)
    if regions["Realigned"]:
        bam_file = add_postfix(bam_file, ".realigned")
    bai_node = build_bam_index_node(bam_file)

    # Pileup only; no actual variant calling is performed for this method
    genotype = GenotypeRegionsNode.customize(pileup_only=True,
                                             reference=regions["FASTA"],
                                             bedfile=slop,
                                             infile=bam_file,
                                             outfile=pileup_file,
                                             nbatches=options.samtools_max_threads,
                                             dependencies=node + (bai_node,))
    apply_samtools_options(genotype.command, genotyping["MPileup"],
                           "--mpileup-argument")
    genotype = genotype.build_node()

    tabix = TabixIndexNode(infile=pileup_file,
                           preset="pileup",
                           dependencies=genotype)

    builder = SampleRegionsNode(infile=pileup_file,
                                bedfile=regions["BED"],
                                outfile=fasta_file,
                                dependencies=tabix)

    faidx = FastaIndexNode(infile=fasta_file,
                           dependencies=builder)

    return (faidx,)
+
+
def build_reference_nodes(options, genotyping, sample, regions, dependencies):
    """Builds nodes implementing the 'reference sequence' genotyping method:
    the per-sample sequences are extracted directly from a reference FASTA,
    rather than being genotyped from a BAM; returns a tuple containing the
    node that indexes the resulting FASTA file.
    """
    source = os.path.join(options.refseq_root,
                          "%s.%s.fasta" % (regions["Prefix"], sample))
    destination = os.path.join(options.destination, "genotypes",
                               "%s.%s.fasta" % (sample, regions["Desc"]))

    # Extraction requires the contig lengths, i.e. a FASTA index
    deps = list(dependencies) + [build_fasta_index_node(regions["FASTA"])]

    extract = ExtractReferenceNode(reference=source,
                                   bedfile=regions["BED"],
                                   outfile=destination,
                                   dependencies=deps)

    return (FastaIndexNode(infile=destination, dependencies=extract),)
+
+
# Functions used to carry out each of the supported genotyping methods; keys
# correspond to the lower-cased 'GenotypingMethod' values from the makefile
# (c.f. build_sample_nodes). Each function shares the same signature and
# returns a sequence of top-level nodes.
_GENOTYPING_METHODS = {
    "reference sequence": build_reference_nodes,
    "random sampling": build_sampling_nodes,
    "samtools": build_genotyping_nodes,
}
+
+
def build_sample_nodes(options, genotyping, regions_sets, sample,
                       dependencies=()):
    """Builds genotyping nodes for one sample across all regions of interest.

    Parameters:
      options: An options object (c.f. paleomix.tools.phylo_pipeline.config).
      genotyping: The 'Genotyping' makefile section, keyed by region name.
      regions_sets: 'RegionsOfInterest' dictionaries, keyed by name.
      sample: Sample dictionary; reads 'Name', 'Sex' and 'GenotypingMethod'.
      dependencies: Nodes that must complete before genotyping starts.

    Returns a list of top-level nodes; raises RuntimeError if the sample
    specifies an unknown genotyping method.
    """
    nodes = []
    # .values() rather than .itervalues(): identical for read-only iteration
    # and Py2/Py3 compatible
    for regions in regions_sets.values():
        # Work on a copy, as the regions dict is shared between samples
        regions = deepcopy(regions)

        # Enforce homozygous contigs based on sex tag
        regions["HomozygousContigs"] \
            = regions["HomozygousContigs"][sample["Sex"]]

        genotyping_method = sample["GenotypingMethod"].lower()
        if genotyping_method not in _GENOTYPING_METHODS:
            # Raise rather than 'assert False'; assertions are stripped when
            # running under 'python -O', which would turn this into an
            # obscure KeyError below.
            raise RuntimeError("Unexpected genotyping method %r for sample %r"
                               % (genotyping_method, sample["Name"]))

        genotyping_function = _GENOTYPING_METHODS[genotyping_method]
        node = genotyping_function(options=options,
                                   genotyping=genotyping[regions["Name"]],
                                   sample=sample["Name"],
                                   regions=regions,
                                   dependencies=dependencies)
        nodes.extend(node)

    return nodes
+
+
def chain(pipeline, options, makefiles):
    """Pipeline entry point: builds genotyping nodes for every sample in each
    makefile, and replaces each makefile's top-level nodes with the result.

    The destination is temporarily re-pointed at a per-project subfolder
    while each makefile is processed, and restored afterwards.
    """
    root = options.destination
    for makefile in makefiles:
        project = makefile["Project"]
        options.destination = os.path.join(root, project["Title"])

        nodes = []
        for sample in project["Samples"].itervalues():
            nodes.extend(build_sample_nodes(options,
                                            makefile["Genotyping"],
                                            project["Regions"],
                                            sample,
                                            makefile["Nodes"]))

        makefile["Nodes"] = tuple(nodes)
    options.destination = root
diff --git a/paleomix/tools/phylo_pipeline/parts/msa.py b/paleomix/tools/phylo_pipeline/parts/msa.py
new file mode 100755
index 0000000..693c692
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/parts/msa.py
@@ -0,0 +1,98 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+from paleomix.atomiccmd.builder import \
+ apply_options
+from paleomix.nodes.sequences import \
+ CollectSequencesNode, \
+ FilterSingletonsNode
+from paleomix.nodes.mafft import \
+ MAFFTNode
+
+import paleomix.common.fileutils as fileutils
+
+
def build_msa_nodes(options, settings, regions, filtering, dependencies):
    """Builds multiple-sequence-alignment nodes for a set of regions: the
    per-sequence FASTA files are collected, optionally aligned using MAFFT,
    and optionally filtered for singleton positions.

    Returns a list of the top-level nodes generating the final files.
    """
    if settings["Program"].lower() != "mafft":
        raise RuntimeError("Only MAFFT support has been implemented!")

    sequencedir = os.path.join(options.destination, "alignments", regions["Name"])
    # Run on full set of sequences
    sequences = regions["Sequences"][None]

    node = CollectSequencesNode(fasta_files=regions["Genotypes"],
                                destination=sequencedir,
                                sequences=sequences,
                                dependencies=dependencies)

    if settings["Enabled"]:
        fasta_files = {}
        algorithm = settings["MAFFT"]["Algorithm"]
        for sequence in sequences:
            input_file = os.path.join(sequencedir, sequence + ".fasta")
            output_file = os.path.join(sequencedir, sequence + ".afa")

            mafft = MAFFTNode.customize(input_file=input_file,
                                        output_file=output_file,
                                        algorithm=algorithm,
                                        dependencies=node)
            # Remaining makefile options are passed to the MAFFT command
            apply_options(mafft.command, settings["MAFFT"])
            fasta_files[output_file] = mafft.build_node()
    else:
        # MSA disabled; the collected (unaligned) files are used as-is
        fasta_files = dict((filename, node) for filename in node.output_files)

    if not any(filtering.itervalues()):
        return fasta_files.values()

    destination = sequencedir + ".filtered"
    filtering = dict(filtering)
    filtered_nodes = []

    for (filename, node) in fasta_files.iteritems():
        output_filename = fileutils.reroot_path(destination, filename)
        filtered_node = FilterSingletonsNode(input_file=filename,
                                             output_file=output_filename,
                                             filter_by=filtering,
                                             dependencies=node)

        filtered_nodes.append(filtered_node)

    return filtered_nodes
+
+
def chain(_pipeline, options, makefiles):
    """Pipeline entry point: builds MSA nodes for every set of regions in
    each makefile, and replaces the makefile's top-level nodes with the
    result; the destination is re-pointed at a per-project subfolder while
    processing each makefile, and restored afterwards."""
    root = options.destination  # Move to makefile
    for makefile in makefiles:
        project = makefile["Project"]
        options.destination = os.path.join(root, project["Title"])

        msa_settings = makefile["MultipleSequenceAlignment"]
        filtering = project["FilterSingletons"]

        nodes = []
        for regions in project["Regions"].itervalues():
            nodes.extend(build_msa_nodes(options,
                                         msa_settings[regions["Name"]],
                                         regions,
                                         filtering,
                                         makefile["Nodes"]))

        makefile["Nodes"] = tuple(nodes)
    options.destination = root
diff --git a/paleomix/tools/phylo_pipeline/parts/paml.py b/paleomix/tools/phylo_pipeline/parts/paml.py
new file mode 100755
index 0000000..75c19c6
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/parts/paml.py
@@ -0,0 +1,170 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import re
+
+from paleomix.node import CommandNode, NodeError
+from paleomix.atomiccmd.command import AtomicCmd
+from paleomix.atomiccmd.sets import SequentialCmds
+from paleomix.common.formats.fasta import FASTA
+from paleomix.common.utilities import safe_coerce_to_frozenset
+
+import paleomix.common.fileutils as fileutils
+
+
class CodemlNode(CommandNode):
    """Runs PAML's 'codeml' on a single control/sequence/tree combination and
    archives every output file into one tar.gz.

    The user's control file is rewritten during setup so that 'seqfile',
    'treefile', and 'outfile' point at fixed node-local filenames; sequences
    named in 'exclude_groups' are omitted from the sequence file.
    """

    def __init__(self, control_file, sequence_file, trees_file, output_tar,
                 exclude_groups=(), dependencies=()):
        """See class docstring; 'output_tar' is the path of the final
        tar.gz archive of codeml's output files."""
        self._exclude_groups = safe_coerce_to_frozenset(exclude_groups)
        self._control_file = control_file
        self._sequence_file = sequence_file
        self._trees_file = trees_file

        # codeml reads the node-local 'template.*' files created in _setup
        paml_cmd = AtomicCmd(["codeml", "template.ctl"],
                             IN_CONTROL_FILE = control_file,
                             IN_SEQUENCE_FILE = sequence_file,
                             IN_TREES_FILE = trees_file,
                             TEMP_OUT_CTL = "template.ctl",
                             TEMP_OUT_SEQS = "template.seqs",
                             TEMP_OUT_TREES = "template.trees",
                             TEMP_OUT_STDOUT = "template.stdout",
                             TEMP_OUT_STDERR = "template.stderr",
                             TEMP_OUT_4FOLD = "4fold.nuc",
                             IN_STDIN = "/dev/null",  # Prevent prompts from blocking
                             set_cwd = True,
                             **CodemlNode._get_codeml_files("TEMP_OUT_CODEML"))

        # All codeml output files are collected into a single archive
        tar_pairs = CodemlNode._get_codeml_files("TEMP_IN_CODEML")
        tar_files = ["%%(%s)s" % (key,) for key in tar_pairs]
        tar_cmd = AtomicCmd(["tar", "cvzf", "%(OUT_FILE)s"] + tar_files,
                            OUT_FILE = output_tar,
                            set_cwd = True,
                            **tar_pairs)

        CommandNode.__init__(self,
                             description = "<CodemlNode: %r -> %r>" % (sequence_file, output_tar),
                             command = SequentialCmds([paml_cmd, tar_cmd]),
                             dependencies = dependencies)

    def _setup(self, _config, temp):
        """Creates the node-local control, tree, and sequence files before
        codeml runs; excluded groups are dropped from the sequences here."""
        self._update_ctl_file(source = self._control_file,
                              destination = os.path.join(temp, "template.ctl"))

        os.symlink(os.path.abspath(self._trees_file), os.path.join(temp, "template.trees"))
        with open(os.path.join(temp, "template.seqs"), "w") as handle:
            for record in FASTA.from_file(self._sequence_file):
                if record.name not in self._exclude_groups:
                    name = record.name
                    sequence = record.sequence.upper()
                    handle.write("%s\n" % (FASTA(name, None, sequence),))

    def _run(self, config, temp):
        """Runs the commands; if codeml failed with a final "Giving up."
        message on stdout, that message is appended to the NodeError so the
        cause is visible to the user."""
        try:
            CommandNode._run(self, config, temp)
        except NodeError, error:
            # [1, None]: codeml exited with code 1 and tar never ran
            if self._command.join() == [1, None]:
                with open(fileutils.reroot_path(temp, "template.stdout")) as handle:
                    lines = handle.readlines()
                if lines and ("Giving up." in lines[-1]):
                    error = NodeError("%s\n\n%s" % (error, lines[-1]))
            raise error

    @classmethod
    def _update_ctl_file(cls, source, destination):
        """Copies the control file, forcing 'seqfile', 'treefile', and
        'outfile' to fixed node-local names; each key must occur exactly
        once in the template."""
        with open(source) as handle:
            template = handle.read()

        # TODO: Do check before running everything!
        template, count = re.subn(r'(\bseqfile\s*=).*', r'\1 template.seqs', template)
        assert count == 1, count
        template, count = re.subn(r'(\btreefile\s*=).*', r'\1 template.trees', template)
        assert count == 1, count
        template, count = re.subn(r'(\boutfile\s*=).*', r'\1 mlc', template)
        assert count == 1, count

        with open(destination, "w") as handle:
            handle.write(template)

    @classmethod
    def _get_codeml_files(cls, key_type):
        """Returns a dict mapping AtomicCmd keys (prefixed with 'key_type')
        to the fixed set of files produced by a codeml run."""
        results = {}
        codeml_files = ["mlc", "2NG.dN", "2NG.dS", "2NG.t",
                        "lnf", "rst", "rst1", "rub"]

        for filename in codeml_files:
            key = "%s_%s" % (key_type, filename.upper().replace(".", "_"))
            results[key] = filename
        return results
+
+
def build_codeml_nodes(options, settings, regions, filtering, dependencies):
    """Builds a CodemlNode for each combination of named 'codeml' run and
    sequence in the given set of regions.

    Note that 'settings' is the whole makefile; codeml options are read from
    settings["PAML"]["codeml"], and the input directory/extension depend on
    whether singleton-filtering and MSA were enabled for these regions.
    """
    in_postfix, out_postfix, afa_ext = "", "", ".afa"
    if any(filtering.itervalues()):
        in_postfix = out_postfix = ".filtered"
    if not settings["MultipleSequenceAlignment"][regions["Name"]]["Enabled"]:
        out_postfix = ".unaligned" + out_postfix
        afa_ext = ".fasta"

    codeml = settings["PAML"]["codeml"]
    subset_key = codeml["SubsetRegions"].get(regions["Name"])
    sequences = regions["Sequences"][subset_key]
    sequencedir = os.path.join(options.destination, "alignments", regions["Name"] + in_postfix)
    destination = os.path.join(options.destination, "paml", "codeml", regions["Name"] + out_postfix)

    fasta_files = {}
    for sequence in sequences:
        fasta_files[sequence] = os.path.join(sequencedir, sequence + afa_ext)

    codeml_nodes = []
    for (ctl_name, ctl_files) in codeml.iteritems():
        # This dictionary also contains the "ExcludeSamples" option
        if ctl_name in ("ExcludeSamples", "SubsetRegions"):
            continue

        for (sequence, filename) in fasta_files.iteritems():
            output_tar = os.path.join(destination, "%s.%s.tar.gz" % (sequence, ctl_name))
            # Paths may contain a '{Name}' placeholder for the sequence name
            ctl_file = ctl_files["ControlFile"].format(Name=sequence)
            tree_file = ctl_files["TreeFile"].format(Name=sequence)

            node = CodemlNode(control_file=ctl_file,
                              trees_file=tree_file,
                              sequence_file=filename,
                              output_tar=output_tar,
                              exclude_groups=codeml["ExcludeSamples"],
                              dependencies=dependencies)
            codeml_nodes.append(node)

    return codeml_nodes
+
+
def chain_codeml(_pipeline, options, makefiles):
    """Pipeline entry point: builds codeml nodes for every set of regions in
    each makefile, and replaces the makefile's top-level nodes with the
    result; the destination is re-pointed at a per-project subfolder while
    processing each makefile, and restored afterwards."""
    root = options.destination  # Move to makefile
    for makefile in makefiles:
        project = makefile["Project"]
        options.destination = os.path.join(root, project["Title"])
        filtering = project["FilterSingletons"]

        nodes = []
        for regions in project["Regions"].itervalues():
            nodes.extend(build_codeml_nodes(options, makefile, regions,
                                            filtering, makefile["Nodes"]))
        makefile["Nodes"] = tuple(nodes)
    options.destination = root
diff --git a/paleomix/tools/phylo_pipeline/parts/phylo.py b/paleomix/tools/phylo_pipeline/parts/phylo.py
new file mode 100644
index 0000000..b72d2c5
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/parts/phylo.py
@@ -0,0 +1,260 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import random
+import collections
+
+from paleomix.nodes.formats import \
+ FastaToPartitionedInterleavedPhyNode as ToPhylipNode
+from paleomix.nodes.raxml import \
+ RAxMLParsimonyTreeNode
+from paleomix.nodes.phylip import \
+ PHYLIPBootstrapNode
+from paleomix.nodes.examl import \
+ ExaMLNode, \
+ ExaMLParserNode
+from paleomix.nodes.newick import \
+ NewickRerootNode, \
+ NewickSupportNode
+from paleomix.common.fileutils import \
+ swap_ext, \
+ add_postfix
+
+
def _build_supermatrix(destination, input_files, exclude_samples, subset_files,
                       dependencies):
    """Builds two partitioned, interleaved PHYLIP supermatrices from the
    input alignments: the full matrix and a 'reduced' matrix with empty
    columns stripped; returns the pair of nodes."""
    prefix = os.path.join(destination, "alignments")
    shared = {"infiles": input_files,
              "exclude_groups": exclude_samples,
              "dependencies": dependencies,
              "file_dependencies": subset_files}

    # This supermatrix (and partitions) is not used by the pipeline, but is
    # built for reference purposes; may be made optional in the future.
    full_matrix = ToPhylipNode(out_prefix=prefix, **shared)

    # Supermatrix with empty columns (all N, n, or -) stripped
    reduced_matrix = ToPhylipNode(reduce=True,
                                  out_prefix=prefix + ".reduced",
                                  **shared)

    return (full_matrix, reduced_matrix)
+
+
def _examl_nodes(options, settings, input_alignment, input_partitions,
                 input_binary, output_template, dependencies):
    """Builds one ExaML inference: a RAxML parsimony starting tree followed
    by the ML search itself; returns the finalized ExaML node."""
    starting_tree = output_template % ("parsimony_tree",)

    parsimony = RAxMLParsimonyTreeNode(input_alignment=input_alignment,
                                       input_partitions=input_partitions,
                                       output_tree=starting_tree,
                                       dependencies=dependencies)

    examl = ExaMLNode.customize(input_binary=input_binary,
                                initial_tree=starting_tree,
                                output_template=output_template,
                                threads=options.examl_max_threads,
                                dependencies=parsimony)

    # Rate-heterogeneity model (GAMMA or PSR) from the makefile
    examl.command.set_option("-m", settings["ExaML"]["Model"].upper())

    return examl.build_node()
+
+
def _build_rerooted_trees(nodes, reroot_on):
    """Collects the '.result' tree files produced by a set of ExaML nodes and
    builds a node rerooting them on the given taxa; the combined trees are
    written to '<directory of the results>.newick'."""
    tree_files = [filename
                  for node in nodes
                  for filename in node.output_files
                  if filename.endswith(".result")]

    output_file = os.path.dirname(tree_files[0]) + ".newick"

    return NewickRerootNode(tree_files=tree_files,
                            output_file=output_file,
                            taxa=reroot_on,
                            dependencies=nodes)
+
+
def _build_examl_replicates(options, phylo, destination, input_alignment,
                            input_partition, dependencies):
    """Builds 'ExaML:Replicates' full phylogenetic inferences on the reduced
    supermatrix; returns a node rerooting the resulting trees, or None when
    no replicates were requested."""
    binary_file = os.path.join(destination, "alignments.reduced.binary")
    binary = ExaMLParserNode(input_alignment=input_alignment,
                             input_partition=input_partition,
                             output_file=binary_file,
                             dependencies=dependencies)

    replicate_dir = os.path.join(destination, "replicates")

    replicates = []
    for index in range(phylo["ExaML"]["Replicates"]):
        # '%%s' survives formatting as a '%s' placeholder used by ExaML
        template = os.path.join(replicate_dir, "replicate.%04i.%%s" % (index,))
        replicates.append(_examl_nodes(options, phylo, input_alignment,
                                       input_partition, binary_file,
                                       template, binary))

    if not replicates:
        return None

    return _build_rerooted_trees(replicates, phylo["RootTreesOn"])
+
+
def _build_examl_bootstraps(options, phylo, destination, input_alignment, input_partition, dependencies):
    """Builds 'ExaML:Bootstraps' bootstrap inferences: each pseudo-replicate
    alignment is generated with PHYLIP-style resampling, converted to ExaML's
    binary format, and analyzed with ExaML. Returns a node rerooting the
    resulting trees on 'RootTreesOn', or None when no bootstraps were
    requested."""
    bootstraps = []
    num_bootstraps = phylo["ExaML"]["Bootstraps"]
    bootstrap_destination = os.path.join(destination, "bootstraps")
    bootstrap_template = os.path.join(bootstrap_destination, "bootstrap.%04i.phy")

    for bootstrap_num in xrange(num_bootstraps):
        bootstrap_alignment = bootstrap_template % (bootstrap_num,)
        # Each bootstrap replicate is resampled with an independent seed
        bootstrap = PHYLIPBootstrapNode(input_alignment = input_alignment,
                                        input_partition = input_partition,
                                        output_alignment = bootstrap_alignment,
                                        seed = random.randint(1, 2**32 - 1),
                                        dependencies = dependencies)

        bootstrap_binary = swap_ext(bootstrap_alignment, ".binary")
        # The '%s' placeholder is filled in when ExaML names its output files
        bootstrap_final = swap_ext(bootstrap_alignment, ".%s")
        bs_binary = ExaMLParserNode(input_alignment = bootstrap_alignment,
                                    input_partition = input_partition,
                                    output_file = bootstrap_binary,
                                    dependencies = bootstrap)

        bootstraps.append(_examl_nodes(options = options,
                                       settings = phylo,
                                       input_alignment = bootstrap_alignment,
                                       input_partitions = input_partition,
                                       input_binary = bootstrap_binary,
                                       output_template = bootstrap_final,
                                       dependencies = bs_binary))

    if bootstraps:
        return _build_rerooted_trees(bootstraps, phylo["RootTreesOn"])

    return None
+
+
def add_bootstrap_support(destination, replicate, bootstrap):
    """Returns a tuple of the top-level phylogeny node(s).

    When both a replicate tree node and a bootstrap node exist, a single
    NewickSupportNode annotating the replicate trees with bootstrap support
    values is returned; otherwise whichever node exists (possibly neither)
    is returned. Both branches return an explicit tuple: the original
    implementation relied on an easy-to-miss trailing-comma 1-tuple and on
    'filter(None, ...)', whose return type differs between Python versions.
    """
    if not (replicate and bootstrap):
        # One or both phases disabled; return whichever node(s) exist
        return tuple(node for node in (replicate, bootstrap) if node)

    replicate_file = os.path.join(destination, "replicates.newick")
    bootstrap_file = os.path.join(destination, "bootstraps.newick")
    output_file = add_postfix(replicate_file, ".support")

    support = NewickSupportNode(main_tree_files=replicate_file,
                                support_tree_files=bootstrap_file,
                                output_file=output_file,
                                dependencies=(bootstrap, replicate))

    # Explicit 1-tuple, so callers can uniformly 'extend' with the result
    return (support,)
+
+
def _build_examl_nodes(options, settings, destination, input_files,
                       subset_files, dependencies):
    """Builds the full ExaML analysis for one supermatrix: the matrix itself,
    replicate inferences, bootstraps, and (when both exist) bootstrap support
    values; returns the top-level node(s)."""
    alignment = os.path.join(destination, "alignments.reduced.phy")
    partition = os.path.join(destination, "alignments.reduced.partitions")

    supermatrix = _build_supermatrix(destination, input_files,
                                     settings["ExcludeSamples"],
                                     subset_files, dependencies)

    shared_args = (options, settings, destination, alignment, partition,
                   supermatrix)
    replicates = _build_examl_replicates(*shared_args)
    bootstraps = _build_examl_bootstraps(*shared_args)

    return add_bootstrap_support(destination, replicates, bootstraps)
+
+
+def _build_examl_per_gene_nodes(options, settings, run_dd, roi, destination, filtering, dependencies):
+ """Builds one independent ExaML analysis per sequence (gene) in the given
+ region of interest.
+ """
+ regions = settings["Project"]["Regions"][roi["Name"]]
+ sequences = regions["Sequences"][roi["SubsetRegions"]]
+ subset_files = regions["SubsetFiles"][roi["SubsetRegions"]]
+ filtering_postfix = ".filtered" if any(filtering.itervalues()) else ""
+ sequence_dir = os.path.join(options.destination, "alignments", roi["Name"] + filtering_postfix)
+ # Aligned (.afa) inputs are only available if MSA was enabled for region.
+ msa_enabled = settings["MultipleSequenceAlignment"][regions["Name"]]["Enabled"]
+ fasta_extension = ".afa" if msa_enabled else ".fasta"
+
+ # Default partition scheme when none is set for the region.
+ partitions = roi["Partitions"] or "111"
+
+ nodes = []
+ for sequence in sequences:
+ seq_source = os.path.join(sequence_dir, sequence + fasta_extension)
+ seq_destination = os.path.join(destination, sequence)
+ input_files = {sequence : {"partitions" : partitions, "filenames" : [seq_source]}}
+ nodes.extend(_build_examl_nodes(options, run_dd, seq_destination, input_files, subset_files, dependencies))
+
+ return nodes
+
+
+def _build_examl_regions_nodes(options, settings, run_dd, destination, filtering, dependencies):
+ """Builds a single ExaML analysis spanning all regions of interest.
+
+ Regions with a partition scheme contribute one entry per sequence, while
+ unpartitioned regions are collapsed into a single partition ("1").
+ """
+ input_files = collections.defaultdict(dict)
+ subset_files = []
+ for (roi_name, roi_dd) in run_dd["RegionsOfInterest"].iteritems():
+ regions = settings["Project"]["Regions"][roi_name]
+ subset_key = roi_dd.get("SubsetRegions")
+ sequences = regions["Sequences"][subset_key]
+ subset_files.extend(regions["SubsetFiles"][subset_key])
+
+ partitions = roi_dd["Partitions"]
+ filtering_postfix = ".filtered" if any(filtering.itervalues()) else ""
+ sequence_dir = os.path.join(options.destination, "alignments", roi_name + filtering_postfix)
+ msa_enabled = settings["MultipleSequenceAlignment"][regions["Name"]]["Enabled"]
+ fasta_extension = ".afa" if msa_enabled else ".fasta"
+
+ if partitions:
+ for sequence in sequences:
+ seq_source = os.path.join(sequence_dir, sequence + fasta_extension)
+
+ input_files[sequence] = {"partitions": partitions,
+ "filenames": [seq_source]}
+ else:
+ filenames = []
+ for sequence in sequences:
+ filenames.append(os.path.join(sequence_dir, sequence + fasta_extension))
+
+ input_files[roi_name] = {"partitions": "1",
+ "filenames": filenames}
+
+ return _build_examl_nodes(options, run_dd, destination, dict(input_files), subset_files, dependencies)
+
+
+def build_phylogeny_nodes(options, settings, filtering, dependencies):
+ """Builds phylogeny nodes for every named run in 'PhylogeneticInference',
+ either one tree per gene or a single tree across all regions of interest.
+ """
+ nodes = []
+ for (run_name, run_dd) in settings["PhylogeneticInference"].iteritems():
+ destination = os.path.join(options.destination, "phylogenies", run_name)
+
+ if run_dd["PerGeneTrees"]:
+ run_nodes = []
+ for roi in run_dd["RegionsOfInterest"].itervalues():
+ roi_destination = os.path.join(destination, roi["Name"])
+ run_nodes.extend(_build_examl_per_gene_nodes(options, settings, run_dd, roi, roi_destination, filtering, dependencies))
+ nodes.extend(run_nodes)
+ else:
+ nodes.extend(_build_examl_regions_nodes(options, settings, run_dd, destination, filtering, dependencies))
+
+ return nodes
+
+
+def chain_examl(_pipeline, options, makefiles):
+ """Pipeline chaining hook: attaches phylogeny nodes to each makefile.
+
+ Temporarily points options.destination at the per-makefile directory
+ (named after the project title), then restores it afterwards.
+ """
+ destination = options.destination # Move to makefile
+ for makefile in makefiles:
+ filtering = makefile["Project"]["FilterSingletons"]
+ options.destination = os.path.join(destination, makefile["Project"]["Title"])
+
+ makefile["Nodes"] = build_phylogeny_nodes(options, makefile, filtering, makefile["Nodes"])
+ options.destination = destination
diff --git a/paleomix/tools/phylo_pipeline/pipeline.py b/paleomix/tools/phylo_pipeline/pipeline.py
new file mode 100644
index 0000000..4d6eb77
--- /dev/null
+++ b/paleomix/tools/phylo_pipeline/pipeline.py
@@ -0,0 +1,137 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import logging
+import os
+import sys
+import time
+
+import paleomix.logger
+import paleomix.resources
+import paleomix.tools.phylo_pipeline.mkfile as mkfile
+import paleomix.ui
+import paleomix.yaml
+
+from paleomix.pipeline import Pypeline
+from paleomix.common.console import print_err
+from paleomix.tools.phylo_pipeline.makefile import \
+ MakefileError, \
+ read_makefiles
+from paleomix.tools.phylo_pipeline.config import \
+ ConfigError, \
+ parse_config, \
+ select_commands
+
+
+def main(argv):
+ try:
+ config, args = parse_config(argv)
+ except ConfigError, error:
+ print_err(error)
+ return 1
+
+ if not args or ("help" in args):
+ return 0
+ elif args[0] in ("example", "examples"):
+ if paleomix.resources.copy_example("phylo_pipeline", argv[1:]):
+ return 1
+
+ # Update interpreter to match the one currently in use;
+ # this is required since we may be running from a virtual env
+ filename = os.path.join(argv[1],
+ 'phylo_pipeline',
+ 'synthesize_reads.py')
+
+ with open(filename) as handle:
+ header, lines = handle.read().split('\n', 1)
+
+ with open(filename, 'w') as handle:
+ handle.write('#!%s\n' % (os.path.abspath(sys.executable, )))
+ handle.write(lines)
+
+ return 0
+ elif (len(args) < 2) and ("mkfile" not in args and "makefile" not in args):
+ print_err("\nPlease specify at least one makefile!")
+ return 1
+
+ commands = select_commands(args.pop(0))
+ if any((cmd in ("makefile", "mkfile")) for (cmd, _) in commands):
+ return mkfile.main(args[1:])
+
+ if not os.path.exists(config.temp_root):
+ try:
+ os.makedirs(config.temp_root)
+ except OSError, error:
+ print_err("ERROR: Could not create temp root:\n\t%s" % (error,))
+ return 1
+
+ if not os.access(config.temp_root, os.R_OK | os.W_OK | os.X_OK):
+ print_err("ERROR: Insufficient permissions for temp root: '%s'"
+ % (config.temp_root,))
+ return 1
+
+ # Init worker-threads before reading in any more data
+ pipeline = Pypeline(config)
+
+ try:
+ makefiles = read_makefiles(config, args, commands)
+ except (MakefileError, paleomix.yaml.YAMLError, IOError), error:
+ print_err("Error reading makefiles:",
+ "\n %s:\n " % (error.__class__.__name__,),
+ "\n ".join(str(error).split("\n")))
+ return 1
+
+ logfile_template = time.strftime("phylo_pipeline.%Y%m%d_%H%M%S_%%02i.log")
+ paleomix.logger.initialize(config, logfile_template)
+ logger = logging.getLogger(__name__)
+
+ for (command_key, command_func) in commands:
+ logger.info("Building %s pipeline ...", command_key)
+ command_func(pipeline, config, makefiles)
+
+ for makefile in makefiles:
+ if "Nodes" in makefile:
+ pipeline.add_nodes(makefile["Nodes"])
+
+ if config.list_input_files:
+ logger.info("Printing output files ...")
+ pipeline.print_input_files()
+ return 0
+ elif config.list_output_files:
+ logger.info("Printing output files ...")
+ pipeline.print_output_files()
+ return 0
+ elif config.list_executables:
+ logger.info("Printing required executables ...")
+ pipeline.print_required_executables()
+ return 0
+ elif config.dot_file:
+ logger.info("Writing dependency graph to %r ...", config.dot_file)
+ if not pipeline.to_dot(config.dot_file):
+ return 1
+ return 0
+
+ if not pipeline.run(max_threads=config.max_threads,
+ dry_run=config.dry_run,
+ progress_ui=config.progress_ui):
+ return 1
+ return 0
diff --git a/paleomix/tools/rmdup_collapsed.py b/paleomix/tools/rmdup_collapsed.py
new file mode 100755
index 0000000..eb5e9ec
--- /dev/null
+++ b/paleomix/tools/rmdup_collapsed.py
@@ -0,0 +1,182 @@
+#!/usr/bin/env python
+
+"""
+Stripped down version of 'FilterUniqueBAM' by
+:Author: Martin Kircher
+:Contact: Martin.Kircher at eva.mpg.de
+:Date: *08.10.2011
+:Type: tool
+:Input: BAM
+:Output: BAM
+
+Mark/Filter PCR duplicates for merged PE reads. Reads BAM
+from STDIN and writes BAM to STDOUT. All non-collapsed reads,
+as well as secondary/chimeric alignments, reads that have
+failed QC, and unmapped reads, are written to STDOUT as is.
+
+The input is assumed to be sorted by coordinates, and this
+order is preserved, though individual reads at the same
+position may be re-arranged.
+"""
+
+import sys
+import pysam
+import random
+
+from argparse import ArgumentParser
+
+
def calc_consensus(reads, rng=random.random):
    """Selects the representative read among a group of PCR duplicates.

    The read with the greatest summed base-quality wins; reads without
    quality scores receive a random score in (-1; 0], so any read that has
    qualities is preferred over one that does not. The winner is annotated
    with an "XP" tag counting the duplicates it represents, including any
    "XP" counts already carried by the input reads.
    """
    duplicate_count = len(reads)
    best_read = None
    best_score = -1
    for candidate in reads:
        quality = candidate.qual
        if quality is None:
            # Random value in (-1; 0] breaks ties among quality-less reads.
            score = -rng()
        else:
            score = sum(ord(char) for char in quality)

        if score > best_score:
            best_score = score
            best_read = candidate

        # Accumulate PCR duplicate counts from earlier collapsing rounds.
        for tag_key, tag_value in candidate.tags:
            if tag_key == "XP":
                duplicate_count += tag_value

    if best_read.tags:
        best_read.tags = best_read.tags + [("XP", duplicate_count)]
    else:
        best_read.tags = [("XP", duplicate_count)]

    return best_read
+
+
+def get_consensus_se(reads):
+ """Returns the consensus read among single-end duplicates: reads sharing
+ the most frequent CIGAR are kept, and the best of those is selected by
+ calc_consensus.
+ """
+ # DETERMINE MOST FREQUENT CIGAR LINE
+ by_cigar = {}
+ cigar_count = {}
+ for read in reads:
+ tcigar = tuple(read.cigar)
+ if tcigar in by_cigar:
+ cigar_count[tcigar] += 1
+ by_cigar[tcigar].append(read)
+ else:
+ cigar_count[tcigar] = 1
+ by_cigar[tcigar] = [read]
+
+ # Sorted by (count, -len(str(cigar)), cigar); the last entry wins, so
+ # ties on count prefer the shorter CIGAR string representation.
+ to_sort = [(y, -len(str(x)), x) for (x, y) in cigar_count.iteritems()]
+ to_sort.sort()
+ selcigar = to_sort[-1][-1]
+ reads = by_cigar[selcigar]
+
+ return calc_consensus(reads)
+
+
def write_consensus_se(outfile, reads, remove_duplicates):
    """Marks every read except the consensus as a duplicate, then writes the
    reads to 'outfile'; duplicates are omitted when 'remove_duplicates'.
    """
    consensus = get_consensus_se(reads)
    for read in reads:
        is_duplicate = read is not consensus
        read.is_duplicate = is_duplicate
        if remove_duplicates and is_duplicate:
            continue
        outfile.write(read)
+
+
def _flush_buffer(outfile, curvariants, remove_duplicates):
    """Writes every buffered group of candidate duplicates via
    write_consensus_se and empties the buffer.
    """
    for reads_and_count in curvariants.values():
        write_consensus_se(outfile, reads_and_count[0], remove_duplicates)
    curvariants.clear()
+
+
+_FILTERED_FLAGS = 0x1 # PE reads
+_FILTERED_FLAGS |= 0x4 # Unmapped
+_FILTERED_FLAGS |= 0x100 # Secondary alignment
+_FILTERED_FLAGS |= 0x200 # Failed QC
+_FILTERED_FLAGS |= 0x800 # Chimeric alignment
+
+
def parse_args(argv):
    """Parses command-line arguments for 'paleomix rmdup_collapsed'.

    Returns an argparse.Namespace with:
      input             -- BAM path, or "-" to read from STDIN (default).
      remove_duplicates -- if True, drop duplicates instead of flagging them.
      seed              -- RNG seed, or None to seed from system time.
    """
    usage = """paleomix rmdup_collapsed [options] < sorted.bam > out.bam

The rmdup_collapsed filters a BAM file for PCR duplicates among unpaired reads under
the assumption that any unpaired read have been generated by the merging of
overlapping paired-end reads, and thereby represent the complete template
sequence. PCR duplicates are therefore detected based on both the 5' and 3'
alignment coordinate.

Paired reads (0x1), unmapped reads (0x4), secondary alignments (0x100),
reads that failed QC (0x200), and chimeric alignments (0x800), as identified
using the BAM record flags, are not filtered, but simply written to the output.

By default, filtered reads are flagged using the "duplicate" flag (0x400), and
written to the output. Use the --remove-duplicates command-line option to
instead remove these records from the output.
"""
    parser = ArgumentParser(usage=usage)
    parser.add_argument("input", default="-", nargs="?",
                        help="BAM file; if not set, input is read from STDIN.")
    parser.add_argument("--remove-duplicates",
                        help="Remove duplicates from output; by default "
                             "duplicates are only flagged (flag = 0x400).",
                        default=False, action="store_true")
    # FIX: the help text previously lacked a space between "assigned" and
    # "[default: ...]", producing "assigned[default: ...]".
    parser.add_argument("--seed", default=None, type=int,
                        help="Seed used for randomly selecting representative "
                             "reads when no reads have quality scores assigned "
                             "[default: initialized using system time].")

    return parser.parse_args(argv)
+
+
+def main(argv):
+ """Entry point: reads a coordinate-sorted BAM, marks or removes PCR
+ duplicates among collapsed (unpaired) reads, and writes BAM to STDOUT.
+ Returns 0 on success, 1 on error.
+ """
+ args = parse_args(argv)
+
+ # Initialize seed used when selecting among reads without quality scores
+ random.seed(args.seed)
+
+ if args.input == "-" and sys.stdin.isatty():
+ sys.stderr.write("STDIN is a terminal, terminating!\n")
+ return 1
+ elif sys.stdout.isatty():
+ sys.stderr.write("STDOUT is a terminal, terminating!\n")
+ return 1
+
+ with pysam.Samfile(args.input, "rb") as infile:
+ with pysam.Samfile("-", "wb", template=infile) as outfile:
+ curpos = None
+ curvariants = {}
+ for (read_num, read) in enumerate(infile):
+ if curpos and ((read.tid, read.pos) != curpos):
+ # Sort order is defined as ascending 'tid's and positions
+ if curpos > (read.tid, read.pos) and not read.is_unmapped:
+ sys.stderr.write("ERROR: Input file does not appear "
+ "to be sorted by coordinates at "
+ "record %i, aborting ...\n"
+ % (read_num,))
+ return 1
+
+ # Position changed; flush duplicates buffered at the
+ # previous coordinate.
+ _flush_buffer(outfile, curvariants,
+ args.remove_duplicates)
+ curpos = None
+
+ # Paired/unmapped/secondary/QC-fail/chimeric reads are
+ # passed through unmodified.
+ if read.flag & _FILTERED_FLAGS:
+ outfile.write(read)
+ continue
+
+ curpos = (read.tid, read.pos)
+ # Candidate duplicates share strand, position, and length.
+ nkey = (read.is_reverse, read.pos, read.alen)
+ if nkey in curvariants:
+ curvariants[nkey][0].append(read)
+ curvariants[nkey][1] += 1
+ else:
+ curvariants[nkey] = [[read], 1]
+
+ _flush_buffer(outfile, curvariants, args.remove_duplicates)
+
+ return 0
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/sample_pileup.py b/paleomix/tools/sample_pileup.py
new file mode 100755
index 0000000..014bee9
--- /dev/null
+++ b/paleomix/tools/sample_pileup.py
@@ -0,0 +1,248 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+from __future__ import with_statement
+
+import argparse
+import collections
+import heapq
+import itertools
+import random
+import sys
+
+import pysam
+
+import paleomix.common.sequences as sequences
+import paleomix.common.text as text
+
+from paleomix.common.bedtools import BEDRecord
+from paleomix.common.formats.fasta import FASTA
+
+
+# Regions are genotyped in chunks of 1 MB strings; this is done to reduce
+# overhead, as storing 1 MB of chars individually in a list adds about 45 MB of
+# overhead. A 100MB chromosome would therefore require ~4.5GB.
+_CHUNK_SIZE = 2 ** 20
+
+
+class Pileup(object):
+ """A single parsed line of 'samtools mpileup' output: exposes the 0-based
+ position, reference base, read depth, and raw string of observed bases.
+ """
+ def __init__(self, line):
+ fields = line.split("\t")
+
+ # Pileup positions are 1-based; converted to 0-based here.
+ self.position = int(fields[1]) - 1
+ self.reference = fields[2]
+ self.depth = int(fields[3])
+ self.observed = fields[4]
+
+
+class PileupRegion(object):
+ """Iterator over (position, sampled base) tuples for one region of a
+ tabix-indexed pileup file.
+
+ When dist_to_indels > 0, sites within that distance of an indel are
+ blacklisted and skipped; 'padding' extends the fetched window so that
+ indels just outside the region are still detected.
+ """
+ def __init__(self, tabix, chrom, start, end, padding=0,
+ dist_to_indels=0):
+ assert padding >= 0, "Padding must be >= 0, not %i" % (padding,)
+ self._tabix = tabix
+ self._chrom = chrom
+ self._start = start
+ self._end = end
+ self._padding = padding
+ self._distance_to_indels = dist_to_indels
+
+ def __iter__(self):
+ # Fast path: no indel-proximity filtering requested.
+ if self._distance_to_indels <= 0:
+ for pileup in self._tabix.fetch(reference=self._chrom,
+ start=self._start,
+ end=self._end):
+ yield (pileup.position, self._sample_position(pileup))
+ # NOTE(review): control falls through to the filtering code below,
+ # which would yield every in-range site a second time; a 'return'
+ # after this loop looks intended -- confirm against upstream.
+
+ # Note that bed.end is a past-the-end coordinate
+ start = max(0, self._start - self._padding)
+ end = self._end + self._padding
+
+ region = self._tabix.fetch(reference=self._chrom,
+ start=start,
+ end=end)
+ pileup_buffer = collections.deque()
+ blacklist = []
+
+ # Fill buffer, and detect blacklisted sites due to indels
+ for _ in xrange(max(self._padding, self._distance_to_indels) * 2):
+ self._add_to_buffer(region, pileup_buffer, blacklist)
+
+ while pileup_buffer:
+ position, nucleotide = pileup_buffer.popleft()
+ # Discard blacklist entries behind the current position.
+ while blacklist and blacklist[0] < position:
+ heapq.heappop(blacklist)
+
+ if not blacklist or blacklist[0] != position:
+ if self._start <= position < self._end:
+ yield (position, nucleotide)
+
+ self._add_to_buffer(region, pileup_buffer, blacklist)
+
+ def _add_to_buffer(self, region, pileup_buffer, blacklist):
+ """Reads one pileup line into the buffer, recording nearby indels."""
+ try:
+ pileup = Pileup(region.next())
+ self._collect_indels(pileup, blacklist)
+ pileup_buffer.append((pileup.position,
+ self._sample_position(pileup)))
+ except StopIteration:
+ # End of the fetched region; the buffer simply drains.
+ pass
+
+ def _collect_indels(self, pileup, blacklist):
+ """Pushes every position near an indel observed at this site onto the
+ 'blacklist' heap."""
+ previous = None
+ for (index, current) in enumerate(pileup.observed):
+ # '^' marks a read start; the following char is a mapping quality.
+ if previous == '^':
+ previous = current
+ continue
+
+ previous = current
+ if current == "+":
+ # Insertions do not themselves cover any bases
+ length = 0
+ elif current == "-":
+ # Deletion length is encoded as digits following the '-'.
+ len_slice = itertools.islice(pileup.observed, index + 1, None)
+ digits = "".join(itertools.takewhile(str.isdigit, len_slice))
+ length = int(digits)
+ else:
+ continue
+
+ # Distance is defined as sites overlapping INDELs having distance
+ # 0, sites adjacent to INDELS have distance 1, etc. Note that the
+ # INDEL starts on the next position of the current row.
+ start = pileup.position - self._distance_to_indels + 2
+ end = pileup.position + self._distance_to_indels + length
+ for position in xrange(start, end):
+ heapq.heappush(blacklist, position)
+
+ @classmethod
+ def _sample_position(cls, pileup):
+ """Returns one randomly chosen base observed at this site, or 'N' if
+ no usable bases were observed."""
+ skip = 0
+ bases = []
+ observed = pileup.observed.upper()
+ for current in observed:
+ if skip:
+ skip -= 1
+ elif current in ".,":
+ # '.'/',' denote matches to the reference base.
+ bases.append(pileup.reference)
+ elif current in "ACGT":
+ bases.append(current)
+ elif current == "^":
+ # Skip the mapping-quality char that follows a read start.
+ skip = 1
+ elif current not in "$*N+-1234567890":
+ assert False, current
+
+ if not bases:
+ return "N"
+
+ return random.choice(bases)
+
+
+def build_region(options, genotype, bed):
+ """Yields the sampled sequence for one BED record in chunks of at most
+ _CHUNK_SIZE characters; positions without usable data are 'N'.
+ """
+ # 'fetch' raises a ValueError if the VCF does not contain any entries for
+ # the specified contig, which can occur due to low coverage.
+ if bed.contig in genotype.contigs:
+ region = PileupRegion(genotype,
+ chrom=bed.contig,
+ start=bed.start,
+ end=bed.end,
+ padding=options.padding,
+ dist_to_indels=options.min_distance_to_indels)
+
+ remaining_length = (bed.end - bed.start)
+ offset = bed.start
+
+ # Genotyping is done in chunks, so that these can be reduced to strings
+ # and thereby reduce the memory requirements for larger regions.
+ chunk = ["N"] * min(_CHUNK_SIZE, remaining_length)
+
+ for position, nucleotide in region:
+ # Flush any full chunks preceding the current position.
+ while position >= offset + len(chunk):
+ yield "".join(chunk)
+ remaining_length -= len(chunk)
+ offset += len(chunk)
+ chunk = ["N"] * min(_CHUNK_SIZE, remaining_length)
+
+ chunk[position - offset] = nucleotide
+
+ # Flush remaining (possibly all-'N') chunks up to the region end.
+ while offset < bed.end:
+ yield "".join(chunk)
+ remaining_length -= len(chunk)
+ offset += len(chunk)
+ chunk = ["N"] * min(_CHUNK_SIZE, remaining_length)
+
+ yield "".join(chunk)
+ else:
+ # Contig absent from the pileup; emit an all-'N' sequence.
+ yield "N" * (bed.end - bed.start)
+
+
+def build_genes(options, genotype, regions):
+ """Yields (gene name, sequence) tuples; BED records sharing a name are
+ concatenated into one gene, and minus-strand genes are reverse-
+ complemented.
+ """
+ def keyfunc(bed):
+ return (bed.contig, bed.name, bed.start)
+ regions.sort(key=keyfunc)
+
+ for (gene, beds) in itertools.groupby(regions, lambda x: x.name):
+ sequence, beds = [], tuple(beds)
+ for bed in beds:
+ sequence.extend(build_region(options, genotype, bed))
+ sequence = "".join(sequence)
+
+ # Genes mixing strands are not supported.
+ if any((bed.strand == "-") for bed in beds):
+ assert all((bed.strand == "-") for bed in beds)
+
+ sequence = sequences.reverse_complement(sequence)
+
+ yield (gene, sequence)
+
+
def main(argv):
    """Entry point for 'paleomix sample_pileup'.

    Samples one base per site from a tabix-indexed pileup file within the
    regions given by a BED file, and writes one FASTA record per gene (BED
    name) to STDOUT. Always returns 0.
    """
    prog = "paleomix sample_pileup"
    usage = "%s [options] --genotype in.vcf --intervals in.bed > out.fasta" \
        % (prog,)

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    parser.add_argument("--genotype", help="Tabix indexed pileup file.",
                        required=True, metavar="PILEUP")
    parser.add_argument("--intervals", help="BED file.", required=True,
                        metavar="BED")
    # FIX: metavar was incorrectly "BED" (copy-paste); --padding takes a
    # number of bases.
    parser.add_argument("--padding", type=int, default=10, metavar="N",
                        help="Number of bases to expand intervals, when "
                             "filtering based on adjacent indels "
                             "[%(default)s]")
    parser.add_argument("--min-distance-to-indels", type=int, default=5,
                        help="Variants closer than this distance from indels "
                             "are filtered; set to a negative value to "
                             "disable [%(default)s].")
    args = parser.parse_args(argv)

    genotype = pysam.Tabixfile(args.genotype)
    with open(args.intervals) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    for (_, beds) in sorted(intervals.items()):
        for (name, sequence) in build_genes(args, genotype, beds):
            FASTA(name, None, sequence).write(sys.stdout)

    return 0
+
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/vcf_filter.py b/paleomix/tools/vcf_filter.py
new file mode 100755
index 0000000..cda379b
--- /dev/null
+++ b/paleomix/tools/vcf_filter.py
@@ -0,0 +1,85 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from __future__ import print_function
+
+import sys
+import errno
+import optparse
+import fileinput
+
+import pysam
+
+import paleomix.common.vcffilter as vcffilter
+
+
+def _read_files(filenames, args):
+ """Yields parsed VCF records from 'filenames' (or STDIN), echoing header
+ lines to STDOUT and inserting FILTER descriptions just before the first
+ non-'##' header line.
+ """
+ in_header = True
+ has_filters = False
+ reset_filter = (args.reset_filter == 'yes')
+ vcf_parser = pysam.asVCF()
+ for line in fileinput.input(filenames):
+ if not line.startswith("#"):
+ in_header = False
+ line = line.rstrip("\n\r")
+ vcf = vcf_parser(line, len(line))
+ if reset_filter:
+ # Discard existing FILTER values; they are re-computed by
+ # the filters applied downstream.
+ vcf.filter = '.'
+
+ yield vcf
+ elif in_header:
+ if not (line.startswith("##") or has_filters):
+ has_filters = True
+ for item in sorted(vcffilter.describe_filters(args).items()):
+ print('##FILTER=<ID=%s,Description="%s">' % item)
+
+ # Header lines still carry their newline, hence end="".
+ print(line, end="")
+
+
+def main(argv):
+ """Entry point for 'paleomix vcf_filter'; filters VCF records read from
+ the given files (or STDIN) and prints the result to STDOUT. Returns
+ None on success, so the process exits with status 0.
+ """
+ desc = "paleomix vcf_filter [options] [in1.vcf, ...]"
+ parser = optparse.OptionParser(desc)
+ parser.add_option('--reset-filter', default='no', choices=('yes', 'no'),
+ help="If set to 'yes', values in the 'FILTER' column "
+ "are cleared, and set according to the results "
+ "from running the filters implemented by this "
+ "tool. If set to 'no', any existing values are "
+ "retained, and any (new) failed filters are added "
+ "to these [default: %default].")
+
+ vcffilter.add_varfilter_options(parser)
+ (args, filenames) = parser.parse_args(argv)
+
+ if (not filenames or "-" in filenames) and sys.stdin.isatty():
+ parser.error("STDIN is a terminal, terminating!")
+
+ try:
+ for vcf in vcffilter.filter_vcfs(args, _read_files(filenames, args)):
+ print(vcf)
+ except IOError, error:
+ # Check for broken pipe (head, less, etc).
+ if error.errno != errno.EPIPE:
+ raise
+
+
+if __name__ == '__main__':
+ sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/vcf_to_fasta.py b/paleomix/tools/vcf_to_fasta.py
new file mode 100755
index 0000000..14523f9
--- /dev/null
+++ b/paleomix/tools/vcf_to_fasta.py
@@ -0,0 +1,362 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Creates consensus sequence from a VCF and BED file:
+vcf_to_fasta --intervals PATH_TO.bed --genotype PATH_TO.vcf
+
+The VCF file is expected to have been created using SAMTools, and makes use
+of SAMTools specific fields. For each region in the BED file, the script will
+create a consensus sequence containing all basepairs which are filtered as
+'PASS' or '.' in the VCF file; at each site, the most likely genotype is
+selected using the PL fields, and heterozygous sites encoded using IUPAC codes.
+
+BED features with the same name are merged into single sequences. To ensure
+that indels are called near sequence termini, the script expects the VCF file
+to contain a certain amount of padding around the regions of interest.
+
+"""
+from __future__ import print_function
+
+import argparse
+import copy
+import itertools
+import os
+import sys
+import re
+
+import pysam
+
+import paleomix.common.vcfwrap as vcfwrap
+import paleomix.common.text as text
+import paleomix.common.sequences as sequences
+import paleomix.common.utilities as utilities
+
+from paleomix.common.bedtools import BEDRecord
+
+
+# Max number of positions to keep in memory / genotype at once
+_SEQUENCE_CHUNK = 1024 * 1024 # 1 Mbp
+# Number of columns per line in FASTA sequences
+_FASTA_COLUMNS = 60
+
+_VCF_DICT = re.compile("##(.*)=<(.*)>")
+
+
+###############################################################################
+###############################################################################
+# Utility functions
+
+def flush_fasta(sequence):
+ """Takes a FASTA sequence as a string, fragments it into lines of exactly
+ _FASTA_COLUMNS chars (e.g. 60), and prints all complete lines. The final
+ incomplete line (if any) is returned.
+
+ """
+ for seq_frag in utilities.fragment(_FASTA_COLUMNS, sequence):
+ # A short fragment can only be the last one; return it so the caller
+ # can prepend it to the next chunk of sequence.
+ if len(seq_frag) < _FASTA_COLUMNS:
+ return seq_frag
+ print(seq_frag)
+ return ""
+
+
+def split_beds(beds, size=_SEQUENCE_CHUNK):
+ """Takes a list of beds, and splits each bed into chunks that are at most
+ 'size' bp long. The resulting (smaller) beds are returned as a new list.
+
+ """
+ results = []
+ for bed in beds:
+ for start in xrange(bed.start, bed.end, size):
+ end = min(start + size, bed.end)
+ # Shallow copy preserves the remaining BED fields (name, strand).
+ split_bed = copy.copy(bed)
+ split_bed.start = start
+ split_bed.end = end
+ results.append(split_bed)
+ return results
+
+
+###############################################################################
+###############################################################################
+# Genotyping functions
+
+def add_snp(options, snp, position, sequence):
+ """Writes the most likely genotype at a SNP site into 'sequence' at
+ 'position'; heterozygous calls become IUPAC ambiguity codes.
+ """
+ if snp.alt != ".":
+ genotype = "".join(vcfwrap.get_ml_genotype(snp, options.nth_sample))
+ encoded = sequences.encode_genotype(genotype)
+ else:
+ # No alternative allele called; the site matches the reference.
+ encoded = snp.ref
+ sequence[position] = encoded
+
+
+def add_indel(options, bed, indel, sequence):
+ """Applies a homozygous indel call to 'sequence' (a per-position list of
+ strings): deletions blank out the covered positions, insertions append
+ the inserted bases to the preceding position. Heterozygous calls, 'N'
+ genotypes, and (optionally) non-codon-sized indels are skipped.
+ """
+ if indel.alt == ".":
+ return
+
+ genotype = vcfwrap.get_ml_genotype(indel, options.nth_sample)
+ if genotype[0] != genotype[1]:
+ # No way to represent heterozygous indels
+ return
+ elif genotype[0] == "N":
+ # No most likely genotype
+ return
+
+ # Note that bed.end is a past-the-end coordinate
+ start = max(0, bed.start - options.padding)
+
+ # FIXME: parse_indel only supports a single 'alt' values
+ indel.alt = genotype[0]
+ indel = vcfwrap.parse_indel(indel)
+ if indel.in_reference:
+ del_start = max(indel.pos + 1, bed.start)
+ del_end = min(indel.pos + 1 + len(indel.what), bed.end)
+
+ if del_start >= del_end:
+ # Deletion does not cover any bases of interest
+ return
+ elif options.whole_codon_indels_only:
+ if (del_end - del_start) % 3:
+ # Non-codon sized overlap with area of interest
+ return
+
+ for position in range(del_start, del_end):
+ sequence[position - start] = ""
+ elif (len(indel.what) % 3 == 0) or not options.whole_codon_indels_only:
+ # parse_indel assumes that the insertion is always the first possible
+ # base when multiple positions are possible. As a consequence, the
+ # position may be before start, with the rest of the bases overlapping
+ # the current sequence. For example:
+ # ref = ATTT
+ # alt = ATTTT
+ # It is assumed that the insertion (_) happened thus:
+ # interpretation = A_TTT
+ if indel.pos >= start:
+ sequence[indel.pos - start] += indel.what
+
+
+def filter_vcfs(genotype, contig, start, end):
+ """Yields VCF records in [start, end) on 'contig' whose FILTER column is
+ 'PASS' or '.'; yields nothing for contigs absent from the VCF.
+ """
+ if contig in genotype.contigs:
+ parser = pysam.asVCF()
+ # This raises a ValueError if the VCF does not
+ # contain any entries for the specified contig.
+ for vcf in genotype.fetch(contig, start, end, parser=parser):
+ if vcf.filter in ("PASS", "."):
+ yield vcf
+
+
+def build_region(options, genotype, bed):
+ """Returns the consensus sequence for one BED record: SNPs are applied
+ first, then homozygous indels (unless options.ignore_indels); sites
+ without a passing call remain 'N'.
+ """
+ # Note that bed.end is a past-the-end coordinate
+ start = max(0, bed.start - options.padding)
+
+ indels = []
+ sequence = ["N"] * (bed.end - start)
+ for vcf in filter_vcfs(genotype, bed.contig, start, bed.end):
+ if vcfwrap.is_indel(vcf):
+ indels.append(vcf)
+ else:
+ add_snp(options, vcf, vcf.pos - start, sequence)
+
+ if not options.ignore_indels:
+ for vcf in indels:
+ add_indel(options, bed, vcf, sequence)
+
+ offset = bed.start - start
+ length = bed.end - bed.start
+ truncated = sequence[offset:offset + length]
+
+ # Discard insertions after the last position
+ truncated[-1] = truncated[-1][:1]
+
+ return "".join(truncated)
+
+
+def build_regions(options, genotype, beds, reverse_compl):
+ """Yields the consensus sequence of each BED record in turn, reverse-
+ complementing each one when 'reverse_compl' is set.
+ """
+ for bed in beds:
+ sequence = build_region(options, genotype, bed)
+ if reverse_compl:
+ sequence = sequences.reverse_complement(sequence)
+ yield sequence
+
+
+def build_genes(options, genotype, regions):
+ """Yields (gene name, fragment iterator) tuples; BED records sharing a
+ name form one gene, split into fixed-size chunks. For minus-strand
+ genes the chunk order is reversed and each chunk reverse-complemented.
+ """
+ def keyfunc(bed):
+ return (bed.contig, bed.name, bed.start)
+ regions.sort(key=keyfunc)
+
+ for (gene, beds) in itertools.groupby(regions, lambda x: x.name):
+ beds = split_beds(beds)
+ reverse_compl = False
+ # Genes mixing strands are not supported.
+ if any((bed.strand == "-") for bed in beds):
+ assert all((bed.strand == "-") for bed in beds)
+
+ beds.reverse()
+ reverse_compl = True
+
+ fragments = build_regions(options, genotype, beds, reverse_compl)
+ yield (gene, fragments)
+
+
def genotype_genes(options, intervals, genotype):
    """Print one FASTA record per gene in 'intervals' to stdout; returns 0."""
    for _, beds in sorted(intervals.items()):
        for name, fragments in build_genes(options, genotype, beds):
            print(">%s" % (name,))

            # Accumulate fragments, letting flush_fasta emit wrapped lines
            buffered = ""
            for fragment in fragments:
                buffered = flush_fasta(buffered + fragment)

            if buffered:
                print(buffered)

    return 0
+
+
+###############################################################################
+###############################################################################
+
def read_intervals(filename):
    """Read a six-column BED file and return {contig: [BEDRecord, ...]}.

    Returns None (after printing an error) if any record has fewer than
    six fields, since the name and strand columns are required downstream.
    """
    with open(filename) as bed_file:
        intervals = text.parse_lines_by_contig(bed_file, BEDRecord)

    # Validate records in place; the previous implementation rebuilt each
    # list unchanged and used the Python-2-only dict.iteritems().
    for beds in intervals.values():
        for bed in beds:
            if len(bed) < 6:
                sys.stderr.write(("ERROR: Invalid BED record '%r', must "
                                  "have at least 6 fields ...\n") % (bed,))
                return None

    return intervals
+
+
def parse_intervals(genotype):
    """Build whole-contig pseudo-BED records from '##contig' header lines.

    Used when --intervals is not given; returns {contig: [BEDRecord]} with
    one plus-strand record spanning each contig, or None (after printing
    an error) if no contig lines were found in the VCF header.
    """
    records = {}
    for line in genotype.header:
        match = _VCF_DICT.match(line)
        if not match:
            continue

        key, values = match.groups()
        if key == "contig":
            # e.g. 'ID=chr1,length=1234' -> {'ID': 'chr1', 'length': '1234'}
            values = dict(pair.split("=", 1) for pair in values.split(","))
            record = BEDRecord()
            record.contig = values["ID"]
            record.start = 0
            record.end = int(values["length"])
            record.name = record.contig
            record.strand = "+"

            records[record.contig] = [record]

    if not records:
        sys.stderr.write("ERROR: List of contigs not found in VCF header; "
                         "specifying --intervals is required!\n")
        return None

    return records
+
+
def check_nth_sample(options, genotype):
    """Verify that the sample selected with --nth-sample exists in the VCF.

    Only the first record of the first non-empty contig is examined, since
    the number of samples is the same for every record in a VCF. Returns
    False (after printing an error) if the sample index is out of range.
    """
    parser = pysam.asVCF()

    for contig in genotype.contigs:
        for record in text.parse_lines(genotype.fetch(contig), parser):
            if len(record) <= options.nth_sample:
                sys.stderr.write("ERROR: Sample %i selected with --nth-sample,"
                                 " but file only contains %i sample(s)!\n"
                                 % (options.nth_sample + 1, len(record)))
                return False
            # Sample count is constant across records; one check suffices
            return True
    return True
+
+
def main(argv):
    """Entry point for 'paleomix vcf_to_fasta'.

    Builds FASTA sequences from a tabix-indexed VCF, either for regions
    read from a six-column BED file (--intervals) or for whole contigs
    listed in the VCF header. Returns a shell exit code (0 on success).
    """
    prog = "paleomix vcf_to_fasta"
    usage = "%s [options] --genotype in.vcf --intervals in.bed" % (prog,)

    parser = argparse.ArgumentParser(prog=prog, usage=usage)
    parser.add_argument("--genotype", required=True, metavar="VCF",
                        help="Tabix indexed VCF file; by default the first "
                             "sample is used in multi-sample VCFs. Use "
                             "--nth-sample option to select another sample.")
    parser.add_argument("--nth-sample", default=1, type=int, metavar="NTH",
                        help="Use Nth sample from the VCF, with the first "
                             "sample numbered '1' [default: %(default)s].")
    parser.add_argument("--intervals", metavar="BED",
                        help="Six column BED file; sequences on the same "
                             "contig with the same name are assumed to "
                             "represent the same gene, and are merged into a "
                             "single contiguous FASTA sequence.")
    parser.add_argument("--padding", type=int, default=10,
                        help="Number of bases to expand intervals, when "
                             "checking for adjacent indels [%(default)s]")
    parser.add_argument("--whole-codon-indels-only",
                        action="store_true", default=False,
                        help="If true, only indels where (length %% 3) == 0 "
                             "are retained [%(default)s]")
    parser.add_argument("--ignore-indels",
                        action="store_true", default=False,
                        help="Do not include indels generated FASTA "
                             "sequence [%(default)s].")

    opts = parser.parse_args(argv)

    print("Running vcf_to_fasta", end="", file=sys.stderr)
    if opts.whole_codon_indels_only:
        print(", assuming sequences represents CDS", end="", file=sys.stderr)
    print(" ...", file=sys.stderr)

    # A tabix index (.tbi) is required for random access into the VCF
    if not os.path.exists(opts.genotype):
        sys.stderr.write("ERROR: VCF file does not exist.\n")
        return 1
    elif not os.path.exists(opts.genotype + ".tbi"):
        sys.stderr.write("ERROR: VCF file not tabix indexed.\n")
        sys.stderr.write(" To index, run \"tabix -p vcf <filename>\".\n")
        return 1
    elif opts.nth_sample < 1:
        sys.stderr.write("ERROR: --nth-sample uses 1-based offsets, zero and\n")
        sys.stderr.write(" negative values are not allowed!\n")
        return 1

    # Relevant VCF functions uses zero-based offsets
    opts.nth_sample -= 1

    genotype = pysam.Tabixfile(opts.genotype)

    if opts.intervals is None:
        # Fall back to whole contigs listed in the VCF header
        intervals = parse_intervals(genotype)
    else:
        intervals = read_intervals(opts.intervals)

    if intervals is None:
        return 1

    if not check_nth_sample(opts, genotype):
        return 1

    return genotype_genes(opts, intervals, genotype)
+
+
if __name__ == "__main__":
    # Script entry point; exit status is the value returned by main()
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/zonkey/__init__.py b/paleomix/tools/zonkey/__init__.py
new file mode 100644
index 0000000..d094e34
--- /dev/null
+++ b/paleomix/tools/zonkey/__init__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
diff --git a/paleomix/tools/zonkey/build_db.py b/paleomix/tools/zonkey/build_db.py
new file mode 100755
index 0000000..4ed364e
--- /dev/null
+++ b/paleomix/tools/zonkey/build_db.py
@@ -0,0 +1,347 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import argparse
+import datetime
+import itertools
+import os
+import sys
+
+import pysam
+
+from paleomix.common.sequences import NT_CODES
+
+import paleomix.common.fileutils as fileutils
+import paleomix.tools.zonkey.common as common
+
+
# Number of reference bases fetched and processed per chunk, to bound memory
# use when scanning whole chromosomes.
_CHUNK_SIZE = 1000000


# Template for the generated 'settings.yaml' file.
_SETTINGS_TEMPLATE = """
# Database format; is incremented when the format changes
Format: 1

# Revision number; is incremented when the database (but not format) changes
Revision: {Revision}

# Arguments passed to plink
Plink: "--horse"

# Number of autosomal chromosomes; required for e.g. PCA analyses.
# This includes autosomal chromosomes not included in the analyses.
NChroms: {NChroms}

# N bases of padding used for mitochondrial sequences; the last N bases of the
# alignments are expected to be the same as the first N bases, in order to
# allow alignments at this region of the genome, and are combined to generate
# final consensus.
MitoPadding: 30

# The minimum distance between SNPs, assuming an even distribution of SNPs
# across the genome. Used when --treemix-k is set to 'auto', which is the
# default behavior. Value from McCue 2012 (doi:10.1371/journal.pgen.1002451).
SNPDistance: 150000

"""

# Template for the generated 'build.sh' script used to tar up the panel
# files; {REVISION} is substituted textually via str.replace (not
# str.format), since the script itself uses shell ${...} syntax.
_BUILD_SH_TEMPLATE = """#!/bin/bash

set -o nounset # Fail on unset variables
set -o errexit # Fail on uncaught non-zero returncodes

MITO_FA="mitochondria.fasta"
if [ ! -e "${MITO_FA}" ];
then
    echo "WARNING: Mitochondrial FASTA ('${MITO_FA}') not found!"
    MITO_FA=""
fi

SIM_TXT="simulations.txt"
if [ ! -e "${SIM_TXT}" ];
then
    echo "WARNING: Simulations ('${SIM_TXT}') not found!"
    SIM_TXT=""
fi

EXAMPLES="examples"
if [ ! -d "${EXAMPLES}" ];
then
    echo "WARNING: Examples ('${EXAMPLES}') not found!"
    EXAMPLES=""
fi

FILENAME="zonkey{REVISION}.tar"
SOURCES="settings.yaml contigs.txt samples.txt ${MITO_FA} ${SIM_TXT} ${EXAMPLES} genotypes.txt build.sh"

rm -vf "${FILENAME}"

if ! tar cvf "${FILENAME}" ${SOURCES};
then
    echo "Removing partial files ..."
    rm -vf "${FILENAME}"
    exit 1
fi
"""
+
+
class ZonkeyError(RuntimeError):
    """Raised when reference-panel input data is invalid or inconsistent."""
+
+
def _try_cast_int(value):
    """Return 'value' converted to an int when possible, else unchanged."""
    try:
        result = int(value)
    except ValueError:
        result = value

    return result
+
+
def _write_build_sh(args, filename):
    """Write the 'build.sh' helper script, with the revision substituted.

    Existing files are left untouched unless --overwrite was specified.
    """
    sys.stderr.write('Writing %r ...\n' % (filename,))
    if os.path.exists(filename) and not args.overwrite:
        sys.stderr.write(' File exists; skipping.\n')
        return

    script = _BUILD_SH_TEMPLATE.replace("{REVISION}", args.revision)
    with open(filename, 'w') as handle:
        handle.write(script)
+
+
def _write_genotypes(args, data, filename):
    """Write 'genotypes.txt': one row per bi-allelic SNP across all samples.

    Each row holds the (plink) chromosome, 1-based position, the reference
    base, and the concatenated per-sample bases; the sample column order is
    the sorted sample names joined by ';' in the header line. Only
    autosomal (integer-named) contigs are processed, and only sites where
    every sample is called and exactly two alleles are observed (after
    IUPAC expansion via NT_CODES) are written.

    Skipped if 'filename' exists and --overwrite was not given.
    """
    sys.stderr.write('Writing %r ...\n' % (filename,))
    if os.path.exists(filename) and not args.overwrite:
        sys.stderr.write(' File exists; skipping.\n')
        return

    samples = data['samples']
    # Sorted sample names fix the column order used below
    keys = tuple(sorted(samples))

    ref_handle = pysam.Fastafile(args.reference)

    with open(filename, 'w') as handle:
        header = ('Chrom', 'Pos', 'Ref', ';'.join(keys))
        handle.write('%s\n' % ('\t'.join(header)))

        for contig, size in sorted(data['contigs'].items()):
            # Skip non-autosomal contigs
            if not isinstance(contig, int):
                continue

            sys.stderr.write(' - %s: 0%%\r' % (contig,))
            # NOTE: xrange/izip are Python 2 only
            for pos in xrange(0, size, _CHUNK_SIZE):
                sys.stderr.write(' - %s: % 3i%%\r'
                                 % (contig, (100 * pos) / size))

                chunks = []
                for key in keys:
                    real_name = samples[key]['contigs'][contig]
                    fasta_handle = samples[key]['handle']
                    chunk = fasta_handle.fetch(real_name,
                                               pos,
                                               pos + _CHUNK_SIZE)

                    chunks.append(chunk)

                # NOTE(review): 'real_name' is left over from the last sample
                # in the loop above, so the reference is fetched using a
                # sample's contig name; this only works if the sample and
                # reference FASTAs use identical contig names -- confirm.
                ref_chunk = ref_handle.fetch(real_name, pos, pos + _CHUNK_SIZE)

                for idx, row in enumerate(itertools.izip(*chunks)):
                    # Skip sites where any sample is uncalled
                    if 'N' in row:
                        continue

                    nucleotides = set()
                    for nuc in row:
                        nucleotides.update(NT_CODES[nuc])

                    # Retain only bi-allelic sites
                    if len(nucleotides) == 2:
                        handle.write('%s\t%i\t%s\t%s\n'
                                     % (contig, pos + idx + 1,
                                        ref_chunk[idx], ''.join(row)))
            sys.stderr.write(' - %s: 100%%\n' % (contig,))
+
+
def _write_settings(args, contigs, filename):
    """Write 'settings.yaml' with the current revision and autosome count.

    Existing files are left untouched unless --overwrite was specified.
    """
    sys.stderr.write('Writing %r ...\n' % (filename,))
    if os.path.exists(filename) and not args.overwrite:
        sys.stderr.write(' File exists; skipping.\n')
        return

    # Determine the highest numbered chromosome; this is required by,
    # for example, SmartPCA.
    autosomes = [name for name in contigs if isinstance(name, int)]
    nchroms = max(autosomes)

    with open(filename, 'w') as handle:
        settings = _SETTINGS_TEMPLATE.format(Revision=args.revision,
                                             NChroms=nchroms)
        handle.write(settings)
+
+
def _write_contigs(args, filename):
    """Write 'contigs.txt': plink name, size, and number of uncalled bases
    (n / N / -) for each usable contig in the reference FASTA.

    Skipped if 'filename' exists and --overwrite was not given.
    """
    sys.stderr.write('Writing %r ...\n' % (filename,))
    if os.path.exists(filename) and not args.overwrite:
        sys.stderr.write(' File exists; skipping.\n')
        return

    fasta_handle = pysam.Fastafile(args.reference)
    contigs = _read_contigs(args.reference)
    lines = ['ID\tSize\tNs\tChecksum']

    for name, (real_name, size) in sorted(contigs.items()):
        sys.stderr.write(' - %s: 0%%\r' % (name,))
        n_uncalled = 0
        # NOTE: xrange is Python 2 only
        for pos in xrange(0, size, _CHUNK_SIZE):
            sys.stderr.write(' - %s: % 3i%%\r' % (name, (100 * pos) / size))
            chunk = fasta_handle.fetch(real_name, pos, pos + _CHUNK_SIZE)
            n_uncalled += chunk.count('n')
            n_uncalled += chunk.count('N')
            n_uncalled += chunk.count('-')

        sys.stderr.write(' - %s: 100%%\n' % (name,))
        # The checksum column is currently unused ('NA')
        lines.append('%s\t%i\t%i\t%s'
                     % (name, size, n_uncalled, 'NA'))
    lines.append('')

    with open(filename, 'w') as handle:
        handle.write('\n'.join(lines))
+
+
def _write_samples(args, samples, filename):
    """Write 'samples.txt': one placeholder row per sample, for later
    manual annotation of groups, species, sex and publication.

    Existing files are left untouched unless --overwrite was specified.
    """
    sys.stderr.write('Writing %r ...\n' % (filename,))
    if os.path.exists(filename) and not args.overwrite:
        sys.stderr.write(' File exists; skipping.\n')
        return

    header = 'ID\tGroup(2)\tGroup(3)\tSpecies\tSex\tSampleID\tPublication'
    rows = ['%s\t-\t-\tNA\tNA\t%s\tNA' % (name, name)
            for name in sorted(samples)]

    with open(filename, 'w') as handle:
        handle.write('\n'.join([header] + rows))
        handle.write('\n')
+
+
def _process_contigs(reference, samples):
    """Cross-check sample contigs against the reference contig table.

    Verifies that every reference contig is present in every sample with
    the expected size, and replaces each sample's (name, size) tuples with
    just the sample-local contig name. Returns {'samples': samples,
    'contigs': {plink_name: size}}. Raises ZonkeyError on missing or
    mis-sized contigs.
    """
    # Reduce the reference table from {name: (real_name, size)} to
    # {name: size}; the reference's real contig names are not needed here.
    ref_contigs = _read_contigs(reference)
    for name, (_, size) in ref_contigs.items():
        ref_contigs[name] = size

    for sample_name, obs_data in samples.items():
        obs_contigs = obs_data['contigs']
        # .items() rather than the Python-2-only .iteritems(), matching the
        # .items() calls above and keeping the function Py3 compatible.
        for ref_name, ref_size in ref_contigs.items():
            if ref_name not in obs_contigs:
                raise ZonkeyError('Contig missing for sample %r: %r'
                                  % (sample_name, ref_name))

            obs_name, obs_size = obs_contigs[ref_name]
            if obs_size != ref_size:
                raise ZonkeyError('Contig %r for sample %r has wrong size; '
                                  '%i observed vs %i expected'
                                  % (obs_name, sample_name,
                                     obs_size, ref_size))

            obs_contigs[ref_name] = obs_name

    return {'samples': samples,
            'contigs': ref_contigs}
+
+
def _read_contigs(filename):
    """Parse the .fai index of 'filename' into {plink_name: (name, size)}.

    Contig names are normalized via contig_name_to_plink_name; contigs
    that cannot be normalized are skipped. Raises ZonkeyError if two
    contigs normalize to the same key, as they could not be told apart.
    """
    contigs = {}
    with open(filename + '.fai') as handle:
        for line in handle:
            name, size, _ = line.split('\t', 2)

            fixed_name = common.contig_name_to_plink_name(name)
            if fixed_name is None:
                continue

            key = _try_cast_int(fixed_name)
            # Check the normalized key; the previous raw-name check could
            # not catch e.g. 'chr1' and '1' colliding on plink name '1'.
            if key in contigs:
                raise ZonkeyError('FASTA file %r contains multiple contigs '
                                  'with the same name (%r); this is not '
                                  'supported.' % (filename, name))

            contigs[key] = (name, int(size))

    return contigs
+
+
def _collect_samples(reference, filenames):
    """Open and validate every sample FASTA file.

    Sample names are derived from the filename up to the first '.'.
    Raises ZonkeyError on duplicate sample names or files without usable
    contigs; returns the structure produced by _process_contigs.
    """
    samples = {}
    for filename in filenames:
        basename = os.path.basename(filename).split('.', 1)[0]
        if basename in samples:
            # Report the clashing sample name (the old message printed the
            # filename, which is never the duplicated value)
            raise ZonkeyError('Duplicate sample name %r'
                              % (basename,))

        # Open first to insure that file is indexed
        handle = pysam.Fastafile(filename)
        contigs = _read_contigs(filename)
        if not contigs:
            raise ZonkeyError('No usable contigs found in %r.'
                              % (filename,))

        samples[basename] = {'handle': handle,
                             'contigs': contigs}

    return _process_contigs(reference, samples)
+
+
def parse_args(argv):
    """Parse command-line arguments for the reference-panel builder."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument('root',
                           help='Root directory in which to write reference '
                                'panel files.')
    argparser.add_argument('reference',
                           help='Reference genome in FASTA format.')
    argparser.add_argument('samples', nargs="+",
                           help='Samples to include in the reference-panel, in '
                                'the form of FASTA files that map one-to-one '
                                'to the reference sequences. That is to say '
                                'that every position in the sample FASTA must '
                                'be homologus to the same position in the '
                                'reference sequence.')
    argparser.add_argument('--overwrite', default=False, action='store_true',
                           help='If set, the program is allowed to overwrite '
                                'already existing output files.')

    return argparser.parse_args(argv)
+
+
def main(argv):
    """Build a zonkey reference panel from a reference and sample FASTAs.

    Writes contigs.txt, samples.txt, settings.yaml, genotypes.txt and a
    build.sh helper to 'args.root'. Returns 0 on success, 1 if no usable
    sample data was collected.
    """
    args = parse_args(argv)
    # The database revision is simply the current date (YYYYMMDD)
    args.revision = datetime.datetime.today().strftime('%Y%m%d')

    data = _collect_samples(args.reference, args.samples)
    if not data:
        return 1

    fileutils.make_dirs(args.root)

    _write_contigs(args, os.path.join(args.root, 'contigs.txt'))
    _write_samples(args, data['samples'],
                   os.path.join(args.root, 'samples.txt'))
    _write_settings(args, data['contigs'],
                    os.path.join(args.root, 'settings.yaml'))
    _write_genotypes(args, data, os.path.join(args.root, 'genotypes.txt'))
    _write_build_sh(args, os.path.join(args.root, 'build.sh'))

    # Explicit success code (was an implicit None, which sys.exit also
    # treats as success, but this matches the other entry points)
    return 0
+
+
if __name__ == '__main__':
    # Script entry point; exit status is the value returned by main()
    sys.exit(main(sys.argv[1:]))
+
+
diff --git a/paleomix/tools/zonkey/build_mito.py b/paleomix/tools/zonkey/build_mito.py
new file mode 100755
index 0000000..ccbf6e3
--- /dev/null
+++ b/paleomix/tools/zonkey/build_mito.py
@@ -0,0 +1,203 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import argparse
+import os
+import sys
+
+import pysam
+
+from paleomix.common.formats.fasta import FASTA
+from paleomix.common.formats.msa import MSA
+from paleomix.common.utilities import fragment
+from paleomix.common.formats.phylip import interleaved_phy
+
+import paleomix.ui as ui
+import paleomix.tools.zonkey.database as database
+
+
def majority_base(site):
    """Return the most frequently observed base at a pileup site.

    'site' maps each nucleotide in "ACGTN" to an observation count; ties
    between the best "ACGT" counts (including all-zero sites) yield "N".
    """
    winner = "N"
    winner_count = 0
    for candidate in "ACGT":
        candidate_count = site[candidate]
        if candidate_count > winner_count:
            winner, winner_count = candidate, candidate_count
        elif candidate_count == winner_count:
            winner = "N"

    return winner
+
+
def majority_sequence(handle, padding, contig_name, contig_length):
    """Call a majority-rule consensus for a (circularized) mito contig.

    Per-site nucleotide counts are tallied from the BAM pileup; the last
    'padding' bases (which duplicate the first bases of the circular
    alignment) are folded back onto the start and then trimmed off. Ties
    at a site yield 'N' (see majority_base). Returns (statistics dict,
    consensus string).
    """
    # One {nucleotide: count} tally per reference position (Py2 xrange)
    sequence = [dict.fromkeys("ACGTN", 0) for _ in xrange(contig_length)]

    for column in handle.pileup(contig_name):
        position = sequence[column.pos]

        for alignment in column.pileups:
            seq = alignment.alignment.seq
            pos = alignment.query_position

            # query_position is None for deletions / reference skips
            if pos is not None:
                position[seq[pos]] += 1

    if padding:
        # Fold counts from the padded tail back onto the sequence start,
        # then drop the padding itself
        offset = len(sequence) - padding
        for idx in xrange(padding):
            dst = sequence[idx]
            src = sequence[idx + offset]

            for key, value in src.iteritems():
                dst[key] += value

        del sequence[-padding:]

    # Summarize coverage, not counting 'N' observations
    covered = coverage = 0
    for counts in sequence:
        total = sum(counts.itervalues()) - counts["N"]
        coverage += total

        if total:
            covered += 1

    statistics = {
        "sequence_len": len(sequence),
        "sequence_name": contig_name,
        "nucleotides": coverage,
        "covered_sites": covered,
        "covered_pct": round((100.0 * covered) / len(sequence), 1),
        "mean_coverage": round(coverage / float(len(sequence)), 1),
    }

    return statistics, "".join(map(majority_base, sequence))
+
+
def align_majority(reference, majority):
    """Insert gaps into 'majority' so it aligns with a gapped reference.

    'reference' is an aligned (possibly gapped) sequence and 'majority' the
    corresponding ungapped consensus; a "-" is emitted for every reference
    gap column, so the result shares coordinates with the alignment.
    """
    aligned = []
    # Use the next() builtin rather than the Python-2-only '.next' bound
    # method, so this works on both Python 2.6+ and Python 3.
    reference_iter = iter(reference)

    for nucleotide in majority:
        ref_nucleotide = next(reference_iter)
        # Emit a gap for each reference gap column skipped over
        while ref_nucleotide == "-":
            ref_nucleotide = next(reference_iter)
            aligned.append("-")

        aligned.append(nucleotide)

    return "".join(aligned)
+
+
def truncate_sequences(sequences, name):
    """Return copies of all records truncated to the length of record 'name'.

    Uses .items() (not Py2-only .iteritems()) and a loop variable that no
    longer shadows the 'name' parameter, as the original did.
    """
    max_length = len(sequences[name].sequence)

    result = {}
    for key, record in sequences.items():
        result[key] = FASTA(name=record.name,
                            meta=record.meta,
                            sequence=record.sequence[:max_length])

    return result
+
+
def filter_sequences(sequences):
    """Return 'sequences' without records flagged for exclusion.

    A record is dropped when its meta field contains an 'EXCLUDE' token
    (case-insensitive) in its ';'-separated list. Uses .items() rather
    than the Python-2-only .iteritems().
    """
    selection = {}
    for key, record in sequences.items():
        if record.meta is not None:
            if "EXCLUDE" in map(str.strip, record.meta.upper().split(";")):
                continue

        selection[key] = record

    return selection
+
+
def sequences_to_msa(sequences):
    """Return an MSA of the records, ordered by their dictionary key.

    Uses .items() rather than the Python-2-only .iteritems().
    """
    records = [record for _, record in sorted(sequences.items())]

    return MSA(records)
+
+
def parse_args(argv):
    """Parse positional arguments: database tarball, BAM, output prefix."""
    argparser = argparse.ArgumentParser()
    for positional in ('database', 'bam', 'output_prefix'):
        argparser.add_argument(positional)

    return argparser.parse_args(argv)
+
+
def main(argv):
    """Build a consensus mitochondrial sequence from a BAM and align it to
    the reference-panel mitochondria.

    Writes <prefix>.summary (coverage statistics), <prefix>.phy
    (interleaved PHYLIP) and <prefix>.fasta. Returns 0 on success, 1 on
    BAM read/validation errors. NOTE: uses Python-2-only 'except ..., err'
    and dict.iteritems() syntax.
    """
    args = parse_args(argv)
    data = database.ZonkeyDB(args.database)
    sequences = data.mitochondria

    try:
        handle = pysam.Samfile(args.bam)
    except (IOError, ValueError), error:
        ui.print_err("Error reading BAM file: %s" % (error,))
        return 1

    with handle:
        bam_info = data.validate_bam_handle(handle)
        if bam_info is None:
            return 1
        elif not bam_info.is_mitochondrial:
            ui.print_err("ERROR: BAM does not contain any known mitochondrial "
                         "sequence found in BAM ..")
            return 1

        reference = sequences[bam_info.mt_contig]
        # Majority-rule consensus over the (circular, padded) contig
        stats, majority = majority_sequence(handle,
                                            padding=bam_info.mt_padding,
                                            contig_name=bam_info.mt_contig,
                                            contig_length=bam_info.mt_length)

        # Gap the consensus to match the reference alignment coordinates
        sequences["Sample"] = FASTA(name="Sample",
                                    meta=None,
                                    sequence=align_majority(reference.sequence,
                                                            majority))

    # Truncate all sequences to match the (now) unpadded sample sequence
    sequences = truncate_sequences(sequences, "Sample")

    sequences = filter_sequences(sequences)

    with open(args.output_prefix + ".summary", "w") as handle:
        stats["filename"] = os.path.abspath(args.bam)

        for key, value in sorted(stats.iteritems()):
            handle.write("{}: {}\n".format(key, value))

    with open(args.output_prefix + ".phy", "w") as handle:
        handle.write(interleaved_phy(sequences_to_msa(sequences)))

    with open(args.output_prefix + ".fasta", "w") as handle:
        for key, record in sorted(sequences.iteritems()):
            handle.write(">{}\n".format(key))
            # Wrap sequences at 60 columns
            for line in fragment(60, record.sequence):
                handle.write("{}\n".format(line))

    return 0
+
+
if __name__ == '__main__':
    # Script entry point; exit status is the value returned by main()
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/zonkey/build_tped.py b/paleomix/tools/zonkey/build_tped.py
new file mode 100644
index 0000000..b3e2469
--- /dev/null
+++ b/paleomix/tools/zonkey/build_tped.py
@@ -0,0 +1,352 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2014 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import argparse
+import collections
+import itertools
+import os
+import random
+import sys
+import tarfile
+
+import pysam
+
+from paleomix.common.sequences import NT_CODES
+from paleomix.common.sampling import reservoir_sampling
+
+import paleomix.common.bamfiles as bamtools
+import paleomix.common.fileutils as fileutils
+import paleomix.tools.zonkey.common as common
+import paleomix.tools.zonkey.database as database
+
+
# The four transition substitutions (C<->T and G<->A); sites whose allele
# pair matches one of these are omitted from the 'excl_ts' output.
_TRANSITIONS = frozenset((('C', 'T'), ('T', 'C'),
                          ('G', 'A'), ('A', 'G')))
+
+
def _filter_records(handle, flags=bamtools.EXCLUDED_FLAGS):
    """Yield BAM records from 'handle' with none of the excluded flags set."""
    return (record for record in handle if not record.flag & flags)
+
+
class DownsampledBAM(object):
    """Wraps a BAM handle, exposing fetch() over a random subsample of at
    most 'downsample' non-excluded records (reservoir sampling)."""

    def __init__(self, handle, downsample, included_references):
        # Sampled records grouped by reference (contig) name
        self._records = collections.defaultdict(list)

        references = handle.references
        if len(references) != len(included_references):
            raise ValueError("Length of 'included_references' must match the "
                             "number of references in BAM file.")

        records = _filter_records(handle)
        for record in reservoir_sampling(records, downsample):
            key = references[record.tid]
            self._records[key].append(record)

        self.references = references

        # Sort by position within each contig, mimicking a coordinate
        # sorted BAM fetch
        self._records = dict(self._records)
        for value in self._records.values():
            value.sort(key=lambda rec: rec.pos)

    def fetch(self, chrom):
        """Return the (possibly empty) sampled records for 'chrom'."""
        return self._records.get(chrom, ())
+
+
class GenotypeSites(object):
    """Holds the candidate SNP sites of a single contig, and matches them
    against position-sorted BAM records to collect base observations."""

    def __init__(self, records):
        # 'records' are (chrom, pos, rest-of-line) tuples for ONE contig
        last_chrom = None
        sites = []

        # chrom, pos, ref, ..., nucleotides
        for chrom, pos, line in records:
            # Convert pos from 1-based to 0-based (same as BAM.pos)
            sites.append((int(pos) - 1, line, []))

            assert last_chrom is None or chrom == last_chrom, \
                (chrom, last_chrom)
            last_chrom = chrom

        sites.sort()
        self._sites = collections.deque(sites)

    def process(self, records, statistics):
        """Walk sorted BAM 'records', appending (record_id, base) pairs to
        each overlapped site, and yield (pos, line, nucleotides) tuples as
        soon as no later record can overlap them; any leftover sites are
        yielded at the end. Updates statistics['n_reads'/'n_reads_used'].
        """
        count_used = 0
        count_total = 0
        sites = self._sites
        for record_id, record in enumerate(records):
            count_total += 1

            # Sites before this record can receive no further observations
            # TODO: Check sorted
            while sites and sites[0][0] < record.pos:
                yield sites.popleft()

            sequence = record.seq
            # NOTE: '.next' bound methods are Python 2 only
            sites_iter = iter(sites).next
            alignment_iter = iter(record.get_aligned_pairs()).next
            alignment_end = record.aend

            try:
                read_used = False
                site_pos, _, nucleotides = sites_iter()
                query_pos, ref_pos = alignment_iter()

                # Co-iterate sites and aligned (query, ref) position pairs
                while alignment_end > site_pos:
                    if ref_pos is None or query_pos is None or site_pos > ref_pos:
                        query_pos, ref_pos = alignment_iter()
                    elif site_pos < ref_pos:
                        site_pos, _, nucleotides = sites_iter()
                    else:
                        assert ref_pos == site_pos, (ref_pos, site_pos)
                        nucleotide = sequence[query_pos]
                        if nucleotide != "N":
                            nucleotides.append((record_id, nucleotide))
                            read_used = True

                        query_pos, ref_pos = alignment_iter()
                        site_pos, _, nucleotides = sites_iter()
            except StopIteration:
                # Either the alignment or the site list was exhausted
                if not sites:
                    break
            finally:
                if read_used:
                    count_used += 1

        while sites:
            yield sites.popleft()

        statistics["n_reads"] += count_total
        statistics["n_reads_used"] += count_used

    @classmethod
    def _parse_cigar(cls, record):
        """Yield (ref_pos, base) for each query base aligned to the
        reference, derived directly from the CIGAR string."""
        seq_iter = iter(record.seq)
        seq_pos = record.pos
        for key, value in record.cigar:
            if key in (0, 7, 8):  # M, =, X: consume both query and ref
                for _ in xrange(value):
                    yield seq_pos, seq_iter.next()
                    seq_pos += 1
            elif key in (1, 4):  # I, S: consume query only
                for _ in xrange(value):
                    seq_iter.next()
            elif key in (2, 3, ):  # D, N: consume reference only
                seq_pos += value
            elif key in (5, 6):  # H, P: consume neither
                pass
            else:
                raise ValueError(record.cigar)
+
+
class GenotypeReader(object):
    """Reads 'genotypes.txt' from a zonkey database tarball, yielding
    (contig, GenotypeSites) pairs; usable as a context manager."""

    def __init__(self, filename):
        self._tar_handle = tarfile.open(filename)
        self._handle = self._tar_handle.extractfile("genotypes.txt")
        # Header: 'Chrom<TAB>Pos<TAB>Ref<TAB>sample1;sample2;...'
        self._header = self._handle.readline().rstrip('\r\n').split('\t')
        self.samples = self._header[-1].split(';')

    def __iter__(self):
        # Rows are grouped by chromosome (the first column)
        for chrom, records in itertools.groupby(self._read_records(),
                                                lambda rec: rec[0]):
            sys.stderr.write("Reading contig %r information ...\n" % (chrom,))
            yield chrom, GenotypeSites(records)

    def _read_records(self):
        # Split only the first two columns; the remainder of each line is
        # parsed lazily (see process_record)
        for line in self._handle:
            yield line.rstrip().split('\t', 2)

    def __enter__(self):
        return self

    def __exit__(self, type, value, traceback):
        self._handle.close()
        self._tar_handle.close()
+
+
def process_record(chrom, pos, line, nucleotides, statistics, records,
                   out_incl_ts=sys.stdout,
                   out_excl_ts=sys.stdout):
    """Write one TPED row for a genotyped site, sampling a single read.

    'nucleotides' holds (record_id, base) observations at the site; one is
    picked at random, excluding reads already used for another site
    (tracked via the 'records' set), so each read contributes at most one
    site. Rows go to 'out_incl_ts', and also to 'out_excl_ts' unless the
    site's alleles form a transition. Updates the site counters in
    'statistics'.
    """
    # Filter reads that have already been used
    nucleotides = [(rec_id, nuc) for rec_id, nuc in nucleotides
                   if rec_id not in records]

    if not nucleotides:
        return
    elif len(nucleotides) == 1:
        # Avoid unnecessary random() call in 'random.choice'
        record_id, nucleotide = nucleotides[0]
    else:
        record_id, nucleotide = random.choice(nucleotides)

    # Fields are expected to contain at least 2 columns, the first being the
    # reference nucleotide, and the last being the sample genotypes
    _, encoded_genotypes = line.strip().rsplit('\t', 1)

    # Expand each sample's IUPAC code into two alleles
    genotypes = []
    for encoded_nucleotide in encoded_genotypes:
        decoded_nucleotides = NT_CODES.get(encoded_nucleotide, ())
        if len(decoded_nucleotides) == 1:
            # Homozygous site
            genotypes.append(decoded_nucleotides)
            genotypes.append(decoded_nucleotides)
        elif len(decoded_nucleotides) == 2:
            # Heterozygous site
            genotypes.extend(decoded_nucleotides)
        else:
            raise ValueError('Invalid nucleotide, not bi-allelic: %r'
                             % (encoded_nucleotide,))

    if nucleotide not in genotypes:
        # Exclude SNPs not observed in the reference panel
        return

    # Convert from 0-based to 1-based
    pos += 1

    # Chromosome, SNP identifier, (dummy) pos in (centi)Morgans, position
    output = [chrom, "chr{}_{}".format(chrom, pos), "0", str(pos)]
    output.extend(genotypes)
    output.append(nucleotide)
    output.append(nucleotide)
    output = " ".join(output)

    statistics["n_sites_incl_ts"] += 1
    out_incl_ts.write("{}\n".format(output))

    if tuple(set(genotypes)) not in _TRANSITIONS:
        statistics["n_sites_excl_ts"] += 1
        out_excl_ts.write("{}\n".format(output))

    records.add(record_id)
+
+
def write_tfam(filename, data, samples, bam_sample):
    """Write a plink .tfam file for the panel samples plus the BAM sample.

    Sex is encoded per plink convention (1 = male, 2 = female, 0 = NA);
    the BAM sample is written last, with unknown sex.
    """
    sex_codes = {"MALE": 1, "FEMALE": 2, "NA": 0}

    with open(filename, "w") as handle:
        for sample in samples:
            info = data.samples[sample]
            sex = sex_codes[info["Sex"].upper()]
            # Family, Individual, Paternal ID, Maternal ID, Sex, Phenotype
            handle.write("{0} {0} 0 0 {1} -9\n".format(sample, sex))

        handle.write("{0} {0} 0 0 0 -9\n".format(bam_sample))
+
+
def write_summary(args, filename, statistics):
    """Write run statistics as simple 'key: value' lines.

    Counters absent from 'statistics' are written as 'MISSING'.
    """
    counters = ("n_reads", "n_reads_used",
                "n_sites_incl_ts", "n_sites_excl_ts")

    with open(filename, "w") as handle:
        handle.write("name: %s\n" % (args.name,))
        handle.write("filename: %s\n" % (os.path.abspath(args.bam),))

        for counter in counters:
            value = statistics.get(counter, "MISSING")
            handle.write("%s: %s\n" % (counter, value))
+
+
def process_bam(args, data, bam_handle):
    """Convert BAM observations into plink TPED/TFAM files in 'args.root'.

    Writes incl_ts.tped (all accepted sites), excl_ts.tped (transversions
    only), common.summary and common.tfam.
    """
    raw_references = bam_handle.references
    # NOTE: relies on map() returning a list (Python 2 only); '.index' is
    # called on the result below
    references = map(common.contig_name_to_plink_name, raw_references)

    if args.downsample:
        sys.stderr.write("Downsampling to at most %i BAM records ...\n"
                         % (args.downsample))
        bam_handle = DownsampledBAM(bam_handle, args.downsample, references)

    statistics = {"n_reads": 0,
                  "n_reads_used": 0,
                  "n_sites_incl_ts": 0,
                  "n_sites_excl_ts": 0}

    fileutils.make_dirs(args.root)

    with open(os.path.join(args.root, 'incl_ts.tped'), 'w') as output_incl:
        with open(os.path.join(args.root, 'excl_ts.tped'), 'w') as output_excl:
            with GenotypeReader(args.database) as reader:
                for ref, sites in reader:
                    # Reads already used for a site; each read may
                    # contribute at most one site (see process_record)
                    records = set()
                    raw_ref = raw_references[references.index(ref)]

                    sys.stderr.write("Reading %r from BAM ...\n" % (raw_ref,))
                    raw_sites = bam_handle.fetch(raw_ref)
                    for pos, line, nucleotides in sites.process(raw_sites,
                                                                statistics):
                        process_record(ref, pos, line, nucleotides,
                                       out_incl_ts=output_incl,
                                       out_excl_ts=output_excl,
                                       statistics=statistics,
                                       records=records)

    write_summary(args, os.path.join(args.root, "common.summary"),
                  statistics=statistics)
    write_tfam(os.path.join(args.root, "common.tfam"),
               data, reader.samples, args.name)
+
+
def parse_args(argv):
    """Parse command-line arguments for the TPED builder."""
    argparser = argparse.ArgumentParser()
    argparser.add_argument('root', metavar='output_folder',
                           help='Output folder in which output files are '
                                'to be placed; is created if it does not '
                                'already exist.')
    argparser.add_argument('database',
                           help='Zonkey database file.')
    argparser.add_argument('bam',
                           help='Sorted BAM file.')
    argparser.add_argument('--seed', type=int,
                           help='RNG seed used when downsampling reads; '
                                'defaults to using system time as seed.')
    argparser.add_argument('--downsample', type=int, default=0,
                           help='Sample N reads from the input BAM file, before '
                                'building the TPED file. If not set, or set to '
                                'zero, all reads are used [%(default)s].')
    argparser.add_argument('--name', default="Sample",
                           help='Name of sample to be used in output.')

    return argparser.parse_args(argv)
+
+
def main(argv):
    """Entry point: build TPED/TFAM files from a BAM and a zonkey database.

    Returns 0 on success, 1 on database or BAM validation errors. NOTE:
    uses Python-2-only print-statement and 'except ..., err' syntax.
    """
    args = parse_args(argv)
    # Seeding makes downsampling reproducible when --seed is given
    random.seed(args.seed)

    print "Reading reference information from %r ..." \
        % (args.database,)

    try:
        data = database.ZonkeyDB(args.database)
    except database.ZonkeyDBError, error:
        sys.stderr.write("Error reading database file %r:\n%s\n"
                         % (args.database, error))
        return 1

    with pysam.Samfile(args.bam) as bam_handle:
        bam_info = data.validate_bam_handle(bam_handle)
        if not bam_info:
            return 1
        elif not bam_info.is_nuclear:
            sys.stderr.write("ERROR: BAM file does not contain "
                             "identifiable nuclear alignments.\n")
            return 1

        process_bam(args, data, bam_handle)

    return 0
+
if __name__ == '__main__':
    # Script entry point; exit status is the value returned by main()
    sys.exit(main(sys.argv[1:]))
diff --git a/paleomix/tools/zonkey/common.py b/paleomix/tools/zonkey/common.py
new file mode 100644
index 0000000..f083aee
--- /dev/null
+++ b/paleomix/tools/zonkey/common.py
@@ -0,0 +1,81 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import collections
+
+import paleomix.yaml
+
+import paleomix.common.versions as versions
+
+
# Format number for database file; is incremented when the format is changed.
# The 'revision' field specifies updates to the table that do not change the
# format of the database (see below).
_SUPPORTED_DB_FORMAT = 1


# Rscript >= 3.0.0 is required for the statistical analyses. The search
# pattern is a raw string so that '\d' is a regex token rather than a
# (deprecated) string escape.
RSCRIPT_VERSION = versions.Requirement(call=("Rscript", "--version"),
                                       search=r"version (\d+)\.(\d+)\.(\d+)",
                                       checks=versions.GE(3, 0, 0),
                                       priority=10)
+
+
class DBFileError(RuntimeError):
    """Raised when a summary file is malformed (see 'read_summary')."""
    pass
+
+
def get_sample_names(handle):
    """Returns the frozenset of sample names ('SM' tags) found in the BAM
    handle's read-groups; read-groups without an 'SM' tag are skipped."""
    read_groups = handle.header.get("RG", ())
    return frozenset(group["SM"] for group in read_groups if "SM" in group)
+
+
def contig_name_to_plink_name(chrom):
    """Converts chromosome / contig name to the values expected by 'plink',
    namely a digit or X/Y, or returns None if the chromosome could not be
    identified.
    """
    if chrom.isdigit():
        # Bugfix: the original returned the bound method 'chrom.upper'
        # instead of calling it; for a digit string .upper() is the
        # identity, so the digit string itself is returned.
        return chrom
    elif chrom.upper() in ("X", "Y"):
        # Bugfix: 'in "XY"' was a substring test, which also matched "XY"
        return chrom.upper()
    elif chrom.lower().startswith("chr") and chrom[3:].isdigit():
        return chrom[3:]
    elif chrom.lower() in ("chrx", "chry"):
        return chrom[3].upper()

    # Unidentifiable contig (scaffolds, etc.)
    return None
+
+
def read_summary(filename, default="[MISSING VALUE!]"):
    """Reads a YAML summary file into a defaultdict; keys absent from the
    file map to 'default'. Raises DBFileError if the top-level value is
    not a dictionary."""
    with open(filename) as handle:
        contents = paleomix.yaml.safe_load(handle.read())

    if not isinstance(contents, dict):
        raise DBFileError('Summary file does not contain dictionary')

    summary = collections.defaultdict(lambda: default)
    summary.update(contents)

    return summary
diff --git a/paleomix/tools/zonkey/config.py b/paleomix/tools/zonkey/config.py
new file mode 100644
index 0000000..ef5102e
--- /dev/null
+++ b/paleomix/tools/zonkey/config.py
@@ -0,0 +1,365 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import sys
+import optparse
+
+import pysam
+
+import paleomix
+import paleomix.ui
+
+from paleomix.ui import \
+ print_err, \
+ print_info
+
+from paleomix.config import \
+ ConfigError, \
+ PerHostValue, \
+ PerHostConfig, \
+ migrate_config
+
+import paleomix.common.fileutils as fileutils
+import paleomix.tools.zonkey.database as database
+
+
+_USAGE = """USAGE:
+{0} run <SampleDB> <samples.txt> [<destination>]
+{0} run <SampleDB> <sample.bam> [<destination>]
+{0} run <SampleDB> <nuclear.bam> <mitochondrial.bam> <destination>
+{0} dryrun <SampleDB> [...]
+{0} mito <SampleDB> <destination>
+{0} example <SampleDB> <destination>
+"""
+
+# List of valid commands / aliases, currently mostly undocumented
+_CMD_ALIASES = {
+ "mito": "mito",
+ "mitochondria": "mito",
+ "mitochondrial": "mito",
+ "nuc": "run",
+ "nuclear": "run",
+ "run": "run",
+ "dryrun": "dryrun",
+ "dry_run": "dryrun",
+ "example": "example",
+ "examples": "example",
+}
+
+
+def print_usage(out=sys.stderr):
+ out.write(_USAGE.format("paleomix zonkey"))
+
+
def parse_config(argv):
    """Parses argv into a run configuration; returns None (after printing a
    message) if no or an unknown command was given, otherwise defers to
    'parse_run_config' for the remaining arguments."""
    migrate_config()

    config, args = _parse_arguments(argv)
    if not args:
        print_usage()
        return None

    command = _CMD_ALIASES.get(args[0])
    if command is None:
        print_err("ERROR: Unknown command %r" % (args[0],))
        return None

    if command == "dryrun":
        # 'dryrun' is shorthand for 'run --dry-run'
        command = "run"
        config.dry_run = True

    config.command = command

    return parse_run_config(config, args[1:])
+
+
+def parse_run_config(config, args):
+ if not (2 <= len(args) <= 4):
+ print_usage()
+ return
+
+ config.multisample = False
+ config.tablefile = args[0]
+
+ try:
+ config.database = database.ZonkeyDB(config.tablefile)
+ except database.ZonkeyDBError, error:
+ print_err("ERROR reading database %r: %s"
+ % (config.tablefile, error))
+ return
+
+ known_samples = set(config.database.samples) | set(("Sample",))
+ unknown_samples = set(config.treemix_outgroup) - known_samples
+ if unknown_samples:
+ print_err("ERROR: Argument --treemix-outgroup includes unknown "
+ "sample(s): %s; known samples are %s. Note that "
+ "names are case-sensitive."
+ % (", ".join(map(repr, sorted(unknown_samples))),
+ ", ".join(map(repr, sorted(known_samples)))))
+ return
+
+ if config.command in ("mito", "example"):
+ if len(args) != 2:
+ print_err("ERROR: Wrong number of arguments!")
+ print_usage()
+ return
+
+ config.destination = args[1]
+ config.samples = {}
+ elif len(args) == 2:
+ filename = args[1]
+ config.destination = fileutils.swap_ext(filename, ".zonkey")
+
+ if not os.path.isfile(filename):
+ print_err("ERROR: Not a valid filename: %r" % (filename,))
+ return
+ elif _is_bamfile(filename):
+ # Called as either of
+ # zonkey run <SampleDB> <nuclear.bam>
+ # zonkey run <SampleDB> <mitochondrial.bam>
+ config.samples = {"-": {"Root": config.destination,
+ "Files": [filename]}}
+ else:
+ config.multisample = True
+ if not _read_sample_table(config, filename):
+ return
+ elif 3 <= len(args) <= 4:
+ root = args[-1]
+ if os.path.exists(root) and not os.path.isdir(root):
+ print_err("ERROR: Missing destination folder.")
+ print_usage()
+ return
+
+ config.destination = root
+
+ if len(args) == 3:
+ # zonkey run <SampleDB> <nuclear.bam> <destination>
+ # zonkey run <SampleDB> <mitochondrial.bam> <destination>
+ # zonkey run <SampleDB> <samples.txt> <destination>
+ filename = args[-2]
+
+ if not os.path.isfile(filename):
+ print_err("ERROR: Not a valid filename: %r" % (filename,))
+ return
+ elif _is_bamfile(filename):
+ # Called as either of
+ # zonkey run <SampleDB> <nuclear.bam>
+ # zonkey run <SampleDB> <mitochondrial.bam>
+ config.samples = {"-": {"Root": config.destination,
+ "Files": [filename]}}
+ else:
+ config.multisample = True
+ if not _read_sample_table(config, filename):
+ return
+ else:
+ # zonkey run <SampleDB> <nuclear.bam> <mitochondrial.bam> <dst>
+ config.destination = root
+ config.samples = {"-": {"Root": root, "Files": args[1:-1]}}
+ else:
+ raise RuntimeError("Unhandled number of args in parse_config: %i\n"
+ % (len(args),))
+
+ # Identify (mito or nuc?) and validate BAM files provided by user
+ if not _process_samples(config):
+ return
+
+ return config
+
+
def _is_bamfile(filename):
    """Returns true if a file is a BAM file, false otherwise.
    """
    try:
        with pysam.Samfile(filename, "rb"):
            return True
    except (ValueError, IOError):
        # Raised for non-BAM content and unreadable / missing files
        return False
+
+
def _process_samples(config):
    """Classifies each sample's BAM files as nuclear and/or mitochondrial
    using the database, rewriting each sample's 'Files' entry from a list
    into a {'Nuc': path, 'Mito': path} mapping. Returns True on success,
    False (after printing an error) otherwise."""
    for name, info in sorted(config.samples.items()):
        files = {}

        if name == "-":
            print_info("Validating unnamed sample ...")
        else:
            print_info("Validating sample %r ..." % (name,))

        for filename in info.pop("Files"):
            filetype = config.database.validate_bam(filename)
            if not filetype:
                print_err("ERROR: File is not a valid BAM file: %r"
                          % (filename,))
                return False

            if filetype.is_nuclear and filetype.is_mitochondrial:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False
                elif "Mito" in files:
                    print_err("WARNING: Nuclear + mitochondrial BAM, and "
                              "mitochondrial BAM specified; the mitochondrial "
                              "genome in the first BAM will not be used!")

                files["Nuc"] = filename
                # Keep an earlier, dedicated mitochondrial BAM if present
                files.setdefault("Mito", filename)
            elif filetype.is_nuclear:
                if "Nuc" in files:
                    print_err("ERROR: Two nuclear BAMs specified!")
                    return False

                files["Nuc"] = filename
            elif filetype.is_mitochondrial:
                if "Mito" in files:
                    # Bugfix: message previously said 'nuclear' (copy-paste)
                    print_err("ERROR: Two mitochondrial BAMs specified!")
                    return False

                files["Mito"] = filename
            else:
                print_err("ERROR: BAM does not contain usable nuclear "
                          "or mitochondrial contigs: %r" % (filename,))
                return False

        config.samples[name]["Files"] = files

    return True
+
+
def _read_sample_table(config, filename):
    """Parses a 2 - 3 column tab-separated table containing, on each row, a
    name to be used for a sample in the first column, followed by the paths
    of either one or two BAM files, which must represent a single nuclear
    or a single mitochondrial alignment (2 columns), or both (3 columns).

    Populates config.samples and returns True on success; returns None
    (after printing an error) otherwise.
    """
    print_info("Reading table of samples from %r" % (filename,))

    samples = config.samples = {}
    with fileutils.open_ro(filename) as handle:
        for linenum, line in enumerate(handle, start=1):
            stripped = line.strip()
            # Blank lines and comment lines are ignored
            if not stripped or stripped.startswith("#"):
                continue

            fields = [field
                      for field in line.rstrip('\r\n').split('\t')
                      if field]
            if len(fields) not in (2, 3):
                print_err("Error reading sample table (%r) at line %i; "
                          "expected 2 or 3 columns, found %i; please "
                          "correct file before continuing."
                          % (filename, linenum, len(fields)))
                return

            name = fields[0]
            if name in samples:
                print_err("Duplicate sample name found in sample table "
                          "(%r) at line %i: %r. All sample names must "
                          "be unique!" % (filename, linenum, name))
                return

            samples[name] = {"Root": os.path.join(config.destination, name),
                             "Files": fields[1:]}

    return True
+
+
def _parse_arguments(argv):
    """Parses 'argv' using optparse (with per-host defaults) and returns a
    (config, args) tuple, where 'args' holds the positional arguments.
    Normalizes the mutually-exclusive --indep* options and the
    --treemix-outgroup sample list (see below).
    """
    per_host_cfg = PerHostConfig("zonkey")

    usage_str = "%prog <command> [options] <SampleDB> <bam/sam> [destination]"
    version_str = "%%prog v%s" % (paleomix.__version__,)
    parser = optparse.OptionParser(usage=usage_str,
                                   version=version_str)

    group = optparse.OptionGroup(parser, "Program options")
    group.add_option("--downsample-to",
                     type=int, default=PerHostValue(1000000),
                     help="Number of reads to use for analyses; if 0, no "
                          "downsampling is performed [default: %default]")
    group.add_option("--admixture-replicates",
                     type=int, default=PerHostValue(1),
                     help="Number of admixture replicates to run, before "
                          "the result with the highest likelihood [%default]")
    group.add_option("--treemix-k", type=int, default=PerHostValue(0),
                     help="Value passed to treemix's -k option; number of "
                          "SNPs per block for estimation of the covariance "
                          "matrix. If set to 0, a value will be estimated "
                          "assuming an even distribution of SNPs [%default]")
    group.add_option("--treemix-outgroup", default="",
                     help="Comma-seperated list of samples to use as the "
                          "outgroup when running TreeMix; note that these "
                          "must form a monophyletic clade, or TreeMix will "
                          "not execute.")

    # Undocumented / expert options (hidden from --help)
    group.add_option("--admixture-only", help=optparse.SUPPRESS_HELP,
                     default=False, action="store_true")
    group.add_option("--indep", nargs=3, help=optparse.SUPPRESS_HELP)
    group.add_option("--indep-pairwise", nargs=3, help=optparse.SUPPRESS_HELP)
    group.add_option("--indep-pairphase", nargs=3, help=optparse.SUPPRESS_HELP)

    parser.add_option_group(group)

    paleomix.ui.add_optiongroup(parser,
                                ui_default=PerHostValue("progress"),
                                color_default=PerHostValue("on"))
    paleomix.logger.add_optiongroup(parser, default=PerHostValue("warning"))

    group = optparse.OptionGroup(parser, "Pipeline")
    group.add_option("--dry-run", action="store_true", default=False,
                     help="If passed, only a dry-run in performed, the "
                          "dependency tree is printed, and no tasks are "
                          "executed.")
    group.add_option("--max-threads",
                     type=int, default=PerHostValue(1),
                     help="Maximum number of threads to use [%default]")
    group.add_option("--list-input-files", action="store_true", default=False,
                     help="List all input files used by pipeline for the "
                          "makefile(s), excluding any generated by the "
                          "pipeline itself.")
    group.add_option("--list-output-files", action="store_true", default=False,
                     help="List all output files generated by pipeline for "
                          "the makefile(s).")
    group.add_option("--list-executables", action="store_true", default=False,
                     help="List all executables required by the pipeline, "
                          "with version requirements (if any).")
    group.add_option("--to-dot-file", dest="dot_file",
                     help="Write dependency tree to the specified dot-file.")
    parser.add_option_group(group)

    config, args = per_host_cfg.parse_args(parser, argv)
    paleomix.ui.set_ui_colors(config.ui_colors)

    # At most one of the plink pruning modes (--indep*) may be given
    indep_opts = (config.indep, config.indep_pairwise, config.indep_pairphase)
    if sum(bool(value) for value in indep_opts) > 1:
        parser.error("Multiple --indep* options specified!")

    # Normalize: config.indep_params receives the 3 option values, while
    # config.indep is replaced with the name of the selected plink mode
    if config.indep:
        config.indep_params = config.indep
        config.indep = 'indep'
    elif config.indep_pairwise:
        config.indep_params = config.indep_pairwise
        config.indep = 'indep-pairwise'
    elif config.indep_pairphase:
        config.indep_params = config.indep_pairphase
        config.indep = 'indep-pairphase'

    # The outgroup is stored as a sorted tuple of non-empty sample names
    config.treemix_outgroup \
        = tuple(filter(None, sorted(config.treemix_outgroup.split(","))))

    return config, args
diff --git a/paleomix/tools/zonkey/database.py b/paleomix/tools/zonkey/database.py
new file mode 100644
index 0000000..9f6df53
--- /dev/null
+++ b/paleomix/tools/zonkey/database.py
@@ -0,0 +1,562 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import os
+import tarfile
+
+import pysam
+
+import paleomix.yaml
+
+from paleomix.common.fileutils import \
+ swap_ext
+from paleomix.common.formats.fasta import \
+ FASTA
+from paleomix.tools.zonkey.common import \
+ get_sample_names, \
+ contig_name_to_plink_name
+
+from paleomix.ui import \
+ print_warn, \
+ print_info, \
+ print_err
+
+
# Keys required in 'settings.yaml'; all except 'Plink' must be non-negative
# integers (enforced in ZonkeyDB._read_settings)
_SETTINGS_KEYS = ('Format', 'Revision', 'Plink', 'NChroms', 'MitoPadding',
                  'SNPDistance')
+
+
class BAMInfo(object):
    """Describes what a validated BAM file contains: whether usable nuclear
    alignments were found, and (if any) the name, length, and padding of
    the mitochondrial contig."""

    def __init__(self):
        self.nuclear = False    # set True once nuclear contigs validate
        self.mt_contig = None   # name of the mitochondrial contig, if any
        self.mt_length = None   # length of that contig in the BAM
        self.mt_padding = None  # padding relative to the database sequence

    @property
    def is_nuclear(self):
        return self.nuclear

    @property
    def is_mitochondrial(self):
        # A non-empty contig name implies mitochondrial data was found
        return bool(self.mt_contig)

    def __repr__(self):
        tmpl = "BAMInfo(nuclear=%r, mt_contig=%r, mt_length=%r, mt_padding=%r)"
        values = (self.nuclear, self.mt_contig,
                  self.mt_length, self.mt_padding)
        return tmpl % values
+
+
# Format number for database file; is incremented when the format is changed.
# The 'revision' field specifies updates to the table that do not change the
# format of the database (see below).
_SUPPORTED_DB_FORMAT_MAJOR = 1
_SUPPORTED_DB_FORMAT_MINOR = 20160112

# Required columns in the 'contigs.txt' table; additional columns are ignored
_CONTIGS_TABLE_COLUMNS = frozenset(('ID', 'Size', 'Checksum'))
# Required columns in the 'samples.txt' table; additional columns are ignored
# NOTE(review): the name contains a typo ('SAMPELS'); kept for compatibility
_SAMPELS_TABLE_COLUMNS = frozenset(('ID', 'Group(2)', 'Group(3)', 'Species',
                                    'Sex', 'SampleID', 'Publication'))
+
+
class ZonkeyDBError(RuntimeError):
    """Raised for any validation failure while reading a Zonkey database."""
    pass
+
+
+class ZonkeyDB(object):
    def __init__(self, filename):
        """Opens and fully parses a Zonkey database tar-file, populating the
        settings, contigs, samples, mitochondria, simulations, and
        sample_order attributes; raises ZonkeyDBError if the file is
        missing, not a tar-file, or fails any validation.
        """
        self.filename = filename

        if not os.path.exists(filename):
            raise ZonkeyDBError('Database file does not exist')
        elif not tarfile.is_tarfile(filename):
            raise ZonkeyDBError('Database file is not a valid tar-file')

        print_info('Reading Zonkey database from %r ...' % (filename,))

        # Warn if file is gzip / bzip2 compressed; gives worse throughput
        _check_file_compression(filename)

        with tarfile.open(filename) as tar_handle:
            # Settings must be read first; later tables depend on them
            # (e.g. 'MitoPadding' is used when validating mitochondria)
            print_info('  - Reading settings ...')
            self.settings = self._read_settings(tar_handle, "settings.yaml")
            print_info('  - Reading list of contigs ...')
            self.contigs = self._read_contigs_table(tar_handle, "contigs.txt")
            print_info('  - Reading list of samples ...')
            self.samples = self._read_samples_table(tar_handle, "samples.txt")
            print_info('  - Reading mitochondrial sequences ...')
            self.mitochondria = self._read_mitochondria(tar_handle,
                                                        "mitochondria.fasta")
            print_info('  - Reading emperical admixture distribution ...')
            self.simulations = self._read_simulations(tar_handle,
                                                      "simulations.txt")
            print_info('  - Determining sample order ...')
            self.sample_order = self._read_sample_order(tar_handle,
                                                        "genotypes.txt")

        # Ensure that the individual tables agree with each other
        self._cross_validate()
+
    def validate_bam(self, filename):
        """Validates a sample BAM file, checking that it is either a valid
        mitochondrial BAM (aligned against one of the reference mt
        sequences), or that it is a valid nuclear BAM (aligned against the
        reference).

        Returns a BAMInfo object describing the file on success, or None if
        the file could not be opened or failed validation. (The original
        docstring mentioned INVALID_BAMFILE/NUC_BAMFILE/MITO_BAMFILE
        constants, which do not exist.)
        """
        print_info("  - Validating BAM file %r ... " % (filename,))

        try:
            handle = pysam.Samfile(filename)
        except (ValueError, IOError), error:
            print_err("Error reading BAM: %s" % (error,))
            return

        return self.validate_bam_handle(handle)
+
+ def validate_bam_handle(self, handle):
+ samples = get_sample_names(handle)
+ if len(samples) > 1:
+ print_warn("\nWARNING:")
+ print_warn("BAM read-groups specify more than one sample, "
+ "but this tool treats BAMs as a single sample:")
+
+ for sample in enumerate(samples, start=1):
+ print_warn(" %i: %r" % sample)
+ print_warn("")
+
+ info = BAMInfo()
+ if not _validate_mito_bam(self, handle, info):
+ return
+
+ if not _validate_nuclear_bam(self, handle, info):
+ return
+
+ return info
+
+ def _cross_validate(self):
+ """Cross validates tables to ensure consistency."""
+ genotypes = set(self.sample_order)
+ samples = set(self.samples)
+ differences = (genotypes | samples) - (genotypes & samples)
+ if differences:
+ raise ZonkeyDBError("Mismatch between samples in sample-list and "
+ "genotypes table; some samples not found in "
+ "both tables: %s"
+ % (",".join(differences),))
+
+ if self.mitochondria is None:
+ return
+
+ for name, record in self.mitochondria.iteritems():
+ if name not in self.samples:
+ # Ignore extra reference sequences
+ meta = (record.meta or "").upper()
+ if "EXCLUDE" not in map(str.strip, meta.split(";")):
+ raise ZonkeyDBError("Unexpected mitochondrial sequence: %r"
+ % (name,))
+
+ @classmethod
+ def _read_contigs_table(cls, tar_handle, filename):
+ cls._check_required_file(tar_handle, filename)
+
+ table = cls._read_table(tar_handle, filename, _CONTIGS_TABLE_COLUMNS)
+ for key, row in table.iteritems():
+ try:
+ row["Size"] = int(row["Size"])
+ except ValueError, error:
+ raise ZonkeyDBError("Invalid size specified for sample %r in "
+ "%r: %r" % (key, filename, error))
+
+ if row["Size"] <= 0:
+ raise ZonkeyDBError("Contig size must be >= 0 for %r in %r, "
+ "not %r" % (key, filename, row["Size"]))
+ return table
+
+ @classmethod
+ def _read_samples_table(cls, tar_handle, filename):
+ cls._check_required_file(tar_handle, filename)
+
+ samples = cls._read_table(tar_handle, "samples.txt")
+ if not samples:
+ raise ZonkeyDBError("ERROR: No samples found in genotypes table!")
+
+ for row in samples.itervalues():
+ if row["Sex"].upper() not in ("MALE", "FEMALE", "NA"):
+ raise ZonkeyDBError("ERROR: Unexpected sample sex (%r); "
+ "expected 'MALE', 'FEMALE', or 'NA'"
+ % (row["Sex"],))
+
+ for k_groups in (2, 3):
+ key = "Group(%i)" % (k_groups,)
+ groups = frozenset(row[key] for row in samples.itervalues())
+
+ if len(groups - set('-')) not in (0, k_groups):
+ raise ZonkeyDBError("The %r column in the samples table must "
+ "either contain %i ancestral groups, or "
+ "none" % (key, k_groups))
+
+ return samples
+
+ @classmethod
+ def _read_sample_order(cls, tar_handle, filename):
+ cls._check_required_file(tar_handle, filename)
+
+ handle = tar_handle.extractfile(filename)
+ header = handle.readline().rstrip('\r\n').split('\t')
+ sample_order = tuple(header[-1].split(";"))
+
+ if len(sample_order) != len(set(sample_order)):
+ raise ZonkeyDBError("Duplicate sample names in %r" % (filename,))
+
+ return sample_order
+
+ def _read_mitochondria(self, tar_handle, filename):
+ try:
+ tar_handle.getmember(filename)
+ except KeyError:
+ # Missing MT file is allowed
+ return None
+
+ handle = tar_handle.extractfile(filename)
+
+ results = {}
+ for record in FASTA.from_lines(handle):
+ record = FASTA(name=record.name,
+ meta=record.meta,
+ sequence=record.sequence.upper())
+
+ unexpected = set(record.sequence) - set("ACGTN-")
+ if unexpected:
+ unexpected = ", ".join(map(repr, sorted(unexpected)))
+ raise ZonkeyDBError("Unexpected nucleotide in %s; only A, C, "
+ "G, T, N, and - are allowed, not %s"
+ % (unexpected, filename))
+ elif record.name in results:
+ raise ZonkeyDBError("Duplicate sequence name in %s: %r"
+ % (filename, record.name))
+
+ results[record.name] = record
+
+ lengths = frozenset(len(record.sequence)
+ for record in results.itervalues())
+
+ if not lengths:
+ raise ZonkeyDBError("No mitochondrial sequences found in %r"
+ % (filename,))
+ elif len(lengths) > 2:
+ lengths = tuple(sorted(lengths))
+ lengths_s = "%s, and %s" % (", ".join(map(str, lengths[:-1])),
+ lengths[-1])
+
+ raise ZonkeyDBError("At most two different sequence lengths "
+ "expected for mitochondrial sequences, but "
+ "found %i different lengths in %r: %s"
+ % (len(lengths), filename, lengths_s))
+ elif len(lengths) != 1:
+ # Unpadded sequences are allowed
+ delta_len = max(lengths) - min(lengths)
+ mito_padding = self.settings["MitoPadding"]
+
+ if (delta_len != mito_padding):
+ raise ZonkeyDBError("Length difference between mitochondrial "
+ "sequences in %r does not match the "
+ "padding; expected a difference of %i bp, "
+ "but found a %i bp difference."
+ % (filename, mito_padding, delta_len))
+
+ return results
+
+ @classmethod
+ def _read_settings(cls, tar_handle, filename):
+ cls._check_required_file(tar_handle, filename)
+
+ handle = tar_handle.extractfile(filename)
+
+ try:
+ result = paleomix.yaml.safe_load(handle.read())
+ except paleomix.yaml.YAMLError, error:
+ raise ZonkeyDBError("Error reading settings file %r; %s"
+ % (filename, error))
+
+ for key in _SETTINGS_KEYS:
+ if key != "Plink":
+ if not isinstance(result[key], int) or result[key] < 0:
+ raise ZonkeyDBError("Value for %r in %s must be an non-"
+ "negative integer, not %r"
+ % (key, filename, result[key]))
+ elif not isinstance(result[key], str):
+ raise ZonkeyDBError("Value for %r in %s must be a string, "
+ "not %r"
+ % (key, filename, result[key]))
+
+ if result["Format"] > _SUPPORTED_DB_FORMAT_MAJOR:
+ raise ZonkeyDBError("Database version is too old; this version of "
+ "PALEOMIX supports the Zonkey DB v%i, but the "
+ "database is v%i; download an updated "
+ "database to continue."
+ % (_SUPPORTED_DB_FORMAT_MAJOR,
+ result["Format"]))
+ elif result["Format"] < _SUPPORTED_DB_FORMAT_MAJOR:
+ raise ZonkeyDBError("Database version is too new; this version of "
+ "PALEOMIX supports the Zonkey DB v%i, but the "
+ "database is v%i; upgrade PALEOMIX to "
+ "continue."
+ % (_SUPPORTED_DB_FORMAT_MAJOR,
+ result["Format"]))
+ elif result["Revision"] < _SUPPORTED_DB_FORMAT_MINOR:
+ raise ZonkeyDBError("Database version is too old; this version of "
+ "PALEOMIX supports the Zonkey DB v%i, rev. %i "
+ "or newer, but the database is v%i rev. %i; "
+ "please download an updated database to "
+ "continue."
+ % (_SUPPORTED_DB_FORMAT_MAJOR,
+ _SUPPORTED_DB_FORMAT_MINOR,
+ result["Format"],
+ result["Revision"]))
+
+ return result
+
+ def _read_simulations(self, tar_handle, filename):
+ try:
+ handle = tar_handle.extractfile(filename)
+ except KeyError:
+ # Missing simulations file is allowed
+ return None
+
+ header = handle.readline().rstrip().split('\t')
+
+ required_keys = set(('NReads', 'K', 'Sample1', 'Sample2', 'HasTS',
+ 'Percentile', 'Value'))
+ missing_keys = required_keys - set(header)
+ if missing_keys:
+ missing_keys = ', '.join(map(repr, missing_keys))
+ raise ZonkeyDBError('Simulations table %r does not contain all '
+ 'required columns; columns %r are missing!'
+ % (filename, missing_keys))
+
+ result = []
+ for linenum, line in enumerate(handle, start=2):
+ fields = line.strip().split('\t')
+ if len(fields) != len(header):
+ raise ZonkeyDBError("Line %i in simulations table %r, does "
+ "not contain the expected number of "
+ "columns; expected %i, but found %i!"
+ % (linenum, filename,
+ len(header), len(fields)))
+
+ row = dict(zip(header, fields))
+
+ if row['HasTS'] not in ('TRUE', 'FALSE'):
+ pass
+
+ row['HasTS'] = (row['HasTS'] == 'TRUE')
+
+ for key in ('NReads', 'K'):
+ try:
+ row[key] = int(row[key])
+ except ValueError:
+ raise ZonkeyDBError('Malformed value for column %r at '
+ 'line %i in simulations table %r; '
+ 'expected int, found %r'
+ % (key, linenum, filename, row[key]))
+
+ for key in ('Percentile', 'Value'):
+ try:
+ row[key] = float(row[key])
+ except ValueError:
+ raise ZonkeyDBError('Malformed value for column %r at '
+ 'line %i in simulations table %r; '
+ 'expected float, found %r'
+ % (key, linenum, filename, row[key]))
+
+ for key in ('Sample1', 'Sample2'):
+ group_key = 'Group(%i)' % (row['K'],)
+ groups = frozenset(row[group_key]
+ for row in self.samples.itervalues())
+
+ if row[key] not in groups and row[key] != '-':
+ raise ZonkeyDBError('Invalid group in column %r in '
+ 'simulations table %r: %r'
+ % (key, filename, row[key]))
+
+ result.append(row)
+
+ return result
+
+ @classmethod
+ def _check_required_file(cls, tar_handle, filename):
+ try:
+ obj = tar_handle.getmember(filename)
+ except KeyError:
+ raise ZonkeyDBError("Database does not contain required file %r; "
+ "please ensure that this is a valid Zonkey "
+ "database file!" % (filename,))
+
+ if not obj.isfile():
+ raise ZonkeyDBError("Object %r in Zonkey database is not a "
+ "file; please ensure that this is a valid "
+ "Zonkey database file!"
+ % (filename,))
+
+ @classmethod
+ def _read_table(cls, tar_handle, filename, requied_columns=()):
+ requied_columns = frozenset(requied_columns) | frozenset(("ID",))
+ handle = tar_handle.extractfile(filename)
+ result = {}
+
+ try:
+ header = handle.readline().rstrip('\r\n').split('\t')
+ if len(header) != len(set(header)):
+ raise ZonkeyDBError("Table %r does contains duplicate columns!"
+ % (filename,))
+
+ if requied_columns - set(header):
+ raise ZonkeyDBError("Required columns are missign in table "
+ "%r: %s" % (filename, ", ".join()))
+
+ for linenum, line in enumerate(handle):
+ fields = line.rstrip('\r\n').split('\t')
+
+ if len(fields) != len(header):
+ raise ZonkeyDBError("Error reading %r at line %i; "
+ "expected %i columns, found %i "
+ "columns!"
+ % (filename, linenum,
+ len(header), len(fields)))
+
+ row = dict(zip(header, fields))
+ if row["ID"] in result:
+ raise ZonkeyDBError("Duplicate IDs in %r: %s"
+ % (filename, row["ID"]))
+
+ result[row["ID"]] = row
+ finally:
+ handle.close()
+
+ return result
+
+
def _validate_mito_bam(data, handle, info):
    """Validates the mitochondrial portion of a BAM against the database:
    the first BAM contig matching a database mt sequence must have the
    expected (unpadded) length, and info is populated only if reads are
    actually aligned to it. Returns False only on a length mismatch."""
    if data.mitochondria is None:
        # No mitochondrial data .. skip phylogeny
        return True

    references = handle.references
    min_length = min((len(record.sequence))
                     for record in data.mitochondria.itervalues())

    for bam_contig, bam_length in zip(references, handle.lengths):
        if bam_contig not in data.mitochondria:
            continue

        db_sequence = data.mitochondria[bam_contig].sequence
        db_length = len(db_sequence) - db_sequence.count("-")

        if bam_length != db_length:
            print_err("ERROR: Length of mitochondrial contig %r (%i bp) "
                      "does not match the length of the corresponding "
                      "sequence in the database (%i bp)"
                      % (bam_contig, bam_length, db_length))
            return False

        # idxstats requires an index; build one if none is present
        if not os.path.exists(handle.filename + '.bai') \
                and not os.path.exists(swap_ext(handle.filename, '.bai')):
            print_info('    - Attempting to index BAM file %r!'
                       % (handle.filename,))
            pysam.index(handle.filename)

        # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
        for line in "".join(pysam.idxstats(handle.filename)).split('\n'):
            line = line.strip()
            if not line:
                continue

            name, _, hits, _ = line.split('\t')
            if (name == bam_contig) and not int(hits):
                # No aligned reads: accept the BAM, but leave info's mt
                # fields unset so no phylogeny is attempted
                # (typo fixed: 'an phylogeny' -> 'a phylogeny')
                print_err("WARNING: Mitochondrial BAM (%r) does not contain "
                          "any reads aligned to contig %r; inferring a "
                          "phylogeny is not possible."
                          % (handle.filename, name))
                return True

        info.mt_contig = bam_contig
        info.mt_length = bam_length
        info.mt_padding = len(db_sequence) - min_length

        return True

    # No contig matched the database; nothing mitochondrial to validate
    return True
+
+
def _validate_nuclear_bam(data, handle, info):
    """Checks that nuclear chromosomes in the BAM match the sizes recorded
    in the database; sets info.nuclear when the full set is present.
    Returns False on a size mismatch or an incomplete chromosome set."""
    # Check that chromosomes are of expected size; unused chroms are ignored.
    bam_contigs = dict(zip(map(contig_name_to_plink_name, handle.references),
                           handle.lengths))
    ref_contigs = data.contigs

    contigs_found = {}
    for name, stats in sorted(ref_contigs.iteritems()):
        if name not in bam_contigs:
            contigs_found[name] = False
        elif bam_contigs[name] != stats["Size"]:
            # Bugfix: 'Expected' is the size from the data file and 'Found'
            # the size observed in the BAM; the values were swapped
            print_err("\nERROR: Chrom %r in the BAM does not match the "
                      "length specified in data file:\n"
                      "    - Expected: %i\n"
                      "    - Found: %i"
                      % (name, stats["Size"], bam_contigs[name]))

            return False
        else:
            contigs_found[name] = True

    if any(contigs_found.itervalues()):
        if not all(contigs_found.itervalues()):
            print_err("\nERROR: Not all nuclear chromosomes found in BAM:")
            for (name, stats) in sorted(ref_contigs.iteritems()):
                is_found = "Found" if contigs_found[name] else "Not found!"
                print_err("  - %s: %s" % (name, is_found))

            return False
        else:
            info.nuclear = True

    return True
+
+
+def _check_file_compression(filename):
+ try:
+ with open(filename) as handle:
+ header = handle.read(2)
+
+ if header == "\x1f\x8b":
+ print_warn('\nWARNING:\n'
+ 'Zonkey database file %r is gzip compressed;\n'
+ 'uncompressing the archive is recommended:\n'
+ ' $ gunzip "%s"\n' % (filename, filename))
+ elif header == "BZ":
+ print_warn('\nWARNING:\n'
+ 'Zonkey database file %r is bzip2 compressed;\n'
+ 'uncompressing the archive is recommended:\n'
+ ' $ bunzip2 "%s"\n' % (filename, filename))
+ except IOError:
+ # Errors are ignored at this stage
+ pass
diff --git a/paleomix/tools/zonkey/parts/__init__.py b/paleomix/tools/zonkey/parts/__init__.py
new file mode 100644
index 0000000..d094e34
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/__init__.py
@@ -0,0 +1,21 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
diff --git a/paleomix/tools/zonkey/parts/admixture.py b/paleomix/tools/zonkey/parts/admixture.py
new file mode 100644
index 0000000..cae5d4d
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/admixture.py
@@ -0,0 +1,171 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""
+
+Parsing and validation of admixture results.
+
+"""
+import collections
+
+
+CUTOFF = 0.001
+
+
class AdmixtureError(RuntimeError):
    """Raised when ADMIXTURE output is malformed or internally
    inconsistent (e.g. one reference group split across several
    ancestral populations)."""
    pass
+
+
def read_admixture_results(filename, data, k_groups, cutoff=CUTOFF):
    """Read an ADMIXTURE .Q file and return the ancestral-group make-up.

    Arguments:
      filename -- path to the ADMIXTURE .Q output file.
      data -- database object; 'sample_order' gives the row order of the
              reference samples and 'samples' maps names to metadata rows
              containing the per-K group label ("Group(K)").
      k_groups -- number of ancestral populations (K) used in the run.
      cutoff -- minimum admixture proportion counted as group membership.

    Returns a list with one [groups, value] entry per ancestral population,
    where 'groups' is the set of reference groups assigned to it and
    'value' is the admixture proportion of the analyzed sample ("-").

    Raises AdmixtureError if the file is malformed or inconsistent.
    """
    key = "Group(%i)" % (k_groups,)
    # The analyzed sample is the final row, denoted by "-".
    names = tuple(data.sample_order) + ("-",)
    table = _admixture_read_results(filename, names)
    _admixture_validate_ancestral_groups(data, table, k_groups, cutoff)

    ancestral_groups = [[set(), value] for value in table["-"]]
    # items() instead of Py2-only iteritems(), for Py2/Py3 compatibility.
    for sample, row in table.items():
        if sample == '-':
            continue

        group = data.samples[sample][key]
        for index, value in enumerate(row):
            if value >= cutoff:
                ancestral_groups[index][0].add(group)

    return ancestral_groups
+
+
def get_percentiles(data, sample1, sample2, nreads, k_groups, has_ts, value):
    """Return percentile bounds for 'value' from simulated comparisons.

    Looks up the simulations bracketing 'nreads' from below and above;
    for each side that exists, records the simulated read count used and
    the (lower, upper) percentile range containing 'value'.
    """
    results = {'Sample1': sample1,
               'Sample2': sample2}

    observed = [row['NReads'] for row in data.simulations]
    brackets = (('Lower', max, set(n for n in observed if n <= nreads)),
                ('Upper', min, set(n for n in observed if n >= nreads)))

    for key, pick, pool in brackets:
        if pool:
            bracket_nreads = pick(pool)
            selection = _select_simulations(data=data,
                                            sample1=sample1,
                                            sample2=sample2,
                                            nreads=bracket_nreads,
                                            k_groups=k_groups,
                                            has_ts=has_ts)
            low, high = _get_percentile_range(selection, value)
            results[key] = {'NReads': bracket_nreads,
                            'Lower': low,
                            'Upper': high}

    return results
+
+
+def _select_simulations(data, sample1, sample2, nreads, k_groups, has_ts):
+ selection = []
+ samples = frozenset((sample1, sample2))
+ for row in data.simulations:
+ if row['K'] != k_groups or row['HasTS'] != has_ts:
+ continue
+ elif row['NReads'] != nreads:
+ continue
+ elif frozenset((row['Sample1'], row['Sample2'])) != samples:
+ continue
+
+ selection.append(row)
+
+ return selection
+
+
+def _get_percentile_range(selection, value):
+ selection = [(row['Percentile'], row['Value'])
+ for row in selection]
+ selection.sort()
+
+ lower_bound = 0.0
+ upper_bound = 1.0
+
+ for cur_pct, cur_value in selection:
+ if cur_value > value:
+ break
+
+ lower_bound = cur_pct
+
+ for cur_pct, cur_value in reversed(selection):
+ if cur_value < value:
+ break
+
+ upper_bound = cur_pct
+
+ return lower_bound, upper_bound
+
+
+def _admixture_read_results(filename, samples):
+ with open(filename) as handle:
+ lines = handle.readlines()
+
+ if len(samples) != len(lines):
+ raise AdmixtureError("unexpected number of lines in admixture file; "
+ "expected %i samples, found %i"
+ % (len(samples), len(lines)))
+
+ result = {}
+ for name, line in zip(samples, lines):
+ result[name] = [float(value) for value in line.split()]
+
+ return result
+
+
+def _admixture_validate_ancestral_groups(data, table, k_groups, cutoff):
+ key = "Group(%i)" % (k_groups,)
+ groups = collections.defaultdict(dict)
+ for sample, row in table.iteritems():
+ if sample not in data.samples:
+ continue
+
+ group = data.samples[sample][key]
+ for index, value in enumerate(row):
+ if value >= cutoff:
+ groups[group][index] = True
+
+ mixed_groups = []
+ for group, memberships in sorted(groups.iteritems()):
+ count = len(memberships)
+
+ if count > 1:
+ mixed_groups.append("member(s) of reference group %s assigned to "
+ "%i ancestral populations" % (group, count))
+
+ if mixed_groups:
+ raise AdmixtureError("Inconsistent ADMIXTURE results: %s; "
+ "cannot determine ancestry!"
+ % ("; ".join(mixed_groups)))
diff --git a/paleomix/tools/zonkey/parts/common.py b/paleomix/tools/zonkey/parts/common.py
new file mode 100644
index 0000000..83b57c9
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/common.py
@@ -0,0 +1,65 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import paleomix.common.fileutils as fileutils
+
+from paleomix.node import Node
+
+
# Default palette used to color reference-sample groups in plots; the
# analyzed sample itself is always drawn in black ("#000000"). NOTE(review):
# these hex values look like the Okabe & Ito color-blind-safe palette —
# confirm before documenting as such.
_DEFAULT_COLORS = ("#E69F00", "#56B4E9",
                   "#009E73", "#F0E442",
                   "#0072B2", "#D55E00",
                   "#CC79A7")
+
+
class WriteSampleList(Node):
    """Node writing a tab-separated sample list (Name/Group/Color) for use
    by the plotting scripts; reference samples are colored by group and
    the analyzed sample ("Sample") is always black."""

    def __init__(self, config, output_file, dependencies=()):
        # Mapping of reference-sample names to their metadata rows.
        self._samples = config.database.samples

        Node.__init__(self,
                      description="<WriteSampleList -> %r>" % (output_file,),
                      input_files=(config.tablefile,),
                      output_files=(output_file,),
                      dependencies=dependencies)

    def _run(self, config, temp):
        output_file, = self.output_files
        samples = self._samples
        # values()/items() instead of Py2-only itervalues()/iteritems(),
        # for Py2/Py3 compatibility.
        groups = set(sample["Group(3)"] for sample in samples.values())
        # Sort groups before assigning colors; set iteration order is
        # arbitrary, which previously made color assignment vary per run.
        colors = dict(zip(sorted(groups), _DEFAULT_COLORS))

        with open(fileutils.reroot_path(temp, output_file), "w") as handle:
            handle.write("Name\tGroup\tColor\n")

            for name, sample in sorted(samples.items()):
                group = sample["Group(3)"]
                color = colors[group]

                handle.write("%s\t%s\t%s\n" % (name, group, color))

            # The analyzed sample itself; always drawn in black.
            handle.write("Sample\t-\t#000000\n")

    def _teardown(self, config, temp):
        # Write to the temp dir and move into place on success, so failed
        # runs do not leave partial output files behind.
        destination, = self.output_files
        source = fileutils.reroot_path(temp, destination)

        fileutils.move_file(source, destination)
diff --git a/paleomix/tools/zonkey/parts/mitochondria.py b/paleomix/tools/zonkey/parts/mitochondria.py
new file mode 100644
index 0000000..2bf3749
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/mitochondria.py
@@ -0,0 +1,100 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+import paleomix.common.rtools as rtools
+import paleomix.tools.factory as factory
+
+from paleomix.atomiccmd.command import AtomicCmd
+from paleomix.common.formats.newick import Newick
+from paleomix.node import CommandNode
+
+from paleomix.tools.zonkey.common import RSCRIPT_VERSION
+
+
class MitoConsensusNode(CommandNode):
    """Node running the bundled 'zonkey_mito' command to build a
    mitochondrial consensus from a BAM, producing PHYLIP, FASTA, and
    summary files at 'output_prefix'."""

    def __init__(self, database, bamfile, output_prefix, dependencies=()):
        # Positional arguments: database, BAM, and output prefix (the
        # prefix is relative to the temp dir while the command runs).
        cmd = factory.new("zonkey_mito")
        cmd.add_value("%(IN_DATABASE)s")
        cmd.add_value("%(IN_BAMFILE)s")
        cmd.add_value("%(TEMP_OUT_PREFIX)s")

        cmd.set_kwargs(IN_DATABASE=database,
                       IN_BAMFILE=bamfile,
                       TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                       OUT_PHYLIP=output_prefix + ".phy",
                       OUT_FASTA=output_prefix + ".fasta",
                       OUT_SUMMARY=output_prefix + ".summary")

        CommandNode.__init__(self,
                             description="<MitoConsensus -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)
+
+
class DrawPhylogenyNode(CommandNode):
    """Node drawing the mitochondrial phylogeny (PDF + PNG) using the
    bundled 'tinytree.r' script.

    Before plotting, the input tree is rerooted on its midpoint and
    annotated with bootstrap support values (see _setup).
    """

    def __init__(self, samples, treefile, bootstraps, output_prefix,
                 dependencies=()):
        rscript = rtools.rscript("zonkey", "tinytree.r")

        cmd = AtomicCmd(("Rscript", rscript,
                         "%(TEMP_OUT_FILE)s",
                         "%(IN_SAMPLES)s",
                         "%(TEMP_OUT_PREFIX)s"),
                        AUX_RSCRIPT=rscript,
                        IN_SAMPLES=samples,
                        IN_FILE=treefile,
                        IN_BOOTSTRAPS=bootstraps,
                        TEMP_OUT_FILE="rerooted.newick",
                        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                        OUT_TREE_PDF=output_prefix + ".pdf",
                        OUT_TREE_PNG=output_prefix + ".png",
                        CHECK_RSCRIPT=RSCRIPT_VERSION,
                        CHECK_RSCRIPT_APE=rtools.requirement("ape"),
                        CHECK_RSCRIPT_GGPLOT2=rtools.requirement("ggplot2"),
                        CHECK_RSCRIPT_GRID=rtools.requirement("grid"))

        # Kept for _setup, where the rerooted tree is generated.
        self._treefile = treefile
        self._bootstraps = bootstraps

        CommandNode.__init__(self,
                             description="<DrawPhylogeny -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        # One bootstrap replicate tree per line in the bootstraps file.
        with open(self._bootstraps) as handle:
            bootstraps = [Newick.from_string(line.strip())
                          for line in handle]

        with open(self._treefile) as handle:
            tree = Newick.from_string(handle.read().strip())

        # Midpoint-root the tree, then label internal nodes with bootstrap
        # support formatted as whole percentages.
        tree = tree.reroot_on_midpoint()
        tree = tree.add_support(bootstraps, "{Percentage:.0f}")
        with open(os.path.join(temp, "rerooted.newick"), "w") as handle:
            handle.write("{}\n".format(tree))

        CommandNode._setup(self, config, temp)
diff --git a/paleomix/tools/zonkey/parts/nuclear.py b/paleomix/tools/zonkey/parts/nuclear.py
new file mode 100644
index 0000000..62edc4e
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/nuclear.py
@@ -0,0 +1,755 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import hashlib
+import itertools
+import math
+import os
+import random
+
+import pysam
+
+import paleomix.common.fileutils as fileutils
+import paleomix.common.rtools as rtools
+import paleomix.common.versions as versions
+import paleomix.tools.factory as factory
+
+from paleomix.atomiccmd.builder import AtomicCmdBuilder
+from paleomix.atomiccmd.command import AtomicCmd
+from paleomix.atomiccmd.sets import SequentialCmds
+from paleomix.node import CommandNode, Node, NodeError
+
+from paleomix.tools.zonkey.common import \
+ RSCRIPT_VERSION, \
+ contig_name_to_plink_name, \
+ read_summary
+
+
# Minimum versions of the external tools required for nuclear analyses.
# The search patterns are raw strings: "\d" in a normal string literal is
# an invalid escape sequence (DeprecationWarning on Python 3.6+).
ADMIXTURE_VERSION = versions.Requirement(call=("admixture", "--version"),
                                         search=r"(\d+)\.(\d+)",
                                         checks=versions.GE(1, 3))

PLINK_VERSION = versions.Requirement(call=("plink", "--noweb", "--help",
                                           "--out", "/tmp/plink"),
                                     search=r"v(\d+)\.(\d+)",
                                     checks=versions.GE(1, 7))

SMARTPCA_VERSION = versions.Requirement(call=("smartpca",),
                                        search=r"version: (\d+)",
                                        checks=versions.GE(13050))

TREEMIX_VERSION = versions.Requirement(call=("treemix",),
                                       search=r"TreeMix v. (\d+)\.(\d+)",
                                       checks=versions.GE(1, 12))
+
+
class BuildTPEDFilesNode(CommandNode):
    """Node running the bundled 'zonkey_tped' command to convert a BAM
    into PLINK TPED/TFAM files, producing one table including and one
    excluding transitions, plus a summary file."""

    def __init__(self, output_root, table, bamfile, downsample,
                 dependencies=()):
        cmd = factory.new("zonkey_tped")
        cmd.set_option("--name", "Sample")
        cmd.set_option("--downsample", downsample)
        cmd.add_value("%(TEMP_DIR)s")
        cmd.add_value("%(IN_TABLE)s")
        cmd.add_value("%(IN_BAM)s")

        if not downsample:
            # Needed for random access (chromosomes are read 1 ... 31)
            cmd.set_kwargs(IN_BAI=fileutils.swap_ext(bamfile, ".bai"))

        cmd.set_kwargs(OUT_TFAM=os.path.join(output_root, "common.tfam"),
                       OUT_SUMMARY=os.path.join(output_root, "common.summary"),
                       OUT_TPED_INCL_TS=os.path.join(output_root,
                                                     "incl_ts.tped"),
                       OUT_TPED_EXCL_TS=os.path.join(output_root,
                                                     "excl_ts.tped"),
                       IN_TABLE=table,
                       IN_BAM=bamfile)

        CommandNode.__init__(self,
                             description="<BuildTPEDFiles -> %r>"
                                         % (os.path.join(output_root, '*'),),
                             command=cmd.finalize(),
                             dependencies=dependencies)
+
+
class BuildBEDFilesNode(CommandNode):
    """Node running 'plink --make-bed' to convert TPED/TFAM files into
    binary PLINK (BED/BIM/FAM) files.

    The 'indep_filter' and 'indep_parameters' arguments are accepted but
    unused, keeping this class call-compatible with
    BuildFilteredBEDFilesNode.
    """

    def __init__(self, output_prefix, tfam, tped,
                 indep_filter=None, indep_parameters=None,
                 plink_parameters=None,
                 dependencies=()):
        temp_prefix = os.path.basename(output_prefix)

        plink_cmd = ["plink", "--make-bed", "--noweb",
                     "--tped", "%(IN_TPED)s",
                     "--tfam", "%(IN_TFAM)s",
                     "--out", "%(TEMP_OUT_PREFIX)s"]

        plink_cmd.extend(self._parse_parameters(plink_parameters))

        command = AtomicCmd(plink_cmd,
                            IN_TPED=tped,
                            IN_TFAM=tfam,
                            TEMP_OUT_PREFIX=temp_prefix,
                            OUT_BED=output_prefix + ".bed",
                            OUT_BIM=output_prefix + ".bim",
                            OUT_FAM=output_prefix + ".fam",
                            OUT_LOG=output_prefix + ".log",
                            TEMP_OUT_NOSEX=temp_prefix + ".nosex",
                            TEMP_OUT_NOF=temp_prefix + ".nof",
                            CHECK_VERSION=PLINK_VERSION,
                            set_cwd=True)

        CommandNode.__init__(self,
                             description="<BuildBEDFiles -> '%s.*'>"
                                         % (output_prefix,),
                             command=command,
                             dependencies=dependencies)

    @classmethod
    def _parse_parameters(cls, parameters):
        # Guard against the default None, which would otherwise raise
        # AttributeError on None.split().
        return parameters.split() if parameters else []
+
+
class BuildFilteredBEDFilesNode(CommandNode):
    """Node running PLINK twice: first to compute an LD-based list of SNPs
    to keep (one of the --indep* filters), then to build binary
    BED/BIM/FAM files restricted (--extract) to those SNPs.
    """

    def __init__(self, output_prefix, tfam, tped,
                 indep_filter=None, indep_parameters=None,
                 plink_parameters=None,
                 dependencies=()):

        # Internal sanity-checks; callers are expected to pass values
        # validated during configuration.
        assert indep_filter in ('indep',
                                'indep-pairphase',
                                'indep-pairwise'), indep_filter
        assert len(indep_parameters) == 3, indep_parameters

        parameters = self._parse_parameters(plink_parameters)

        # Step 1: compute the list of SNPs in approximate linkage
        # equilibrium ('indep.prune.in').
        plink_cmd = ["plink", "--noweb",
                     "--tped", "%(IN_TPED)s",
                     "--tfam", "%(IN_TFAM)s",
                     "--out", "%(TEMP_OUT_PREFIX)s",
                     '--' + indep_filter]
        plink_cmd.extend(indep_parameters)
        plink_cmd.extend(parameters)

        cmd_indep = AtomicCmd(plink_cmd,
                              IN_TFAM=tfam,
                              IN_TPED=tped,
                              TEMP_OUT_PREFIX="indep",
                              TEMP_OUT_LOG="indep.log",
                              TEMP_OUT_NOSEX="indep.nosex",
                              TEMP_OUT_PRUNE_IN="indep.prune.in",
                              TEMP_OUT_PRUNE_OUT="indep.prune.out",
                              set_cwd=True)

        # Step 2: build BED files containing only the retained SNPs.
        basename = os.path.basename(output_prefix)
        cmd_filter = AtomicCmd(["plink", "--noweb", "--make-bed",
                                "--tped", "%(IN_TPED)s",
                                "--tfam", "%(IN_TFAM)s",
                                "--extract", "%(TEMP_IN_PRUNE)s",
                                "--out", "%(TEMP_OUT_PREFIX)s"] +
                               parameters,
                               IN_TFAM=tfam,
                               IN_TPED=tped,
                               TEMP_OUT_PREFIX=basename,
                               TEMP_IN_PRUNE="indep.prune.in",
                               TEMP_OUT_NOSEX=basename + ".nosex",
                               TEMP_OUT_LOG=basename + ".log",
                               OUT_LOG=output_prefix + ".log",
                               OUT_BED=output_prefix + ".bed",
                               OUT_BIM=output_prefix + ".bim",
                               OUT_FAM=output_prefix + ".fam",
                               set_cwd=True)

        CommandNode.__init__(self,
                             description="<BuildFilteredBEDFiles -> '%s.*'>"
                                         % (output_prefix,),
                             command=SequentialCmds((cmd_indep, cmd_filter)),
                             dependencies=dependencies)

    @classmethod
    def _parse_parameters(cls, parameters):
        # Guard against the default None, which would otherwise raise
        # AttributeError on None.split().
        return parameters.split() if parameters else []
+
+
class AdmixtureNode(CommandNode):
    """Node running ADMIXTURE (optionally supervised) on BED/BIM/FAM
    files, producing .P, .Q, and .log files for a given K.

    The run is supervised when any reference sample has a group label
    (other than "-") for the requested K; the .pop file required for
    supervised runs is generated in _setup.
    """

    def __init__(self, input_file, k_groups, output_root,
                 samples=None, dependencies=()):
        self._samples = samples
        self._input_file = input_file
        self._k_groups = k_groups

        group_key = "Group(%i)" % (self._k_groups,)
        # values() instead of Py2-only itervalues(), for Py2/Py3 compat.
        self._supervised = samples and any((row[group_key] != '-')
                                           for row in samples.values())

        assert k_groups in (2, 3), k_groups
        prefix = os.path.splitext(os.path.basename(input_file))[0]
        output_prefix = os.path.join(output_root,
                                     "%s.%i" % (prefix, k_groups))

        cmd = AtomicCmdBuilder("admixture",
                               IN_FILE_BED=input_file,
                               IN_FILE_BIM=fileutils.swap_ext(input_file,
                                                              ".bim"),
                               IN_FILE_FAM=fileutils.swap_ext(input_file,
                                                              ".fam"),

                               TEMP_OUT_FILE_BED=prefix + ".bed",
                               TEMP_OUT_FILE_BIM=prefix + ".bim",
                               TEMP_OUT_FILE_FAM=prefix + ".fam",
                               TEMP_OUT_FILE_POP=prefix + ".pop",

                               OUT_P=output_prefix + ".P",
                               OUT_Q=output_prefix + ".Q",
                               OUT_STDOUT=output_prefix + ".log",

                               CHECK_VERSION=ADMIXTURE_VERSION,
                               set_cwd=True)

        # Randomize the seed; replicate runs are compared downstream and
        # the best likelihood selected (see SelectBestAdmixtureNode).
        cmd.set_option("-s", random.randint(0, 2 ** 16 - 1))

        if self._supervised:
            cmd.set_option("--supervised")

        cmd.add_value("%(TEMP_OUT_FILE_BED)s")
        cmd.add_value(int(k_groups))

        # NOTE: Original description contained a stray quote ("'%s.*''>");
        # fixed for consistency with the other nodes in this module.
        CommandNode.__init__(self,
                             description="<Admixture -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd.finalize(),
                             dependencies=dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)

        # ADMIXTURE writes its output next to the input files, so symlink
        # the input files into the temporary directory.
        input_files = [
            self._input_file,
            fileutils.swap_ext(self._input_file, ".bim"),
            fileutils.swap_ext(self._input_file, ".fam"),
        ]

        for filename in input_files:
            basename = os.path.basename(filename)
            os.symlink(os.path.abspath(filename), os.path.join(temp, basename))

        if self._supervised:
            # Supervised runs require a .pop file listing, for each row of
            # the .fam file, the assigned group ("-" when unassigned).
            fam_filename = fileutils.swap_ext(self._input_file, ".fam")

            pop_filename = fileutils.swap_ext(fam_filename, ".pop")
            pop_filename = fileutils.reroot_path(temp, pop_filename)

            key = "Group(%i)" % (self._k_groups,)
            with open(fam_filename) as fam_handle:
                with open(pop_filename, "w") as pop_handle:
                    for line in fam_handle:
                        sample, _ = line.split(None, 1)
                        group = self._samples.get(sample, {}).get(key, "-")

                        pop_handle.write("%s\n" % (group,))
+
+
class SelectBestAdmixtureNode(Node):
    """Node selecting, among several ADMIXTURE replicate runs, the one
    with the highest log-likelihood, and copying that replicate's output
    files into 'output_root'.

    All replicates must produce the same set of (base)filenames,
    including a '.log' file containing the final likelihood.
    """

    def __init__(self, replicates, output_root, dependencies=()):
        replicates = tuple(replicates)
        if not replicates:
            raise ValueError("No replicates passed to SelectBestAdmixture")

        input_files = []
        ref_filenames = None
        for node in replicates:
            filenames = frozenset(os.path.basename(filename)
                                  for filename in node.output_files)

            # Every replicate must produce an identical set of filenames,
            # since the winning set is copied wholesale to output_root.
            if ref_filenames is None:
                ref_filenames = filenames
            elif ref_filenames != filenames:
                raise RuntimeError("Node %r does not contain expected files, "
                                   "%r, vs %r" % (node, ref_filenames,
                                                  filenames))

            input_files.extend(node.output_files)

        output_files = [os.path.join(output_root, filename)
                        for filename in ref_filenames]

        self._ref_filenames = ref_filenames
        self._files = tuple(node.output_files for node in replicates)
        self._output_root = output_root

        Node.__init__(self,
                      description="<SelectBestAdmixture -> %r>"
                                  % (output_root,),
                      input_files=input_files,
                      output_files=output_files,
                      dependencies=tuple(dependencies) + tuple(replicates))

    def _run(self, config, temp):
        # Pair each replicate's log-likelihood with its file-set; max()
        # below picks the highest likelihood (ties broken by comparing the
        # file-sets themselves).
        likelihoods = []
        for fileset in self._files:
            for filename in fileset:
                if filename.endswith(".log"):
                    likelihoods.append((self._read_admixture_log(filename),
                                        fileset))
                    break
            else:
                raise NodeError("No log-file found in list of admixture "
                                "output-files: %r" % (fileset,))

        _, fileset = max(likelihoods)
        for src_filename in fileset:
            dst_filename = fileutils.reroot_path(self._output_root,
                                                 src_filename)
            fileutils.copy_file(src_filename, dst_filename)

    @classmethod
    def _read_admixture_log(cls, filename):
        """Return the log-likelihood reported in an ADMIXTURE log-file,
        i.e. the value following the first 'Loglikelihood:' prefix."""
        with open(filename) as handle:
            for line in handle:
                if line.startswith("Loglikelihood:"):
                    return float(line.split()[1])

        raise NodeError("Could not find likelihood value in log-file %r; "
                        "looking for line starting with 'Loglikelihood:'"
                        % (filename,))
+
+
class AdmixturePlotNode(CommandNode):
    """Node plotting ADMIXTURE proportions via the bundled 'admixture.r'
    script, with samples ordered per 'order' followed by the analyzed
    sample ("Sample")."""

    def __init__(self, input_file, output_prefix, order, samples,
                 dependencies=()):
        self._samples = samples
        # Reference samples first (in the given order), the analyzed
        # sample always last.
        self._order = tuple(order) + ("Sample",)

        script = rtools.rscript("zonkey", "admixture.r")

        cmd = AtomicCmd(("Rscript", script, "%(IN_FILE)s",
                         "%(TEMP_OUT_NAMES)s", "%(TEMP_OUT_PREFIX)s"),
                        AUX_RSCRIPT=script,
                        IN_FILE=input_file,
                        IN_SAMPLES=samples,
                        OUT_PDF=output_prefix + ".pdf",
                        OUT_PNG=output_prefix + ".png",
                        TEMP_OUT_NAMES="samples.txt",
                        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                        CHECK_R=RSCRIPT_VERSION,
                        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
                        CHECK_R_RESHAPE2=rtools.requirement("reshape2"),
                        set_cwd=True)

        CommandNode.__init__(self,
                             description="<AdmixturePlot -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        # Re-write the sample table in the desired plotting order; the
        # input table's row order is not guaranteed to match self._order.
        samples = {}
        with open(self._samples) as handle:
            header = handle.readline().strip().split('\t')
            for line in handle:
                row = dict(zip(header, line.strip().split('\t')))
                samples[row["Name"]] = row

        with open(os.path.join(temp, "samples.txt"), "w") as handle:
            handle.write("{}\n".format("\t".join(header)))

            for name in self._order:
                row = samples[name]
                handle.write("{}\n".format("\t".join(row[key]
                                                     for key in header)))

        CommandNode._setup(self, config, temp)
+
+
class BuildFreqFilesNode(CommandNode):
    """Node running 'plink --freq --missing' to compute per-sample
    stratified allele frequencies, which are then gzip compressed."""

    def __init__(self, input_prefix, output_prefix, tfam,
                 parameters=None, dependencies=()):
        basename = os.path.basename(output_prefix)

        plink_cmd = ["plink", "--freq", "--missing", "--noweb",
                     "--bfile", os.path.abspath(input_prefix),
                     "--within", "%(TEMP_OUT_CLUST)s",
                     "--out", "%(TEMP_OUT_PREFIX)s"]

        if parameters:
            plink_cmd.extend(parameters.split())

        plink = AtomicCmd(plink_cmd,
                          IN_BED=input_prefix + ".bed",
                          IN_BIM=input_prefix + ".bim",
                          IN_FAM=input_prefix + ".fam",
                          TEMP_OUT_CLUST="samples.clust",
                          OUT_NOSEX=output_prefix + ".frq.strat.nosex",
                          OUT_LOG=output_prefix + ".frq.strat.log",
                          TEMP_OUT_PREFIX=basename,
                          CHECK_VERSION=PLINK_VERSION,
                          set_cwd=True)

        gzip = AtomicCmd(["gzip", "%(TEMP_IN_FREQ)s"],
                         TEMP_IN_FREQ=basename + ".frq.strat",
                         OUT_FREQ=output_prefix + ".frq.strat.gz")

        self._tfam = tfam
        self._basename = basename
        # NOTE: Original description was missing the closing "'>"; fixed
        # for consistency with the other nodes in this module.
        CommandNode.__init__(self,
                             description="<BuildFreqFiles -> '%s.*'>"
                                         % (output_prefix,),
                             command=SequentialCmds((plink, gzip)),
                             dependencies=dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)

        # Write a cluster file assigning each sample to its own cluster,
        # so that frequencies are stratified per sample.
        with open(self._tfam) as in_handle:
            samples = [line.split(None, 1)[0] for line in in_handle]

        with open(os.path.join(temp, "samples.clust"), "w") as handle:
            for sample in samples:
                handle.write("{0} {0} {0}\n".format(sample))

    def _teardown(self, config, temp):
        # Rename log/nosex files to match the registered output filenames.
        for ext in ("log", "nosex"):
            os.rename(os.path.join(temp, self._basename + "." + ext),
                      os.path.join(temp, self._basename + ".frq.strat." + ext))

        CommandNode._teardown(self, config, temp)
+
+
class FreqToTreemixNode(Node):
    """Node converting a (gzipped) 'plink --freq --within' table into the
    allele-count matrix format expected by TreeMix: one header line of
    sample names, then one "mac,other" count pair per sample per SNP."""

    def __init__(self, input_file, output_file, dependencies=()):
        # NOTE: Original description was missing the closing ">"; fixed
        # for consistency with the other nodes in this module.
        Node.__init__(self,
                      description="<FreqToTreemix -> %r>" % (output_file,),
                      input_files=(input_file,),
                      output_files=(output_file,),
                      dependencies=dependencies)

    def _run(self, _config, temp):
        input_file, = self.input_files
        output_file, = self.output_files
        temp_filename = os.path.basename(output_file)

        with open(os.path.join(temp, temp_filename), "w") as handle:
            header = None
            table = self._parse_freq_table(input_file)

            # Rows are grouped per SNP (chrom, snp), with one row per
            # cluster (i.e. per sample; see BuildFreqFilesNode).
            for _, rows in itertools.groupby(table, lambda row: row[:2]):
                if header is None:
                    rows = tuple(rows)  # Must not consume iterator
                    header = list(sorted(row[2] for row in rows))
                    handle.write("%s\n" % (" ".join(header)))

                result = []
                rows = dict((row[2], row) for row in rows)
                for sample in header:
                    # TreeMix expects "minor-count,major-count" pairs.
                    _, _, _, mac, nchroms = rows[sample]
                    result.append("%s,%i" % (mac, int(nchroms) - int(mac)))

                handle.write("%s\n" % (" ".join(result),))

    def _teardown(self, config, temp):
        # Move the finished table into place atomically.
        output_file, = self.output_files
        temp_file = os.path.join(temp, os.path.basename(output_file))

        fileutils.move_file(temp_file, output_file)
        Node._teardown(self, config, temp)

    @classmethod
    def _parse_freq_table(cls, filename):
        """Yield (chrom, snp, cluster, mac, nchroms) tuples from a
        (possibly gzipped) .frq.strat table, skipping the header line."""
        with fileutils.open_ro(filename) as handle:
            handle.readline()  # Skip header

            for line in handle:
                chrom, snp, clst, _, _, _, mac, nchroms = line.split()

                yield (chrom, snp, clst, int(mac), int(nchroms))
+
+
class TreemixNode(CommandNode):
    """Node running TreeMix on a TreeMix-formatted frequency matrix.

    'k' (SNPs per block) may be a fixed integer, or a (field-name,
    summary-file) string pair, in which case a suitable block size is
    computed at run-time from the observed SNP density (see _setup). A
    human-readable parameter file and a parameter-hash marker file are
    written in _teardown, so reruns can detect parameter changes.
    """

    def __init__(self, data, input_file, output_prefix, m=0, k=100,
                 outgroup=(), dependencies=()):
        call = ["treemix",
                "-i", "%(IN_FILE)s",
                "-o", "%(TEMP_OUT_PREFIX)s",
                "-global",
                "-m", m]

        if outgroup:
            call.extend(("-root", ",".join(outgroup)))

        self._param_m = m
        self._param_outgroup = outgroup
        self._params_file = output_prefix + ".parameters.txt"

        if isinstance(k, int):
            # Fixed block size, passed directly on the command-line.
            call.extend(("-k", k))
            self._param_k = k
            self._k_file = self._k_field = None
        elif isinstance(k, tuple) and all(isinstance(v, str) for v in k):
            # Block size to be derived in _setup from the summary file.
            self._k_field, self._k_file = k
            self._genome_size = sum(value["Size"]
                                    for value in data.contigs.itervalues())
            self._snp_distance = data.settings["SNPDistance"]
        else:
            raise ValueError("k must be int or (key, path) in TreemixNode")

        # Marker filename encodes all parameters, so that changing any
        # parameter invalidates previously generated output.
        self._parameters_hash \
            = "%s.%s" % (output_prefix,
                         hash_params(k=k, m=m, global_set=True,
                                     outgroup=tuple(sorted(outgroup))))

        cmd = AtomicCmd(call,
                        IN_FILE=input_file,
                        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                        OUT_FILE_COV=output_prefix + ".cov.gz",
                        OUT_FILE_COVSE=output_prefix + ".covse.gz",
                        OUT_FILE_EDGES=output_prefix + ".edges.gz",
                        OUT_FILE_LLIK=output_prefix + ".llik",
                        OUT_FILE_MODELCOV=output_prefix + ".modelcov.gz",
                        OUT_FILE_TREEOUT=output_prefix + ".treeout.gz",
                        OUT_FILE_VERTICES=output_prefix + ".vertices.gz",
                        OUT_FILE_PARAMS=self._params_file,
                        OUT_FILE_PARAMS_HASH=self._parameters_hash,
                        CHECK_VERSION=TREEMIX_VERSION,
                        set_cwd=True)

        CommandNode.__init__(self,
                             description="<Treemix -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        if self._k_file is not None:
            # Derive the block size from the average distance between
            # SNPs, aiming for blocks of roughly 'SNPDistance' bp.
            stats = read_summary(self._k_file)
            n_sites = float(stats[self._k_field])
            k = max(1, int(math.ceil(self._snp_distance /
                                     (self._genome_size / n_sites))))

            self._param_k = k
            # HACK: reaches into the private command list to append "-k";
            # the value is only known after reading the summary file.
            self._command._command.extend(("-k", str(k)))

        CommandNode._setup(self, config, temp)

    def _teardown(self, config, temp):
        # Record the parameters actually used, plus an (empty) marker file
        # whose name encodes the parameter hash.
        with open(fileutils.reroot_path(temp, self._params_file), "w") as out:
            out.write("k: %i\n" % (self._param_k,))
            out.write("m: %i\n" % (self._param_m,))
            out.write("outgroup: %r\n" % (list(self._param_outgroup),))

        open(fileutils.reroot_path(temp, self._parameters_hash), "w").close()

        CommandNode._teardown(self, config, temp)
+
+
class PlotTreemixNode(CommandNode):
    """Node producing three visualizations of a TreeMix run via the
    bundled 'treemix.r' script: the tree with migration edges, a heatmap
    of residuals, and a text file with the variance explained."""

    def __init__(self, samples, prefix, output_prefix, dependencies=()):
        abs_prefix = os.path.abspath(prefix)
        basename = os.path.basename(output_prefix)

        # TreeMix plots with migration edges
        cmd_1 = self._plot_command(prefix, "plot_tree", abs_prefix,
                                   "%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s",
                                   IN_SAMPLES=samples,
                                   TEMP_OUT_PREFIX=basename + "_tree",
                                   OUT_PDF=output_prefix + "_tree.pdf",
                                   OUT_PNG=output_prefix + "_tree.png")

        # Heatmap showing TreeMix residuals
        cmd_2 = self._plot_command(prefix, "plot_residuals", abs_prefix,
                                   "%(IN_SAMPLES)s", "%(TEMP_OUT_PREFIX)s",
                                   IN_SAMPLES=samples,
                                   TEMP_OUT_PREFIX=basename + "_residuals",
                                   OUT_PDF=output_prefix + "_residuals.pdf",
                                   OUT_PNG=output_prefix + "_residuals.png")

        # Text file containing % of variance explained by model
        cmd_3 = self._plot_command(prefix, "variance", abs_prefix,
                                   "%(OUT_TXT)s",
                                   OUT_TXT=output_prefix + "_variance.txt")

        CommandNode.__init__(self,
                             description="<PlotTreemix -> '%s.*'>"
                                         % (output_prefix,),
                             command=SequentialCmds((cmd_1, cmd_2, cmd_3)),
                             dependencies=dependencies)

    @classmethod
    def _plot_command(cls, input_prefix, *args, **kwargs):
        """Return an AtomicCmd invoking 'treemix.r' with the given extra
        arguments; all TreeMix output files are declared as inputs."""
        script = rtools.rscript("zonkey", "treemix.r")

        return AtomicCmd(("Rscript", script) + args,
                         AUX_RSCRIPT=script,
                         IN_FILE_COV=input_prefix + ".cov.gz",
                         IN_FILE_COVSE=input_prefix + ".covse.gz",
                         IN_FILE_EDGES=input_prefix + ".edges.gz",
                         IN_FILE_MODELCOV=input_prefix + ".modelcov.gz",
                         IN_FILE_VERTICES=input_prefix + ".vertices.gz",
                         CHECK_R=RSCRIPT_VERSION,
                         CHECK_R_BREW=rtools.requirement("RColorBrewer"),
                         set_cwd=True,
                         **kwargs)
+
+
class SmartPCANode(CommandNode):
    """Node running EIGENSOFT smartpca on BED/BIM/FAM files; the
    parameter file required by smartpca is generated in _setup."""

    def __init__(self, input_prefix, output_prefix, nchroms, dependencies=()):
        self._input_prefix = input_prefix
        self._output_prefix = output_prefix
        self._nchroms = nchroms

        cmd = AtomicCmd(("smartpca", "-p", "%(TEMP_OUT_PARAMS)s"),
                        TEMP_OUT_PARAMS="parameters.txt",
                        IN_FILE_BED=input_prefix + ".bed",
                        IN_FILE_BIM=input_prefix + ".bim",
                        IN_FILE_FAM=input_prefix + ".fam",
                        OUT_STDOUT=output_prefix + ".log",
                        OUT_EVEC=output_prefix + ".evec",
                        OUT_EVAL=output_prefix + ".eval",
                        OUT_SNPS=output_prefix + ".deleted_snps",
                        CHECK_VERSION=SMARTPCA_VERSION,
                        set_cwd=True)

        # NOTE: Original description was missing the closing quote
        # ("'%s.*>"); fixed for consistency with the other nodes.
        CommandNode.__init__(self,
                             description="<SmartPCA -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        CommandNode._setup(self, config, temp)

        # Generate the smartpca parameter file; inputs are absolute paths
        # since the command runs with the temp dir as working directory.
        with open(os.path.join(temp, "parameters.txt"), "w") as handle:
            handle.write("""
genotypename: {input_prefix}.bed
snpname: {input_prefix}.bim
indivname: {input_prefix}.fam
evecoutname: {output_prefix}.evec
evaloutname: {output_prefix}.eval
deletsnpoutname: {output_prefix}.deleted_snps
altnormstyle: NO
numoutevec: 5
familynames: YES
numoutlieriter: 1
numchrom: {nchroms}
numthreads: 1
""".format(input_prefix=os.path.abspath(self._input_prefix),
           output_prefix=os.path.basename(self._output_prefix),
           nchroms=self._nchroms))

    def _teardown(self, config, temp):
        # Ensure that this file exists even when no filtered SNPs
        deleted_snps = os.path.basename(self._output_prefix) + ".deleted_snps"
        open(os.path.join(temp, deleted_snps), "a").close()

        CommandNode._teardown(self, config, temp)
+
+
class PlotPCANode(CommandNode):
    """Node plotting smartpca results (PDF + PNG) via the bundled
    'pca.r' script."""

    def __init__(self, samples, prefix, output_prefix, dependencies=()):
        script = rtools.rscript("zonkey", "pca.r")

        cmd = AtomicCmd(["Rscript", script,
                         os.path.abspath(prefix),
                         "%(IN_SAMPLES)s",
                         "%(TEMP_OUT_PREFIX)s"],
                        AUX_SCRIPT=script,
                        IN_FILE_EVAL=prefix + ".eval",
                        IN_FILE_EVEC=prefix + ".evec",
                        IN_SAMPLES=samples,
                        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                        OUT_PDF=output_prefix + ".pdf",
                        OUT_PNG=output_prefix + ".png",
                        CHECK_R=RSCRIPT_VERSION,
                        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
                        CHECK_R_LABELS=rtools.requirement("ggrepel"),
                        set_cwd=True)

        CommandNode.__init__(self,
                             description="<PlotPCA -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd,
                             dependencies=dependencies)
+
+
class PlotCoverageNode(CommandNode):
    """Node plotting per-contig coverage via the bundled 'coverage.r'
    script, using 'samtools idxstats' counts for the input BAM; contig
    sizes are cross-checked against the database in _setup."""

    def __init__(self, contigs, input_file, output_prefix, dependencies=()):
        # {name: {"Size": ..., "Ns": ...}} table from the database.
        self._contigs = contigs
        self._input_file = input_file

        script = rtools.rscript("zonkey", "coverage.r")
        cmd = AtomicCmd(("Rscript", script,
                         "%(TEMP_OUT_TABLE)s", "%(TEMP_OUT_PREFIX)s"),
                        AUX_RSCRIPT=script,
                        IN_FILE=input_file,
                        TEMP_OUT_TABLE="contigs.table",
                        OUT_PDF=output_prefix + ".pdf",
                        OUT_PNG=output_prefix + ".png",
                        TEMP_OUT_PREFIX=os.path.basename(output_prefix),
                        CHECK_R=RSCRIPT_VERSION,
                        CHECK_R_GGPLOT2=rtools.requirement("ggplot2"),
                        set_cwd=True)

        CommandNode.__init__(self,
                             description="<CoveragePlot -> '%s.*'>"
                                         % (output_prefix,),
                             command=cmd,
                             dependencies=dependencies)

    def _setup(self, config, temp):
        # Build the input table for the R script: one row per autosome or
        # X chromosome present in both the BAM and the database.
        with open(os.path.join(temp, "contigs.table"), "w") as handle:
            handle.write("ID\tSize\tNs\tHits\n")

            # Workaround for pysam < 0.9 returning list, >= 0.9 returning str
            for line in "".join(pysam.idxstats(self._input_file)).split('\n'):
                line = line.strip()
                if not line:
                    continue

                name, size, hits, _ = line.split('\t')
                name = contig_name_to_plink_name(name)
                if name is None or not (name.isdigit() or name == 'X'):
                    continue
                elif name not in self._contigs:
                    # Excluding contigs is allowed
                    continue

                # Guard against plotting against a mismatched reference.
                if int(size) != self._contigs[name]['Size']:
                    raise NodeError("Size mismatch between database and BAM; "
                                    "expected size %i, found %i for contig %r"
                                    % (int(size), self._contigs[name]['Size'],
                                       name))

                row = {
                    'ID': name,
                    'Size': self._contigs[name]['Size'],
                    'Ns': self._contigs[name]['Ns'],
                    'Hits': hits,
                }

                handle.write('{ID}\t{Size}\t{Ns}\t{Hits}\n'.format(**row))

        CommandNode._setup(self, config, temp)
+
+
def hash_params(*args, **kwargs):
    """Return a stable MD5 hex digest of the given parameters.

    Used to name marker files (see TreemixNode) so that output is
    invalidated when parameters change. The repr() is encoded explicitly,
    since hashlib requires bytes on Python 3; this produces identical
    digests on Python 2, where the repr is already an ASCII str.
    """
    return hashlib.md5(repr([args, kwargs]).encode("utf-8")).hexdigest()
diff --git a/paleomix/tools/zonkey/parts/report.py b/paleomix/tools/zonkey/parts/report.py
new file mode 100644
index 0000000..c454020
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/report.py
@@ -0,0 +1,869 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import copy
+import os
+
+import pysam
+
+import paleomix
+import paleomix.resources
+
+from paleomix.node import Node
+
+import paleomix.common.fileutils as fileutils
+
+import paleomix.tools.zonkey.parts.admixture as admixture
+import paleomix.tools.zonkey.parts.nuclear as nuclear
+
+from paleomix.tools.zonkey.common import \
+ RSCRIPT_VERSION, \
+ read_summary
+
+from paleomix.nodes.samtools import SAMTOOLS_VERSION
+from paleomix.nodes.raxml import RAXML_VERSION
+
+
class ReportNode(Node):
    """Generates the per-sample 'report.html' (plus 'report.css') for a
    Zonkey analysis, covering admixture, PCA, TreeMix, and mitochondrial
    results, depending on which input BAMs were provided."""

    def __init__(self, config, root, has_nuc, has_mt, dependencies=()):
        """
        Arguments:
          config -- Config object generated using paleomix.tools.zonkey.config.
          root -- Root folder containing current analysis.
          has_nuc -- True if a nuclear BAM was provided.
          has_mt -- True if a mitochondrial BAM was provided.
          dependencies -- Nodes for ReportNode to depend on.
        """

        self._root = root
        self._data = copy.deepcopy(config.database)
        self._report = AnalysisReport(config, root, has_nuc, has_mt)
        self._has_nuc = bool(has_nuc)
        self._has_mt = bool(has_mt)

        self._treemix_outgroup = config.treemix_outgroup
        self._treemix_k = config.treemix_k
        if self._treemix_k is None:
            # Shown in the report when TreeMix selected k automatically
            self._treemix_k = '<automatic>'

        Node.__init__(self,
                      description="<Report -> %r>"
                      % (os.path.join(self._root, "report.html"),),
                      input_files=self._report.input_files(),
                      output_files=(os.path.join(self._root, "report.html"),
                                    os.path.join(self._root, "report.css")),
                      dependencies=dependencies)

    def _run(self, _config, temp):
        # The report is written to the temporary folder and only moved into
        # place by _teardown, so failed runs do not leave partial output.
        with open(os.path.join(temp, "report.html"), "w") as output_handle:
            revision = self._data.settings['Revision']
            header = _HTML_HEADER.format(Version=paleomix.__version__,
                                         Database=revision,
                                         Sidebar=self._build_sidebar())
            output_handle.write(header)

            self._write_intro_and_overview(output_handle)
            self._write_sample_description(output_handle)

            if self._has_nuc:
                self._write_admixture_estimates(output_handle)
                self._write_pca_plots(output_handle)
                self._write_treemix_plots(output_handle)

            if self._has_mt:
                self._write_mitochondrial_plots(output_handle)

            self._write_references(output_handle)

            output_handle.write(_HTML_FOOTER)

    def _teardown(self, config, temp):
        fileutils.make_dirs(self._root)

        fileutils.move_file(os.path.join(temp, "report.html"),
                            os.path.join(self._root, "report.html"))

        css_path = paleomix.resources.report("zonkey", "report.css")
        fileutils.copy_file(css_path, os.path.join(self._root, "report.css"))

    def _write_intro_and_overview(self, output_handle):
        """Writes the introduction and the tool/version overview, including
        nuclear / mitochondrial summary tables when available."""
        output_handle.write(_SECTION_HEADER.format(name="intro",
                                                   title="Introduction"))
        output_handle.write(_INTRODUCTION)

        output_handle.write(_SECTION_HEADER.format(name="overview",
                                                   title="Analysis overview"))

        revision = self._data.settings['Revision']
        overview = _OVERVIEW_HEADER.format(DATABASE=revision,
                                           PYSAM=pysam.__version__,
                                           SAMTOOLS=_fmt_v(SAMTOOLS_VERSION),
                                           PLINK=_fmt_v(nuclear.PLINK_VERSION),
                                           RSCRIPT=_fmt_v(RSCRIPT_VERSION))
        output_handle.write(overview)

        if self._has_nuc:
            summary = self._report.snp_summary()
            output_handle.write(_OVERVIEW_NUCLEAR % summary)

        if self._has_mt:
            summary = self._report.mito_summary()
            output_handle.write(_OVERVIEW_MITOCHONDRIA % summary)

        if self._has_nuc:
            output_handle.write(_OVERVIEW_NUCLEAR_COVERAGE)

        output_handle.write(_OVERVIEW_FOOTER)

    def _write_sample_description(self, output_handle):
        """Writes the table of reference-panel samples, sorted and visually
        grouped by the 2-group and 3-group labels."""
        output_handle.write(_SECTION_HEADER.format(name="samples",
                                                   title="Reference Panel"))

        output_handle.write(_SAMPLE_LIST_HEADER)

        last_group_2 = None
        last_group_3 = None
        # .values() rather than .itervalues() for Python 2/3 compatibility
        for row in sorted(self._data.samples.values(),
                          key=lambda row: (row["Group(2)"],
                                           row["Group(3)"],
                                           row["ID"])):
            row = dict(row)
            # Only print group labels on the first row of each group
            if last_group_2 != row["Group(2)"]:
                last_group_2 = row["Group(2)"]
                last_group_3 = row["Group(3)"]
            else:
                row["Group(2)"] = ""

            if last_group_3 != row["Group(3)"]:
                last_group_3 = row["Group(3)"]
            else:
                row["Group(3)"] = ""

            # Turn URLs / DOIs into links
            pub = row["Publication"]
            if pub.startswith("http"):
                row["Publication"] \
                    = '<a href="{0}">Link</a>'.format(pub.strip())
            elif row["Publication"].startswith("doi:"):
                pub = pub[4:].strip()
                url = "https://doi.org/{}".format(pub)
                row["Publication"] \
                    = 'doi:<a href="{0}">{1}</a>'.format(url, pub)

            output_handle.write(_SAMPLE_LIST_ROW.format(**row))

        output_handle.write(" </table>\n")

    def _write_admixture_estimates(self, output_handle):
        """Writes admixture results for k=2 and k=3, with and without
        transitions."""
        header = _SECTION_HEADER.format(name="admixture",
                                        title="Admixture Estimates")
        output_handle.write(header)

        admixture_v = _fmt_v(nuclear.ADMIXTURE_VERSION)
        overview = _ADMIXTURE_OVERVIEW.format(ADMIXTURE=admixture_v)
        output_handle.write(overview)

        for k_groups in (2, 3):
            summary_incl = self._build_admixture_cell(k_groups, True)
            summary_excl = self._build_admixture_cell(k_groups, False)

            output_handle.write(_ADMIXTURE_ROW.format(K=k_groups,
                                                      Incl_TS=summary_incl,
                                                      Excl_TS=summary_excl))

    def _build_admixture_cell(self, k_groups, incl_ts,
                              cutoff=admixture.CUTOFF):
        """Returns the HTML summary for one admixture run; either an error,
        'no admixture', or a list of contributing groups, optionally with
        percentiles derived from simulated F1 hybrids."""
        try:
            groups = self._report.admixture_results(k_groups, incl_ts)
        except admixture.AdmixtureError as error:
            # BUG FIX: message previously contained a stray, unmatched
            # '</strong' fragment
            return _warn("ERROR: {}".format(error))

        n_admixture_candidates = sum((value >= cutoff) for _, value in groups)
        if n_admixture_candidates < 2:
            return "<strong>No admixture detected.</strong>"

        lines = [
            "<strong>Possible hybridization detected:</strong>",
            "<ul>",
        ]

        for group, value in groups:
            if value >= cutoff:
                name = " / ".join(sorted(group))

                lines.append(" <li>%s (%.2f%%)</li>" % (name, value * 100))

        lines.append("</ul>")

        if n_admixture_candidates != 2:
            # Percentiles are only available for two-way (F1) hybrids
            lines.append(_warn('WARNING: %s-way admixture detected; this may '
                               'indicate a false-positive result!'
                               % (n_admixture_candidates,)))
            return "\n ".join(lines)

        percentiles = self._report.admixture_percentiles(data=self._data,
                                                         k_groups=k_groups,
                                                         incl_ts_k=incl_ts)

        # admixture_percentiles may return None (previously caused a
        # TypeError on the membership tests) or a dict missing either bound
        if not (percentiles and
                ('Lower' in percentiles or 'Upper' in percentiles)):
            lines.append(_warn("WARNING: Could not determine percentiles."))

            return "\n ".join(lines)

        if 'Lower' not in percentiles:
            percentiles['Lower'] = percentiles['Upper']
            finale = \
                '; note that this is more simulated reads than what was ' \
                'processed in this analysis, potentially resulting in an ' \
                'overestimating of percentages.'
        elif 'Upper' not in percentiles:
            percentiles['Upper'] = percentiles['Lower']
            finale = \
                '; note that this is fewer simulated reads than what was ' \
                'processed in this analysis, potentially resulting in an ' \
                'underestimating of percentages.'
        else:
            finale = '.'

        lower_pct = "%.1f" % ((1.0 - max(percentiles['Lower']['Upper'],
                                         percentiles['Upper']['Lower'])) *
                              100.0,)
        upper_pct = "%.1f" % ((1.0 - min(percentiles['Lower']['Upper'],
                                         percentiles['Upper']['Lower'])) *
                              100.0,)

        pct_range = lower_pct
        if lower_pct != upper_pct:
            pct_range = '%s - %s' % (lower_pct, upper_pct)

        lower_reads = min(percentiles['Lower']['NReads'],
                          percentiles['Upper']['NReads'])
        upper_reads = max(percentiles['Lower']['NReads'],
                          percentiles['Upper']['NReads'])

        reads_range = lower_reads
        if lower_reads != upper_reads:
            reads_range = '%s to %s' % (lower_reads, upper_reads)

        lines.append('Admixture results fall within %s percent of those '
                     'observed for simulated F1 %s / %s hybrids, based on '
                     '%s randomly selected reads%s'
                     % (pct_range,
                        percentiles['Sample1'],
                        percentiles['Sample2'],
                        reads_range, finale))

        return "\n ".join(lines)

    def _write_pca_plots(self, output_handle):
        output_handle.write(_SECTION_HEADER.format(name="pca",
                                                   title="PCA Plots"))

        smartpca_v = _fmt_v(nuclear.SMARTPCA_VERSION)
        output_handle.write(_PCA_SECTION.format(SMARTPCA=smartpca_v))

    def _write_treemix_plots(self, output_handle):
        """Writes TreeMix trees / residuals for 0 and 1 migration edges,
        including the model variance read from the *_variance.txt files."""
        output_handle.write(_SECTION_HEADER.format(name="treemix",
                                                   title="Treemix Plots"))

        outgroups = ""
        if self._treemix_outgroup:
            outgroups = ", ".join(map(repr, self._treemix_outgroup))
            outgroups = ". The tree was rooted on the clade containing the " \
                        "sample(s) %s" % (outgroups)

        treemix_v = _fmt_v(nuclear.TREEMIX_VERSION)
        overview = _TREEMIX_OVERVIEW.format(treemix_k=self._treemix_k,
                                            treemix_outgroup=outgroups,
                                            TREEMIX=treemix_v)
        output_handle.write(overview)

        for prefix in ("incl_ts", "excl_ts"):
            output_handle.write("<h2>%s</h2>\n" % (_TS_LABELS[prefix],))

            for n_edges in (0, 1):
                variance_file = os.path.join(self._root,
                                             "figures",
                                             "treemix",
                                             "%s_%i_variance.txt")

                with open(variance_file % (prefix, n_edges)) as handle:
                    variance = handle.read().strip()

                treemix_row = _TREEMIX_TREE_ROW.format(Prefix=prefix,
                                                       Edges=n_edges,
                                                       Variance=variance)
                output_handle.write(treemix_row)

    def _write_mitochondrial_plots(self, output_handle):
        header = _SECTION_HEADER.format(name="mitochondria",
                                        title="Mitochondrial Phylogeny")
        output_handle.write(header)
        raxml_v = _fmt_v(RAXML_VERSION)
        output_handle.write(_MITOCONDRIAL_SECTION.format(RAXML=raxml_v))

    def _write_references(self, output_handle):
        header = _SECTION_HEADER.format(name="references",
                                        title="References")
        output_handle.write(header)
        output_handle.write(_REFERENCES)

    def _build_sidebar(self):
        """Returns the sidebar HTML, including only sections that will
        actually be written for this analysis."""
        lines = [_SIDEBAR_HEADER]

        if self._has_nuc:
            lines.append(_SIDEBAR_NUCLEAR)

        if self._has_mt:
            lines.append(_SIDEBAR_MITO)

        lines.append(_SIDEBAR_FOOTER)

        return "\n".join(lines)
+
+
class AnalysisReport(object):
    """Collects the input files and summary statistics underlying a single
    Zonkey analysis; used by both the per-sample ReportNode and the
    multi-sample summary."""

    def __init__(self, config, root, has_nuc, has_mt):
        # has_nuc / has_mt -- whether nuclear / mitochondrial BAMs were given
        self._has_nuc = bool(has_nuc)
        self._has_mt = bool(has_mt)
        # True if the SNP panel was pruned (--indep*); changes snp_summary
        self._filtered = bool(config.indep)
        self._config = config
        self._root = root
        self._data = config.database

    def input_files(self):
        """Returns the list of result files required to build the report."""
        input_files = [self._config.tablefile]
        if self._has_nuc:
            # Summary file generated while building TPED files
            input_files.append(os.path.join(self._root,
                                            "results",
                                            "plink",
                                            "common.summary"))

            for postfix in ('incl_ts', 'excl_ts'):
                admix_root = os.path.join(self._root, "results", "admixture")

                if self._filtered:
                    # Required to count number of SNPs included after filtering
                    input_files.append(os.path.join(self._root,
                                                    "results",
                                                    "plink",
                                                    postfix + ".bim"))

                # Include files showing proportions of ancestral populations,
                # which are required to build admixture figures in the reports.
                for k_groups in (2, 3):
                    input_files.append(os.path.join(admix_root,
                                                    "%s.%i.Q" % (postfix,
                                                                 k_groups)))

                # Include files tabulating variance explained by models
                figures_path = os.path.join(self._root, "figures", "treemix")
                for n_edges in (0, 1):
                    variance_path = os.path.join(figures_path,
                                                 "%s_%i_variance.txt"
                                                 % (postfix, n_edges))

                    input_files.append(variance_path)

        if self._has_mt:
            input_files.append(os.path.join(self._root,
                                            "results",
                                            "mitochondria",
                                            "sequences.summary"))

        return input_files

    def snp_summary(self):
        """Returns the nuclear SNP summary; when the panel was filtered, the
        site counts are rewritten as 'used of total'."""
        summary = read_summary(os.path.join(self._root,
                                            "results",
                                            "plink",
                                            "common.summary"))

        if self._filtered:
            for postfix in ('incl_ts', 'excl_ts'):
                key = "n_sites_%s" % (postfix,)

                # BUG FIX: the .bim files are located under 'results/plink'
                # (as declared in input_files); 'results' was missing here
                filename = os.path.join(self._root, "results", "plink",
                                        postfix + ".bim")
                with open(filename) as handle:
                    n_used = sum(1 for _ in handle)

                summary[key] = "%i of %i" % (n_used, summary[key])

        return summary

    def mito_summary(self):
        """Returns the mitochondrial alignment summary."""
        return read_summary(os.path.join(self._root,
                                         "results",
                                         "mitochondria",
                                         "sequences.summary"))

    def admixture_results(self, k_groups, incl_ts,
                          cutoff=admixture.CUTOFF):
        """Returns parsed ADMIXTURE results for the given k and SNP panel;
        raises admixture.AdmixtureError on failed / missing runs."""
        prefix = "incl_ts" if incl_ts else "excl_ts"
        filename = os.path.join(self._root, "results", "admixture",
                                "%s.%i.Q" % (prefix, k_groups))

        return admixture.read_admixture_results(filename=filename,
                                                data=self._config.database,
                                                k_groups=k_groups,
                                                cutoff=cutoff)

    def admixture_percentiles(self, data, k_groups, incl_ts_k,
                              cutoff=admixture.CUTOFF):
        """Returns percentiles from simulated F1 hybrids for a two-way
        admixture result, or None if the result is not a two-way hybrid or
        the admixture run failed."""
        try:
            results = self.admixture_results(k_groups, incl_ts_k, cutoff)
        except admixture.AdmixtureError:
            return

        groups = [group for group, value in results if value >= cutoff]
        if len(groups) != 2:
            # Percentiles are only meaningful for exactly two contributors
            return

        sample1, = groups[0]
        sample2, = groups[1]
        delta = abs(max(value for _, value in results) - 0.5)
        summary = read_summary(os.path.join(self._root,
                                            "results",
                                            "plink",
                                            "common.summary"))

        return admixture.get_percentiles(data=data,
                                         sample1=sample1,
                                         sample2=sample2,
                                         nreads=summary['n_reads_used'],
                                         k_groups=k_groups,
                                         has_ts=incl_ts_k,
                                         value=delta)
+
+
+def _fmt_v(requirement):
+ return ".".join(map(str, requirement.version))
+
+
+def _warn(text):
+ return "<div class='warning'>%s</div>" % (text,)
+
+
+###############################################################################
+
+_TS_LABELS = {
+ "incl_ts": "Including transitions",
+ "excl_ts": "Excluding transitions",
+}
+
+
+###############################################################################
+
+_HTML_HEADER = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+<title>PALEOMIX Zonkey v{Version} - db rev. {Database}</title>
+<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+<link href="report.css" rel="stylesheet" type="text/css" />
+</head>
+<body>
+<a name="top" id="top"></a>
+<center>
+ <div id="header">
+ <h1>PALEOMIX Zonkey v{Version} - db rev. {Database}</h1>
+ <h2>A pipeline for detection of F1 hybrids in equids.</h2>
+ </div>
+ <div id="content">
+ <p class="introduction">
+ Schubert M, Ermini L, Sarkissian CD, Jónsson H, Ginolhac A,
+ Schaefer R, Martin MD, Fernández R, Kircher M, McCue M,
+ Willerslev E, and Orlando L. "<strong>Characterization of ancient and
+ modern genomes by SNP detection and phylogenomic and metagenomic analysis
+ using PALEOMIX</strong>". Nat Protoc. 2014 May;9(5):1056-82. doi:<a
+ href="https://doi.org/10.1038/nprot.2014.063">10.1038/nprot.2014.063</a>.
+ Epub 2014 Apr 10. PubMed PMID: <a
+ href="http://www.ncbi.nlm.nih.gov/pubmed/24722405">24722405</a>.
+ </p>
+{Sidebar}
+ <div id="mainbar">
+"""
+
+_SECTION_HEADER = """ <h1><a name="{name}" id="{name}"></a>{title}</h1>
+"""
+
+_INTRODUCTION = """
+ <div>
+ <div>
+ The Zonkey Pipeline is a easy-to-use pipeline designed for the
+ analyses of low-coverage, ancient DNA derived from historical
+ equid samples, with the purpose of determining the species of
+ the sample, as well as determining possible hybridization between
+ horses, zebras, and asses. This is accomplished by comparing one
+ or more samples aligned against the <em>Equus caballus</em> 2.0
+ reference sequence with a reference panel of modern equids,
+ including wild and domesticated equids.
+ </div>
+ <br/>
+ <div>
+ For more information, please refer to the
+ <a href="http://paleomix.readthedocs.org/en/latest/zonkey_pipeline/index.html">
+ the documentation for the Zonkey pipeline
+ </a>
+ or
+ <a href="http://paleomix.readthedocs.org/en/latest/">
+ the documentation for the PALEOMIX pipeline,
+ </a>
+ on which the Zonkey pipeline is based.
+ </div>
+ <br/>
+"""
+
+_OVERVIEW_HEADER = """
+ <div>
+ Zonkey run using database rev. {DATABASE}. Data processed using
+ <a href="https://github.com/pysam-developers/pysam">pysam</a> v{PYSAM},
+ <a href="https://samtools.github.io/">SAMTools</a> v{SAMTOOLS}
+ [<em>Li <em>et al.</em> 2009</em>] and
+ <a href="http://pngu.mgh.harvard.edu/purcell/plink/">PLINK</a> v{PLINK}
+ [<em>Purcell <em>et al.</em> 2007</em>]; plotting was carried out using
+ <a href="https://www.r-project.org/">R</a> v{RSCRIPT}. Additional
+ tools listed below.
+ </div>
+ <br/>
+ <div style="display:table;width:100%;">
+ <div style="display:table-cell;">
+"""
+
+_OVERVIEW_NUCLEAR = """
+ <strong>Nuclear report from '<em>%(filename)s</em>'</strong>
+
+ <table style="width:95%%">
+ <tr>
+ <td style="width:50%%;">Number of reads processed:</td>
+ <td>%(n_reads)s</td>
+ </tr>
+ <tr>
+ <td>Number of reads overlapping SNPs:</td>
+ <td>%(n_reads_used)s</td>
+ </tr>
+ <tr>
+ <td>Number of SNPs used (incl. transitions):</td>
+ <td>%(n_sites_incl_ts)s</td>
+ </tr>
+ <tr>
+ <td>Number of SNPs used (excl. transitions):</td>
+ <td>%(n_sites_excl_ts)s</td>
+ </tr>
+ </table>
+"""
+
+_OVERVIEW_MITOCHONDRIA = """
+ <br>
+
+ <h4>Mitochondrial report from '<em>%(filename)s</em>'</h4>
+
+ <table style="width:95%%">
+ <tr>
+ <td style="width:50%%;">Reference sequence used:</td>
+ <td>%(sequence_name)s</td>
+ </tr>
+ <tr>
+ <td>Reference sequence length:</td>
+ <td>%(sequence_len)s</td>
+ </tr>
+ <tr>
+ <td>Number of sites covered:</td>
+ <td>%(covered_sites)s</td>
+ </tr>
+ <tr>
+ <td>Percentage of sites covered:</td>
+ <td>%(covered_pct)s</td>
+ </tr>
+ <tr>
+ <td>Mean coverage per site:</td>
+ <td>%(mean_coverage)s</td>
+ </tr>
+ </table>
+"""
+
+
+_OVERVIEW_NUCLEAR_COVERAGE = """
+ </div>
+ <div style="display:table-cell;width:25%;max-width:350px;">
+ <div>
+ <strong>Autosomes vs. sex-chromosomes:</strong>
+ </div>
+ <div>
+ <a href="figures/coverage/coverage.pdf">
+ <img src="figures/coverage/coverage.png"
+ style="vertical-align:top;horizontal-align:center;">
+ </a>
+ </div>
+"""
+
+_OVERVIEW_FOOTER = """
+ </div>
+ </div>
+"""
+
+_SAMPLE_LIST_HEADER = """
+ <table summary="List of samples in the reference panel.">
+ <tr>
+ <th>Group(2)</th>
+ <th>Group(3)</th>
+ <th>ID</th>
+ <th>Species</th>
+ <th>Sex</th>
+ <th>Sample Name</th>
+ <th>Publication</th>
+ </tr>
+"""
+
+_SAMPLE_LIST_ROW = """
+ <tr>
+ <td>{Group(2)}
+ <td>{Group(3)}
+ <td>{ID}</td>
+ <td><em>{Species}</em></td>
+ <td>{Sex}</th>
+ <td>{SampleID}</td>
+ <td>{Publication}</td>
+ </tr>
+"""
+
+
+_ADMIXTURE_OVERVIEW = """
+ <p>
+ Admixture proportions estimated using
+ <a href="https://www.genetics.ucla.edu/software/admixture/">ADMIXTURE</a>
+ v{ADMIXTURE} <em>[Alexander <em>et al.</em> 2009]</em>, using default
+ parameters.
+ </p>
+"""
+
+
+_ADMIXTURE_ROW = """
+ <table summary="Admixture between sample and reference populations.">
+ <tr>
+ <th>{K} ancestral groups</th>
+ <th>{K} ancestral groups, excluding transitions</th>
+ </tr>
+ <tr>
+ <td>
+ <a href="figures/admixture/incl_ts_k{K}.pdf">
+ <img style="width:95%" src="figures/admixture/incl_ts_k{K}.png"
+ alt="Admixture for k={K}, including transitions." />
+ </a>
+ <br/>
+ <br/>
+ {Incl_TS}
+ </td>
+ <td>
+ <a href="figures/admixture/excl_ts_k{K}.pdf">
+ <img style="width:95%" src="figures/admixture/excl_ts_k{K}.png"
+ alt="Admixture for k={K}, excluding transitions." />
+ </a>
+ <br/>
+ <br/>
+ {Excl_TS}
+ </td>
+ </tr>
+ </table>
+"""
+
+
+_PCA_SECTION = """
+ <p>
+ Principal Component Analysis carried out using SmartPCA v{SMARTPCA},
+ from the <a
+ href="http://www.hsph.harvard.edu/alkes-price/software/">EIGENSOFT</a>
+ toolkit.
+ </p>
+
+ <table summary="PCA plots comparing sample with the reference panel.">
+ <tr>
+ <th>Including transitions</th>
+ <th>Excluding transitions</th>
+ </tr>
+ <tr>
+ <td>
+ <a href="figures/pca/incl_ts.pdf">
+ <img style="width:95%" src="figures/pca/incl_ts.png"
+ alt="PCA plot, including transitions." />
+ </a>
+ </td>
+ <td>
+ <a href="figures/pca/excl_ts.pdf">
+ <img style="width:95%" src="figures/pca/excl_ts.png"
+ alt="PCA plot, excluding transitions." />
+ </a>
+ </td>
+ </tr>
+ </table>
+"""
+
+
+_TREEMIX_OVERVIEW = """
+ <p>
+ Detection of population mixture using
+ <a href="https://bitbucket.org/nygcresearch/treemix/wiki/Home">TreeMix</a>
+ v{TREEMIX} <em>[Pickrell and Pritchard 2012]</em>; parameters were -k
+ {treemix_k}; -global; and supervised estimation using ancestral groups
+ listed in the Reference Panel{treemix_outgroup}.
+ </p>
+"""
+
+
+_TREEMIX_TREE_ROW = """
+ <table summary="Treemix plots, for {Edges} edge(s).">
+ <tr>
+ <th>Edges = {Edges}</th>
+ <th>Residuals</th>
+ </tr>
+ <tr>
+ <td>
+ <a href="figures/treemix/{Prefix}_{Edges}_tree.pdf">
+ <img style="width:95%"
+ src="figures/treemix/{Prefix}_{Edges}_tree.png"
+ alt="Treemix plot, {Edges} edge(s), incl. transitions." />
+ </a>
+ </td>
+ <td>
+ <a href="figures/treemix/{Prefix}_{Edges}_residuals.pdf">
+ <img style="width:95%"
+ src="figures/treemix/{Prefix}_{Edges}_residuals.png"
+ alt="Treemix plot, {Edges} edge(s), excl. transitions." />
+ </a>
+ </td>
+ </tr>
+ </table>
+
+ <p>
+ Variance explained by model = {Variance}.
+ </p>
+"""
+
+
+_MITOCONDRIAL_SECTION = """
+ <p>
+ Phylogenetic inference performed using RAxML
+ v{RAXML} [<em>Stamatakis 2006</em>].
+ <p>
+
+ <div style="text-align:center;">
+ <a href="figures/mitochondria/mito_phylo.pdf">
+ <img src="figures/mitochondria/mito_phylo.png"
+ alt="Mitochondrial maximum likelihood phylogeny." />
+ </a>
+ </div>
+"""
+
+
+_REFERENCES = """
+ <p>
+ <ul>
+ <li>
+ Alexander <em>et al</em>. "<strong>Fast model-based estimation of
+ ancestry in unrelated individuals</strong>". <em>Genome Res</em>.
+ 2009 Sep;19(9):1655-64.
+ doi:<a href="https://doi.org/10.1101/gr.094052.109">
+ 10.1101/gr.094052.109</a>.
+ PMID:<a href="http://www.ncbi.nlm.nih.gov/pubmed/19648217">
+ 19648217</a>.
+ </li>
+ <li>
+ Li <em>et al</em>. "<strong>The Sequence Alignment/Map format and
+ SAMtools</strong>. <em>Bioinformatics</em>. 2009 Aug
+ 15;25(16):2078-9.
+ doi:<a href="https://doi.org/10.1093/bioinformatics/btp352">
+ 10.1093/bioinformatics/btp352</a>.
+ PMID:<a href="http://www.ncbi.nlm.nih.gov/pubmed/19505943">
+ 19505943</a>.
+ </li>
+ <li>
+ Pickrell and Pritchard. "<strong>Inference of population splits
+ and mixtures from genome-wide allele frequency data</strong>".
+ <em>PLoS Genet</em>. 2012;8(11):e1002967.
+ doi:<a href="https://doi.org/10.1371/journal.pgen.1002967">
+ 10.1371/journal.pgen.1002967</a>.
+ PMID:<a href="http://www.ncbi.nlm.nih.gov/pubmed/23166502">
+ 23166502</a>.
+ </li>
+ <li>
+ Purcell <em>et al</em>. "<strong>PLINK: a tool set for whole-
+ genome association and population-based linkage
+ analyses</strong>". <em>Am J Hum Genet</em>. 2007
+ Sep;81(3):559-75.
+ PMID:<a href="http://www.ncbi.nlm.nih.gov/pubmed/17701901">
+ 17701901</a>.
+ </li>
+ <li>
+ Stamatakis. "<strong>RAxML-VI-HPC: maximum likelihood-based
+ phylogenetic analyses with thousands of taxa and mixed
+ models</strong>". <em>Bioinformatics</em>. 2006 Nov
+ 1;22(21):2688-90. Epub 2006 Aug 23.
+ doi:<a href="https://doi.org/10.1093/bioinformatics/btl446">
+ 10.1093/bioinformatics/btl446</a>.
+ PMID:<a href="http://www.ncbi.nlm.nih.gov/pubmed/16928733">
+ 16928733</a>.
+ </ul>
+ </p>
+"""
+
+
+_HTML_FOOTER = """
+ </div>
+ </div>
+ <div id="footer">
+ This report is based on the PLAIN 1.0 design by
+ <a href="http://www.sixshootermedia.com/">6ix Shooter Media</a>,
+ Creative Commons license.<br/>
+ </div>
+</center>
+</body>
+</html>
+"""
+
+###############################################################################
+
+_SIDEBAR_HEADER = """
+ <div id="sidebar">
+ <h1>Contents</h1>
+ <div class="submenu">
+ <a href="#">Top</a>
+ <a href="#intro">Introduction</a>
+ <a href="#overview">Analysis overview</a>
+ <a href="#samples">Reference Panel</a>
+"""
+
+_SIDEBAR_NUCLEAR = """
+ <a href="#admixture">Admixture Estimates</a>
+ <a href="#pca">PCA Plots</a>
+ <a href="#treemix">Treemix Analyses</a>
+"""
+
+_SIDEBAR_MITO = """
+ <a href="#mito_phylo">MT Phylogeny</a>
+"""
+
+_SIDEBAR_FOOTER = """
+ <a href="#references">References</a>
+ </div>
+ </div>
+"""
diff --git a/paleomix/tools/zonkey/parts/summary.py b/paleomix/tools/zonkey/parts/summary.py
new file mode 100644
index 0000000..60dd874
--- /dev/null
+++ b/paleomix/tools/zonkey/parts/summary.py
@@ -0,0 +1,532 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+
+import paleomix
+import paleomix.resources
+
+from paleomix.node import Node
+
+import paleomix.common.fileutils as fileutils
+
+import paleomix.tools.zonkey.parts.admixture as admixture
+
+from paleomix.tools.zonkey.parts.report import AnalysisReport
+
+
class SummaryNode(Node):
    """Generates 'summary.html' (plus 'summary.css') covering every sample
    in a multi-sample Zonkey run, with a tabbed section per sample."""

    def __init__(self, config, dependencies=()):
        self._root = config.destination
        self._data = config.database
        self._samples = config.samples
        # list() is required for .index() to work on Python 3 and matches
        # the Python 2 behavior of dict.keys()
        self._sample_keys = list(self._samples)

        input_files = set()
        self._reports = {}
        # .items() rather than .iteritems() for Python 2/3 compatibility
        for sample, info in self._samples.items():
            report = AnalysisReport(config=config,
                                    root=os.path.join(self._root, sample),
                                    has_nuc="Nuc" in info["Files"],
                                    has_mt="Mito" in info["Files"])

            input_files.update(report.input_files())
            self._reports[sample] = report

        output_prefix = os.path.join(self._root, "summary")
        Node.__init__(self,
                      description="<SummaryReport -> %r>"
                      % (output_prefix + '.html',),
                      input_files=input_files,
                      output_files=(output_prefix + '.html',
                                    output_prefix + '.css'),
                      dependencies=dependencies)

    def _run(self, _config, temp):
        # Written to the temporary folder; moved into place by _teardown
        with open(os.path.join(temp, "summary.html"), "w") as output_handle:
            menu_entries = self._build_sidemenu()
            html_header = _HTML_HEADER.format(Version=paleomix.__version__,
                                              MenuEntries=menu_entries)
            output_handle.write(html_header)

            for sample in sorted(self._samples):
                self._write_sample_overview(output_handle, sample)

            output_handle.write(_HTML_FOOTER)

    def _teardown(self, config, temp):
        fileutils.make_dirs(self._root)

        fileutils.move_file(os.path.join(temp, "summary.html"),
                            os.path.join(self._root, "summary.html"))

        css_path = paleomix.resources.report("zonkey", "report.css")
        fileutils.copy_file(css_path, os.path.join(self._root, "summary.css"))

    def _build_sidemenu(self):
        """Returns sidebar links, one per sample, in sorted order."""
        lines = []
        for sample in sorted(self._samples):
            lines.append(' <a href="#sample_{0}">{0}</a>'.format(sample))
        return "\n".join(lines)

    def _write_sample_overview(self, handle, sample):
        """Writes the tabbed section for a single sample; which tabs are
        shown depends on the input files provided for that sample."""
        info = self._samples[sample]
        sample_idx = self._sample_keys.index(sample)

        # Renamed from 'admixture' to avoid shadowing the imported module
        admixture_html = " "
        if "Nuc" in info["Files"]:
            admixture_html = self._read_admixture_results(sample)

        handle.write(_SAMPLE_HEADER.format(sample=sample,
                                           admixture=admixture_html))

        handle.write(""" <ul id="tabs">""")
        handle.write(_SAMPLE_TAB_SELECTED.format(sample_idx=sample_idx,
                                                 page=1, title="Overview"))
        handle.write(_SAMPLE_TAB.format(sample_idx=sample_idx,
                                        page=2, title="Sample Data"))

        if "Nuc" in info["Files"]:
            handle.write(_SAMPLE_TAB.format(sample_idx=sample_idx,
                                            page=3, title="PCA Plots"))
            handle.write(_SAMPLE_TAB.format(sample_idx=sample_idx,
                                            page=4, title="Treemix Plots"))

        if "Mito" in info["Files"]:
            handle.write(_SAMPLE_TAB.format(sample_idx=sample_idx,
                                            page=5,
                                            title="Mitochondrial Phylogeny"))

        handle.write(""" </ul>""")

        self._write_overview(handle, sample, sample_idx)

        if "Nuc" in info["Files"]:
            handle.write(_SAMPLE_PAGE_PCA.format(sample=sample,
                                                 sample_idx=sample_idx))
            handle.write(_SAMPLE_PAGE_TREEMIX.format(sample=sample,
                                                     sample_idx=sample_idx))

        if "Mito" in info["Files"]:
            handle.write(_SAMPLE_PAGE_MITO_PHYLO.format(sample=sample,
                                                        sample_idx=sample_idx))

        handle.write(_SAMPLE_FOOTER)

    def _write_overview(self, output_handle, sample, sample_idx,
                        cutoff=admixture.CUTOFF):
        """Writes the 'Overview' and 'Sample Data' tabs; counts how many of
        the four admixture tests (k=2/3, incl/excl transitions) were
        positive for this sample."""
        info = self._samples[sample]
        report = self._reports[sample]

        n_tests = 4
        n_pos_tests = 0

        if "Nuc" in info["Files"]:
            for postfix in ('incl_ts', 'excl_ts'):
                admix_root = os.path.join(self._root, sample,
                                          "results", "admixture")

                for k_groups in (2, 3):
                    filename = os.path.join(admix_root, "%s.%i.Q" % (postfix,
                                                                     k_groups))

                    result = admixture.read_admixture_results(filename,
                                                              self._data,
                                                              k_groups)

                    # A test is positive when more than one ancestral group
                    # contributes at least 'cutoff'
                    if sum(value >= cutoff for _, value in result) > 1:
                        n_pos_tests += 1

            if n_pos_tests:
                tmpl = _SAMPLE_OVERVIEW_INCL_NUCL_POSITIVE
            else:
                tmpl = _SAMPLE_OVERVIEW_INCL_NUCL_NEGATIVE
        else:
            tmpl = _SAMPLE_OVERVIEW_EXCL_NUCL

        output_handle.write(tmpl.format(sample_idx=sample_idx,
                                        n_tests=n_tests,
                                        n_pos_tests=n_pos_tests))
        output_handle.write(_SAMPLE_OVERVIEW_FOOTER.format(sample=sample))

        output_handle.write(_SAMPLE_DATA_HEADER.format(sample_idx=sample_idx))

        if "Nuc" in info["Files"]:
            summary = report.snp_summary()
            output_handle.write(_SAMPLE_DATA_NUCLEAR % summary)

        if "Mito" in info["Files"]:
            summary = report.mito_summary()
            output_handle.write(_SAMPLE_DATA_MITOCHONDRIA % summary)

        output_handle.write(_SAMPLE_DATA_FOOTER)

    def _read_admixture_results(self, sample, cutoff=admixture.CUTOFF):
        """Returns an HTML table of miniature stacked-bar figures for the
        four admixture runs; failed runs are shown as solid gray boxes."""
        lines = []
        lines.append(' <table summary="Admixture overview for sample {}" style="width:125px;">'.format(sample.replace('"', "")))
        lines.append(" <tr>")
        for postfix in ('incl_ts', 'excl_ts'):
            admix_root = os.path.join(self._root, sample,
                                      "results", "admixture")

            for k_groups in (2, 3):
                filename = os.path.join(admix_root, "%s.%i.Q" % (postfix, k_groups))

                lines.append(" <td>")
                try:
                    ancestral_groups = admixture.read_admixture_results(filename, self._data, k_groups, cutoff)
                    lines.extend(self._build_admixture_figure([value for _, value in ancestral_groups]))
                except admixture.AdmixtureError:
                    lines.append(' <div style="height:100px;background:gray"></div>')
                lines.append(" </td>")
        lines.append(" </tr>")
        lines.append(" </table>\n")

        return "\n".join(lines)

    @classmethod
    def _build_admixture_figure(cls, fractions, max_height=100):
        """Returns <div> elements forming a stacked bar of the admixture
        fractions >= CUTOFF, scaled to a total of max_height pixels."""
        lines = []
        if len(fractions) == 2:
            colors = ("red", "green")
        elif len(fractions) == 3:
            colors = ("red", "green", "blue")
        else:
            raise RuntimeError("Unexpected number of fractions: %r"
                               % (fractions,))

        values = [value for value in fractions if value >= admixture.CUTOFF]

        tmpl = ' ' * 18 + '<div style="height:{}px;background:{}"></div>'
        for index, value in enumerate(values):
            height = round((float(value) / sum(values)) * max_height)
            lines.append(tmpl.format(height, colors[index]))

        return lines
+
+
+###############################################################################
+
+# Human-readable labels for the file-name postfixes used to distinguish
+# analyses run with and without transition sites.
+_TS_LABELS = {
+    "incl_ts": "Including transitions",
+    "excl_ts": "Excluding transitions",
+}
+
+
+###############################################################################
+
+_HTML_HEADER = """<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
+<html xmlns="http://www.w3.org/1999/xhtml">
+<head>
+ <title>PALEOMIX Zonkey v{Version}</title>
+ <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1" />
+ <link href="summary.css" rel="stylesheet" type="text/css" />
+
+ <script type="text/javascript">
+ function selectTab(sample, page) {{
+ for (var i = 1; i <= 5; ++i) {{
+ elem_id = "sample" + sample.toString() + ".link" + i.toString()
+
+ var elem = document.getElementById(elem_id);
+ if (elem) {{
+ elem.className = (i == page) ? 'selected' : '';
+ }}
+
+ elem_id = "sample" + sample.toString() + ".page" + i.toString()
+ var elem = document.getElementById(elem_id);
+ if (elem) {{
+ if (i == page) {{
+ if (page == 1) {{
+ elem.className = 'tabContent small';
+ }} else {{
+ elem.className = 'tabContent';
+ }}
+ }} else {{
+ elem.className = 'tabContent hide';
+ }}
+ }}
+ }}
+ }}
+ </script>
+</head>
+<body>
+ <a name="top" id="top"></a>
+ <center>
+ <div id="header">
+ <h1>PALEOMIX Zonkey {Version}</h1>
+ <h2>A pipeline for detection of F1 hybrids in equids.</h2>
+ </div>
+
+ <div id="content" style="width:1050px;">
+ <p class="introduction">
+ Schubert M, Ermini L, Sarkissian CD, Jónsson H, Ginolhac A,
+ Schaefer R, Martin MD, Fernández R, Kircher M, McCue M,
+ Willerslev E, and Orlando L. "<strong>Characterization of ancient and
+ modern genomes by SNP detection and phylogenomic and metagenomic
+ analysis using PALEOMIX</strong>". Nat Protoc. 2014 May;9(5):1056-82.
+ doi:<a href="https://doi.org/10.1038/nprot.2014.063">
+ 10.1038/nprot.2014.063
+ </a>.
+ Epub 2014 Apr 10. PubMed PMID:
+ <a href="http://www.ncbi.nlm.nih.gov/pubmed/24722405">24722405</a>.
+ </p>
+ <div id="sidebar">
+ <h1>Contents</h1>
+ <div class="submenu">
+ <a href="#">Top</a>
+ <a href="#intro">Introduction</a>
+ {MenuEntries}
+ </div>
+ </div>
+ <div id="mainbar">
+ <h1><a name="introduction" id="introduction"></a>Introduction</h1>
+ <div>
+ <div>
+ The Zonkey Pipeline is a easy-to-use pipeline designed for the
+ analyses of low-coverage, ancient DNA derived from historical
+ equid samples, with the purpose of determining the species of
+ the sample, as well as determining possible hybridization between
+ horses, zebras, and asses. This is accomplished by comparing one
+ or more samples aligned against the <em>Equus caballus</em> 2.0
+ reference sequence with a reference panel of modern equids,
+ including wild and domesticated equids.
+ </div>
+ <br/>
+ <div>
+ For more information, please refer to the
+ <a href="http://paleomix.readthedocs.org/en/latest/zonkey_pipeline/index.html">
+ the documentation for the Zonkey pipeline
+ </a>
+ or
+ <a href="http://paleomix.readthedocs.org/en/latest/">
+ the documentation for the PALEOMIX pipeline,
+ </a>
+ on which the Zonkey pipeline is based.
+ </div>
+ </div>
+ <h1><a name="samples" id="samples"></a>Sample Overview</h1>
+"""
+
+
+_SAMPLE_HEADER = """
+ <h2 style="margin: 0px; padding: 0px;">
+ <a name="sample_{sample}" id="sample_{sample}"
+ href="{sample}/report.html">{sample}</a>
+ </h2>
+
+ <table style="margin: 0px; padding: 0px;">
+ <tr>
+ <td style="width:150px; vertical-align:top; padding-top:20px;">
+{admixture}
+ </td>
+ <td>
+"""
+
+_SAMPLE_FOOTER = """
+ </td>
+ </tr>
+ </table>
+"""
+
+_SAMPLE_TAB_SELECTED = """
+ <li>
+ <a id="sample{sample_idx}.link{page}"
+ href="javascript:selectTab({sample_idx}, {page});"
+ class="selected">{title}</a>
+ </li>
+"""
+
+_SAMPLE_TAB = """
+ <li>
+ <a id="sample{sample_idx}.link{page}"
+ href="javascript:selectTab({sample_idx}, {page});">{title}</a>
+ </li>
+"""
+
+
+# Page-1 (overview) tab: nuclear BAM analyzed, one or more positive tests.
+_SAMPLE_OVERVIEW_INCL_NUCL_POSITIVE = """
+              <div class="tabContent small" id="sample{sample_idx}.page1">
+                <div>
+                  Possible admixture detected in {n_pos_tests} of {n_tests} tests.
+                </div>
+"""
+
+
+# Page-1 (overview) tab: nuclear BAM analyzed, no positive tests.
+_SAMPLE_OVERVIEW_INCL_NUCL_NEGATIVE = """
+              <div class="tabContent small" id="sample{sample_idx}.page1">
+                <div>
+                  No admixture detected in nuclear genome.
+                </div>
+"""
+
+
+# Page-1 (overview) tab: no nuclear BAM was provided.
+# NOTE(review): the user-visible sentence below is garbled and misspelled;
+# it should read "no admixture tests were performed". Left byte-identical
+# here because it is runtime output of the quoted upstream patch.
+_SAMPLE_OVERVIEW_EXCL_NUCL = """
+              <div class="tabContent small" id="sample{sample_idx}.page1">
+                <div>
+                  Nuclear BAM not provided; no admixture were tests perfomed.
+                </div>
+"""
+
+
+# Shared footer for the overview tab; links to the full per-sample report.
+_SAMPLE_OVERVIEW_FOOTER = """
+                <div style="text-align:center;">
+                  <a href="{sample}/report.html">(Click here for the full report)</a>
+                </div>
+              </div>
+"""
+
+
+_SAMPLE_DATA_HEADER = """
+ <div class="tabContent hide" id="sample{sample_idx}.page2">
+ <ul>
+"""
+
+
+_SAMPLE_DATA_NUCLEAR = """
+ <strong>Nuclear report:</strong>
+ <ul>
+ <li>BAM file: <em>%(filename)s</em></li>
+ <li>Number of SNPs used (incl. transitions):
+ <em>%(n_sites_incl_ts)s</em></li>
+ <li>Number of SNPs used (excl. transitions):
+ <em>%(n_sites_excl_ts)s</em></li>
+ </ul>
+
+ <br>
+"""
+
+
+_SAMPLE_DATA_MITOCHONDRIA = """
+ <strong>Mitochondrial report:</strong>
+ <ul>
+ <li>BAM file: <em>%(filename)s</em></li>
+ <li>Percentage of sites covered: <em>%(covered_pct)s</em></li>
+ <li>Mean coverage per site: <em>%(mean_coverage)s</em></li>
+ </ul>
+"""
+
+
+_SAMPLE_DATA_FOOTER = """
+ </ul>
+ </div>
+"""
+
+
+_SAMPLE_PAGE_PCA = """
+ <div class="tabContent hide" id="sample{sample_idx}.page3">
+ <table>
+ <tr style="background-color:#f1f0ee">
+ <td style="text-align: center;">
+ <div class="thumbnail">
+ <strong>Including transitions</strong>
+
+ <div class="image">
+ <a href="{sample}/figures/pca/incl_ts.pdf">
+ <img style="width:13em"
+ src="{sample}/figures/pca/incl_ts.png"
+ alt="PCA plot for {sample}, including transitions."/>
+ </a>
+ </div>
+ </div>
+ </td>
+ <td style="text-align: center;">
+ <div class="thumbnail">
+ <strong>Excluding transitions</strong>
+ <div class="image">
+ <a href="{sample}/figures/pca/excl_ts.pdf">
+ <img style="width:13em"
+ src="{sample}/figures/pca/excl_ts.png"
+ alt="PCA plot for {sample}, excluding transitions."/>
+ </a>
+ </div>
+ </div>
+ </td>
+ </tr>
+ </table>
+ </div>
+"""
+
+
+_SAMPLE_PAGE_TREEMIX = """
+ <div class="tabContent hide" id="sample{sample_idx}.page4">
+ <table>
+ <tr style="background-color:#f1f0ee">
+ <td style="text-align: center;">
+ <div class="thumbnail">
+ <strong>Including transitions</strong>
+ <div class="image">
+ <a href="{sample}/figures/treemix/incl_ts_1_tree.pdf">
+ <img style="width:13em"
+ src="{sample}/figures/treemix/incl_ts_1_tree.png"
+ alt="Treemix plot for {sample}, including transitions,
+ with one migration edge."/>
+ </a>
+ </div>
+ </div>
+ </td>
+ <td style="text-align: center;">
+ <div class="thumbnail">
+ <strong>Excluding transitions</strong>
+ <div class="image">
+ <a href="{sample}/figures/treemix/excl_ts_1_tree.pdf">
+ <img style="width:13em"
+ src="{sample}/figures/treemix/excl_ts_1_tree.png"
+ alt="Treemix plot for {sample}, excluding transitions,
+ with one migration edge."></img>
+ </a>
+ </div>
+ </div>
+ </td>
+ </tr>
+ </table>
+ </div>
+"""
+
+
+_SAMPLE_PAGE_MITO_PHYLO = """
+ <div class="tabContent hide" id="sample{sample_idx}.page5">
+ <div class="thumbnail" style="text-align: center;">
+ <div class="image">
+ <a href="{sample}/figures/mitochondria/mito_phylo.pdf">
+ <img style="width:15em"
+ src="{sample}/figures/mitochondria/mito_phylo.png"
+ alt="Mitochondrial phylogeny for {sample}."/>
+ </a>
+ </div>
+ </div>
+ </div>
+"""
+
+
+_HTML_FOOTER = """
+ </div>
+ </div>
+ <div id="footer">
+ This report is based on the PLAIN 1.0 design by
+ <a href="http://www.sixshootermedia.com/">6ix Shooter Media</a>,
+ Creative Commons license.<br/>
+ </div>
+ </center>
+</body>
+</html>
+"""
diff --git a/paleomix/tools/zonkey/pipeline.py b/paleomix/tools/zonkey/pipeline.py
new file mode 100755
index 0000000..8a8ed55
--- /dev/null
+++ b/paleomix/tools/zonkey/pipeline.py
@@ -0,0 +1,458 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2016 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import logging
+import os
+import shutil
+import tarfile
+import time
+
+import paleomix
+import paleomix.yaml
+import paleomix.logger
+
+import paleomix.common.fileutils as fileutils
+
+from paleomix.common.console import \
+ print_err, \
+ print_info, \
+ print_warn
+
+from paleomix.pipeline import \
+ Pypeline
+
+from paleomix.nodes.samtools import \
+ BAMIndexNode
+
+from paleomix.nodes.raxml import \
+ RAxMLRapidBSNode
+
+import paleomix.tools.bam_pipeline.mkfile as bam_mkfile
+
+import paleomix.tools.zonkey.config as zonkey_config
+import paleomix.tools.zonkey.parts.mitochondria as mitochondria
+import paleomix.tools.zonkey.parts.nuclear as nuclear
+import paleomix.tools.zonkey.parts.report as report
+import paleomix.tools.zonkey.parts.summary as summary
+import paleomix.tools.zonkey.parts.common as common_nodes
+
+
+def run_pipeline(config, nodes, msg):
+    """Execute (or merely inspect) a set of Zonkey pipeline nodes.
+
+    Initializes logging, then either handles one of the inspection
+    options (--list-executables / --list-output-files / --list-input-files
+    / --dot-file), returning early, or runs the pipeline proper.
+    Returns a truthy value on success.
+    """
+    pipeline = Pypeline(config)
+    pipeline.add_nodes(nodes)
+
+    # Log-file names are time-stamped to avoid clobbering earlier runs.
+    logfile_template = time.strftime("zonkey_pipeline.%Y%m%d_%H%M%S_%%02i.log")
+    paleomix.logger.initialize(config, logfile_template)
+
+    logger = logging.getLogger(__name__)
+    logger.info(msg)
+
+    if config.list_executables:
+        pipeline.print_required_executables()
+        return True
+    elif config.list_output_files:
+        pipeline.print_output_files()
+        return True
+    elif config.list_input_files:
+        pipeline.print_input_files()
+        return True
+    elif config.dot_file:
+        logger.info("Writing dependency graph to %r ...", config.dot_file)
+        return pipeline.to_dot(config.dot_file)
+
+    return pipeline.run(max_threads=config.max_threads,
+                        progress_ui=config.progress_ui,
+                        dry_run=config.dry_run)
+
+
+def build_plink_nodes(config, data, root, bamfile, dependencies=()):
+    """Build nodes converting a nuclear BAM into PLINK TPED/BED files.
+
+    Returns a dict with key "root" (the plink output directory) plus one
+    entry per postfix ('incl_ts'/'excl_ts') mapping to the node that
+    produces the corresponding BED files.
+
+    NOTE(review): the `data` argument is not referenced in this function;
+    settings are read from `config.database` instead.
+    """
+    plink = {"root": os.path.join(root, 'results', 'plink')}
+
+    ped_node = nuclear.BuildTPEDFilesNode(output_root=plink["root"],
+                                          table=config.tablefile,
+                                          downsample=config.downsample_to,
+                                          bamfile=bamfile,
+                                          dependencies=dependencies)
+
+    for postfix in ('incl_ts', 'excl_ts'):
+        parameters = {
+            "output_prefix": os.path.join(plink["root"], postfix),
+            "tfam": os.path.join(plink["root"], "common.tfam"),
+            "tped": os.path.join(plink["root"], postfix + ".tped"),
+            "plink_parameters": config.database.settings["Plink"],
+            "dependencies": (ped_node,),
+        }
+
+        if config.indep:
+            # Optional LD-based SNP filtering requested on the command line.
+            parameters["indep_filter"] = config.indep
+            parameters["indep_parameters"] = config.indep_params
+
+            bed_node = nuclear.BuildFilteredBEDFilesNode(**parameters)
+        else:
+            bed_node = nuclear.BuildBEDFilesNode(**parameters)
+
+        plink[postfix] = bed_node
+
+    return plink
+
+
+def build_admixture_nodes(config, data, root, plink):
+    """Build ADMIXTURE analysis nodes for one sample.
+
+    For each transition-handling mode ('incl_ts'/'excl_ts') and each K
+    in (2, 3), runs `config.admixture_replicates` replicates, selects
+    the best replicate, and -- unless --admixture-only was given --
+    plots the selected result. Returns the list of terminal nodes.
+    """
+    nodes = []
+    for postfix in ('incl_ts', 'excl_ts'):
+        bed_node = plink[postfix]
+
+        admix_root = os.path.join(root, "results", "admixture")
+        report_root = os.path.join(root, "figures", "admixture")
+        for k_groups in (2, 3):
+            replicates = []
+
+            input_file = os.path.join(plink["root"], postfix + ".bed")
+            for replicate in xrange(config.admixture_replicates):
+                # Each replicate writes into its own zero-padded sub-folder.
+                output_root = os.path.join(admix_root, "%02i" % (replicate,))
+
+                node = nuclear.AdmixtureNode(input_file=input_file,
+                                             output_root=output_root,
+                                             k_groups=k_groups,
+                                             samples=data.samples,
+                                             dependencies=(bed_node,))
+
+                replicates.append(node)
+
+            node = nuclear.SelectBestAdmixtureNode(replicates=replicates,
+                                                   output_root=admix_root)
+
+            if config.admixture_only:
+                nodes.append(node)
+            else:
+                samples = os.path.join(root, "figures", "samples.txt")
+                # NOTE(review): `dependencies` is passed as a bare node
+                # here, not a 1-tuple as elsewhere in this module;
+                # presumably the node base class accepts both forms.
+                plot = nuclear.AdmixturePlotNode(input_file=os.path.join(admix_root, "%s.%i.Q" % (postfix, k_groups)),
+                                                 output_prefix=os.path.join(report_root, "%s_k%i" % (postfix, k_groups)),
+                                                 samples=samples,
+                                                 order=data.sample_order,
+                                                 dependencies=node)
+
+                nodes.append(plot)
+
+    return nodes
+
+
+def build_treemix_nodes(config, data, root, plink):
+    """Build TreeMix nodes for one sample.
+
+    For each transition-handling mode, converts PLINK frequency files to
+    TreeMix input, runs TreeMix with 0 and 1 migration edges, and plots
+    each resulting tree. Returns the list of plotting nodes.
+    """
+    tmix_root = os.path.join(root, 'results', 'treemix')
+
+    nodes = []
+    for postfix in ('incl_ts', 'excl_ts'):
+        plink_prefix = os.path.join(plink["root"], postfix)
+        plink_nodes = plink[postfix]
+
+        freq_node = nuclear.BuildFreqFilesNode(output_prefix=plink_prefix,
+                                               input_prefix=os.path.join(plink["root"], postfix),
+                                               tfam=os.path.join(plink["root"], "common.tfam"),
+                                               parameters=config.database.settings["Plink"],
+                                               dependencies=plink_nodes)
+
+        tmix_prefix = os.path.join(tmix_root, postfix)
+        tmix_file_node = nuclear.FreqToTreemixNode(input_file=plink_prefix + ".frq.strat.gz",
+                                                   output_file=tmix_prefix + ".gz",
+                                                   dependencies=(freq_node,))
+
+        k_snps = config.treemix_k
+        if not k_snps:
+            # No explicit block-size (-k) given; pass a (key, filename)
+            # pair pointing at the site-count recorded in the plink
+            # summary, presumably resolved by TreemixNode at run-time.
+            k_snps = ('n_sites_%s' % (postfix,),
+                      os.path.join(plink["root"], "common.summary"))
+
+        for n_migrations in (0, 1):
+            n_prefix = "%s.%i" % (tmix_prefix, n_migrations)
+
+            tmix_node = nuclear.TreemixNode(data=data,
+                                            input_file=tmix_prefix + ".gz",
+                                            output_prefix=n_prefix,
+                                            m=n_migrations,
+                                            k=k_snps,
+                                            outgroup=config.treemix_outgroup,
+                                            dependencies=(tmix_file_node,))
+
+            samples = os.path.join(root, "figures", "samples.txt")
+            output_prefix = os.path.join(root, "figures", "treemix", "%s_%i" % (postfix, n_migrations))
+            plot_node = nuclear.PlotTreemixNode(samples=samples,
+                                                prefix=n_prefix,
+                                                output_prefix=output_prefix,
+                                                dependencies=(tmix_node,))
+
+            nodes.append(plot_node)
+
+    return nodes
+
+
+def build_pca_nodes(config, data, root, plink):
+    """Build SmartPCA analysis and plotting nodes for one sample.
+
+    NOTE(review): the `config` argument is not referenced here; kept for
+    signature consistency with the other build_* functions.
+    """
+    pca_root = os.path.join(root, 'results', 'pca')
+
+    nodes = []
+    for postfix in ('incl_ts', 'excl_ts'):
+        plink_prefix = os.path.join(plink["root"], postfix)
+        plink_nodes = plink[postfix]
+
+        pca_prefix = os.path.join(pca_root, postfix)
+        pca_node = nuclear.SmartPCANode(input_prefix=plink_prefix,
+                                        output_prefix=pca_prefix,
+                                        nchroms=data.settings["NChroms"],
+                                        dependencies=plink_nodes)
+
+        samples = os.path.join(root, "figures", "samples.txt")
+        pca_plots = os.path.join(root, "figures", "pca", postfix)
+        pca_plot_node = nuclear.PlotPCANode(samples=samples,
+                                            prefix=pca_prefix,
+                                            output_prefix=pca_plots,
+                                            dependencies=pca_node)
+
+        nodes.append(pca_plot_node)
+
+    return nodes
+
+
+def build_coverage_nodes(data, root, nuc_bam, dependencies=()):
+    """Return a 1-tuple containing a node that plots per-contig coverage.
+
+    Note the trailing comma on the return statement: it deliberately
+    wraps the single node in a tuple, so callers can `extend` with the
+    result just as with the other build_* functions.
+    """
+    output_prefix = os.path.join(root, 'figures', 'coverage', 'coverage')
+
+    return nuclear.PlotCoverageNode(contigs=data.contigs,
+                                    input_file=nuc_bam,
+                                    output_prefix=output_prefix,
+                                    dependencies=dependencies),
+
+
+def build_mito_nodes(config, root, bamfile, dependencies=()):
+    """Build mitochondrial-analysis nodes for one sample.
+
+    Builds a consensus sequence from the MT BAM, infers a phylogeny with
+    RAxML rapid bootstrapping, and draws the resulting tree. Returns a
+    1-tuple of nodes, or an empty tuple (with a warning) if the Zonkey
+    database lacks mitochondrial reference sequences.
+    """
+    if config.database.mitochondria is None:
+        print_warn("WARNING: Zonkey database %r does not contain "
+                   "mitochondrial sequences; cannot analyze MT BAM %r!\n"
+                   % (config.tablefile, bamfile))
+        return ()
+
+    samples = os.path.join(root, "figures", "samples.txt")
+
+    mt_prefix = os.path.join(root, "results", "mitochondria", "sequences")
+    alignment = mitochondria.MitoConsensusNode(database=config.tablefile,
+                                               bamfile=bamfile,
+                                               output_prefix=mt_prefix,
+                                               dependencies=dependencies)
+
+    raxml_template = os.path.join(root, "results", "mitochondria", "raxml_%s")
+    phylo = RAxMLRapidBSNode.customize(input_alignment=mt_prefix + ".phy",
+                                       output_template=raxml_template,
+                                       dependencies=(alignment,))
+
+    # 100 rapid-bootstrap replicates under the GTR+Gamma model.
+    phylo.command.set_option("-N", 100)
+    phylo.command.set_option("-m", "GTRGAMMA")
+    phylo = phylo.build_node()
+
+    output_prefix = os.path.join(root, "figures", "mitochondria", "mito_phylo")
+    trees = mitochondria.DrawPhylogenyNode(samples=samples,
+                                           treefile=raxml_template % ("bestTree",),
+                                           bootstraps=raxml_template % ("bootstrap",),
+                                           output_prefix=output_prefix,
+                                           dependencies=(phylo,))
+
+    return (trees,)
+
+
+def build_pipeline(config, root, nuc_bam, mito_bam, cache):
+    """Build all analysis nodes for a single sample.
+
+    `cache` maps BAM paths to their BAMIndexNode so that an index shared
+    between invocations is only created once. Either BAM may be None, in
+    which case the corresponding analyses are skipped.
+    """
+    nodes = []
+    sample_tbl = os.path.join(root, "figures", "samples.txt")
+    samples = common_nodes.WriteSampleList(config=config,
+                                           output_file=sample_tbl)
+
+    if nuc_bam is not None:
+        # When not sampling, BuildTPED relies on indexed access to ease
+        # processing of one chromosome at a time. The index is further required
+        # for idxstats used by the PlotCoverageNode.
+        index = cache.get(nuc_bam)
+        if index is None:
+            index = cache[nuc_bam] = BAMIndexNode(infile=nuc_bam)
+
+        plink = build_plink_nodes(config, config.database, root, nuc_bam,
+                                  dependencies=(samples, index))
+
+        nodes.extend(build_admixture_nodes(config, config.database, root,
+                                           plink))
+
+        if not config.admixture_only:
+            nodes.extend(build_coverage_nodes(config.database,
+                                              root, nuc_bam, (index,)))
+            nodes.extend(build_pca_nodes(config, config.database,
+                                         root, plink))
+            nodes.extend(build_treemix_nodes(config, config.database,
+                                             root, plink))
+
+    if mito_bam is not None and not config.admixture_only:
+        index = cache.get(mito_bam)
+        if index is None:
+            index = cache[mito_bam] = BAMIndexNode(infile=mito_bam)
+
+        nodes.extend(build_mito_nodes(config, root, mito_bam,
+                                      dependencies=(samples, index)))
+
+    if not config.admixture_only:
+        # The per-sample HTML report depends on every node built above.
+        nodes.append(report.ReportNode(config, root, nuc_bam, mito_bam,
+                                       dependencies=nodes))
+
+    return nodes
+
+
+def run_admix_pipeline(config):
+    """Build and run the Zonkey pipeline for every configured sample.
+
+    NOTE(review): returns 1 on pipeline failure but implicitly None on
+    success, unlike the other sub-commands which return 0 explicitly.
+    """
+    print_info("Building Zonkey pipeline ", end='')
+    config.temp_root = os.path.join(config.destination, "temp")
+    if not config.dry_run:
+        fileutils.make_dirs(config.temp_root)
+
+    # Shared BAM-index cache across samples (see build_pipeline).
+    cache = {}
+    nodes = []
+    for sample in config.samples.itervalues():
+        root = sample["Root"]
+        nuc_bam = sample["Files"].get("Nuc")
+        mito_bam = sample["Files"].get("Mito")
+
+        nodes.extend(build_pipeline(config, root, nuc_bam, mito_bam, cache))
+        print_info(".", end='')
+
+    if config.multisample and not config.admixture_only:
+        # Multi-sample runs are rooted in a single combined summary report.
+        nodes = [summary.SummaryNode(config, nodes)]
+    print_info(".")
+
+    if not run_pipeline(config, nodes, "\nRunning Zonkey ..."):
+        return 1
+
+
+def setup_mito_mapping(config):
+    """Write a BAM-pipeline makefile and FASTA references for MT mapping.
+
+    Creates `makefile.yaml` plus one FASTA file per non-excluded
+    mitochondrial record in the Zonkey database, under
+    `config.destination`. Returns 0 on success, or 1 if any of the
+    output files already exist.
+    """
+    genomes_root = os.path.join(config.destination, "genomes")
+    if not os.path.exists(genomes_root):
+        fileutils.make_dirs(genomes_root)
+
+    mkfile_fpath = os.path.join(config.destination, "makefile.yaml")
+
+    filenames = [mkfile_fpath]
+    for name, record in sorted(config.database.mitochondria.iteritems()):
+        filenames.append(os.path.join(genomes_root, "%s.fasta"
+                                      % (record.name,)))
+
+    existing_filenames = [filename for filename in filenames
+                          if os.path.exists(filename)]
+
+    # A bit strict, but avoid accidental overwrites
+    if existing_filenames:
+        print_err("ERROR: Output file(s) already exists, "
+                  "cannot proceed:\n    %s"
+                  % ("\n    ".join(map(repr, existing_filenames),)))
+
+        return 1
+
+    with open(mkfile_fpath, "w") as mkfile:
+        mkfile.write(bam_mkfile.build_makefile(add_prefix_tmpl=False,
+                                               add_sample_tmpl=False))
+
+        mkfile.write("\n\nPrefixes:\n")
+
+        for name, record in sorted(config.database.mitochondria.iteritems()):
+            # Records flagged EXCLUDE in their metadata are skipped.
+            meta = (record.meta or "").upper()
+            if "EXCLUDE" in meta:
+                continue
+
+            mkfile.write("  %s:\n" % (record.name,))
+            mkfile.write("    Path: genomes/%s.fasta\n" % (record.name,))
+
+            # Known sample annotations are emitted as YAML comments only,
+            # for the user's reference.
+            info = config.database.samples.get(record.name)
+            if info is not None:
+                mkfile.write("    # Group: %s\n"
+                             % (info.get('Group(3)', 'NA'),))
+                mkfile.write("    # Species: %s\n"
+                             % (info.get('Species', 'NA'),))
+                mkfile.write("    # Sex: %s\n"
+                             % (info.get('Sex', 'NA'),))
+                mkfile.write("    # Publication: %s\n"
+                             % (info.get('Publication', 'NA'),))
+                mkfile.write("    # Sample ID: %s\n"
+                             % (info.get('SampleID', 'NA'),))
+
+            mkfile.write('\n')
+
+            fasta_fpath = os.path.join(genomes_root,
+                                       "%s.fasta" % (record.name,))
+
+            with open(fasta_fpath, "w") as fasta_handle:
+                fasta_handle.write(str(record))
+                fasta_handle.write("\n")
+
+        mkfile.write("\n")
+
+    return 0
+
+
+def setup_example(config):
+    """Extract the bundled example data-set from the Zonkey database.
+
+    Copies every member of the database tar-file's 'examples/' directory
+    into <destination>/zonkey_pipeline. Returns 0 on success, or 1 if
+    files already exist at the destination or no example data is found.
+    """
+    root = os.path.join(config.destination, 'zonkey_pipeline')
+
+    with tarfile.TarFile(config.tablefile) as tar_handle:
+        example_files = []
+        existing_files = []
+        for member in tar_handle.getmembers():
+            if os.path.dirname(member.name) == 'examples' and member.isfile():
+                example_files.append(member)
+
+                destination = fileutils.reroot_path(root, member.name)
+                if os.path.exists(destination):
+                    existing_files.append(destination)
+
+        if existing_files:
+            print_err("Output files already exist at destination:\n    - %s"
+                      % ("\n    - ".join(map(repr, existing_files))))
+            return 1
+        elif not example_files:
+            print_err("Sample database %r does not contain example data; "
+                      "cannot proceed." % (config.tablefile,))
+            return 1
+
+        if not os.path.exists(root):
+            fileutils.make_dirs(root)
+
+        for member in example_files:
+            destination = fileutils.reroot_path(root, member.name)
+            src_handle = tar_handle.extractfile(member)
+            with open(destination, 'w') as out_handle:
+                shutil.copyfileobj(src_handle, out_handle)
+
+    # NOTE(review): "Sucessfully" below is a typo ("Successfully"); left
+    # byte-identical since it is runtime output of the quoted patch.
+    print_info("Sucessfully saved example data in %r" % (root,))
+
+    return 0
+
+
+def main(argv):
+    """Entry point for the Zonkey tool; dispatches on config.command.
+
+    Returns an exit code: 1 on configuration errors or unknown commands,
+    otherwise whatever the selected sub-command returns.
+    """
+    try:
+        config = zonkey_config.parse_config(argv)
+        if config is None:
+            return 1
+    # Python 2 except syntax; this module pre-dates Python 3 support.
+    except zonkey_config.ConfigError, error:
+        print_err(error)
+        return 1
+
+    if config.command == "run":
+        return run_admix_pipeline(config)
+    elif config.command == "mito":
+        return setup_mito_mapping(config)
+    elif config.command == "example":
+        return setup_example(config)
+
+    return 1
diff --git a/paleomix/ui.py b/paleomix/ui.py
new file mode 100644
index 0000000..7710a3b
--- /dev/null
+++ b/paleomix/ui.py
@@ -0,0 +1,469 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+"""Functions relating to the CLI interface."""
+import datetime
+import multiprocessing
+import optparse
+import os
+import select
+import sys
+import termios
+import time
+import tty
+
+import paleomix.nodegraph
+import paleomix.logger
+import paleomix.common.text as text
+
+from paleomix.common.console import \
+ print_msg, \
+ print_debug, \
+ print_info, \
+ print_err, \
+ print_disabled, \
+ print_warn
+
+
+def add_optiongroup(parser, ui_default="running", color_default="on"):
+    """Adds a "Progress reporting" option-group to an OptionParser
+    object. Note that 'initialize' expects the config object to have
+    these options."""
+    group = optparse.OptionGroup(parser, "Progress reporting")
+    # NOTE(review): the help text below describes only three of the five
+    # allowed choices, and "'summary'; one-line summary only" should
+    # presumably read "'summary' = one-line summary only"; left unchanged
+    # as it is user-visible output.
+    group.add_option("--progress-ui", default=ui_default, type="choice",
+                     choices=("running", "progress", "summary",
+                              "verbose", "quiet"),
+                     help="Select method for displaying the progress of the "
+                          "pipeline: 'running' = Display only currently "
+                          "running nodes; 'progress' = Display changes in "
+                          "state; 'summary'; one-line summary only. "
+                          "[Default is '%default']")
+    group.add_option("--ui-colors", default=color_default,
+                     choices=("on", "off", "force"),
+                     help="Enable, disable, or force the use of color codes "
+                          "when printing the command-line UI. Unless forced, "
+                          "colors will only be printed if STDOUT is a TTY "
+                          "[Default is '%default']")
+    parser.add_option_group(group)
+
+
+def get_ui(ui_name):
+    """Returns a UI instance by name, using the choices allowed by
+    the 'add_optiongroup' function. See keys in 'UI_TYPES'."""
+    # Normalize e.g. 'running' -> 'Running' to match the UI_TYPES keys
+    # (UI_TYPES is defined later in this module).
+    ui_name = ui_name.title()
+    if ui_name not in UI_TYPES:
+        raise ValueError("Unknown UI type %r" % (ui_name,))
+    return UI_TYPES[ui_name]()
+
+
+def set_ui_colors(choice):
+    """Configure console color output; `choice` is one of 'on', 'off',
+    or 'force' (case-insensitive). Raises ValueError otherwise."""
+    import paleomix.common.console as console
+    choice = choice.lower()
+    if choice == "on":
+        console.set_color_output(console.COLORS_ON)
+    elif choice == "off":
+        console.set_color_output(console.COLORS_OFF)
+    elif choice == "force":
+        console.set_color_output(console.COLORS_FORCED)
+    else:
+        raise ValueError("Unknown color setting %r" % (choice,))
+
+
+class CommandLine(object):
+    """Interactive keyboard control for a running pipeline.
+
+    When used as a context-manager on a foreground interactive terminal,
+    puts the TTY into cbreak mode so single key-presses can be read
+    without echo, restoring the previous settings on exit. Supported
+    keys: '+'/'-' adjust the thread limit, 'l' lists running nodes,
+    'h' prints help.
+    """
+    def __init__(self):
+        # Saved termios settings; stays None when stdin/stdout is not a
+        # foreground TTY, which disables all key handling.
+        self._tty_settings = None
+
+    def __enter__(self):
+        assert not self._tty_settings
+        # False if the pipeline is being piped somewhere
+        if sys.stdin.isatty() and sys.stdout.isatty():
+            # False if the process is running in the background
+            if os.getpgrp() == os.tcgetpgrp(sys.stdout.fileno()):
+                try:
+                    # Store old settings
+                    self._tty_settings = termios.tcgetattr(sys.stdin)
+                    # Disable echo
+                    tty.setcbreak(sys.stdin.fileno())
+                except tty.error:
+                    pass  # Silently ignore failures
+
+        return self
+
+    def __exit__(self, _type, _value, _traceback):
+        if self._tty_settings:
+            # Restore settings (re-enable echo)
+            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, self._tty_settings)
+
+    def process_key_presses(self, nodegraph, max_threads, ui):
+        """Drains queued key-presses and applies them; returns the
+        (possibly adjusted) maximum number of threads."""
+        if not self._tty_settings:
+            return max_threads
+
+        help_printed = False
+        old_max_threads = max_threads
+        while self.poll_stdin():
+            character = sys.stdin.read(1)
+            if character == "+":
+                # Capped at the number of CPUs.
+                max_threads = min(multiprocessing.cpu_count(), max_threads + 1)
+            elif character == "-":
+                max_threads = max(1, max_threads - 1)
+            elif character in "lL":
+                # One-off listing of the currently running nodes.
+                print_info(file=sys.stdout)
+                progress_printer = RunningUI()
+                progress_printer.max_threads = max_threads
+                progress_printer.start_time = ui.start_time
+                progress_printer.refresh(nodegraph)
+                progress_printer.flush()
+            elif character in "hH":
+                if help_printed:
+                    continue
+
+                help_printed = True
+                # NOTE(review): "runnning" in the user-visible help text
+                # below is a typo; left byte-identical here.
+                print_info("""
+Commands:
+  Key   Function
+  h     Prints this message.
+  l     Lists the currently runnning nodes.
+  +     Increases the maximum number of threads by one.
+  -     Decreases the maximum number of threads by one; already running tasks
+        are NOT terminated if the number of threads currently used exceeds the
+        resulting maximum.
+""", file=sys.stdout)
+            else:
+                continue
+
+        if max_threads != old_max_threads:
+            print_debug("Maximum number of threads changed from %i to %i."
+                        % (old_max_threads, max_threads), file=sys.stdout)
+
+        return max_threads
+
+    @classmethod
+    def poll_stdin(cls):
+        # Zero-timeout select: True only if a key-press is already queued.
+        return select.select([sys.stdin], [], [], 0) == ([sys.stdin], [], [])
+
+
+class BaseUI(object):
+    """UI base class.
+
+    Can be initialized, but does nothing but collect stats about
+    the pipeline. Subclasses should override at least one of
+    (but still call the BaseUI function) the functions 'flush',
+    'finalize', and/or 'state_changed'.
+
+    In addition, the class contains the following properties:
+      - states -- List containing the observed number of states
+                  for a state-value corresponding to the index
+      - threads -- Est. number of threads used by running nodes.
+
+    These properties should be treated as read-only.
+    """
+
+    def __init__(self):
+        """Basic initializer; must be called in subclasses."""
+        self.states = []
+        self.threads = 0
+        self.max_threads = 0
+        self.start_time = None
+        self._end_time = None
+        # Dirty-flag; set by refresh/state_changed, cleared by flush.
+        self._updated = True
+
+    def flush(self):
+        """Called by the user of the UI to have it print the current
+        state of the pipeline / changes to the pipeline / etc.
+
+        Returns true if node-states have changed since last update.
+        """
+        if self._updated:
+            self._updated = False
+            return True
+        return False
+
+    def finalize(self):
+        """Called by the pipeline at the termination of a run. By default,
+        this function prints the location of the log-file if one was created
+        during the run (e.g. if there were errors), and a summary of all nodes.
+        """
+        # Both timestamps default to 0 if never set (e.g. no node ever ran).
+        runtime = (self._end_time or 0) - (self.start_time or 0)
+
+        if self.states[self.ERROR]:
+            print_err("Done; but errors were detected ...")
+        else:
+            print_info("Done ...")
+
+        print_info()
+        rows = [("  Number of nodes:", sum(self.states)),
+                ("  Number of done nodes:", self.states[self.DONE]),
+                ("  Number of runable nodes:", self.states[self.RUNABLE]),
+                ("  Number of queued nodes:", self.states[self.QUEUED]),
+                ("  Number of outdated nodes:", self.states[self.OUTDATED]),
+                ("  Number of failed nodes:", self.states[self.ERROR]),
+                ("  Pipeline runtime:", _fmt_runtime(runtime))]
+
+        for line in text.padded_table(rows):
+            print_info(line)
+
+        print_info("\nUse --list-output-files to view status of output files.")
+
+        logfile = paleomix.logger.get_logfile()
+        if logfile:
+            print_debug("Log-file located at %r" % (logfile,))
+
+        print_info()
+
+    def refresh(self, nodegraph):
+        """Called when the nodegraph has refreshed, causing state-counts
+        to be recalculated."""
+        self._updated = True
+        self.states, self.threads \
+            = self._count_states(nodegraph, nodegraph.iterflat())
+
+    def state_changed(self, node, old_state, new_state, _is_primary):
+        """Observer function for NodeGraph; counts states of nodes."""
+        self._updated = True
+
+        self.states[old_state] -= 1
+        self.states[new_state] += 1
+        if old_state == self.RUNNING:
+            self.threads -= node.threads
+        elif new_state == self.RUNNING:
+            self.threads += node.threads
+
+        # Track wall-time from the first observed change to the latest.
+        if self.start_time is None:
+            self.start_time = time.time()
+        self._end_time = time.time()
+
+    @classmethod
+    def _count_states(cls, nodegraph, nodes):
+        """Counts the number of each state observed for a set of nodes, and
+        returns these as a list, as well as the estimated number of threads
+        being used by running nodes."""
+        states = [0] * nodegraph.NUMBER_OF_STATES
+        threads = 0
+
+        for node in nodes:
+            state = nodegraph.get_node_state(node)
+
+            states[state] += 1
+            if state == nodegraph.RUNNING:
+                threads += node.threads
+
+        return states, threads
+
+    def _describe_state(self):
+        """Returns a single-line, human-readable summary of the current
+        pipeline state (running/outdated/failed/done counts, thread
+        usage, and elapsed runtime)."""
+        runtime = 0
+        if self.start_time is not None:
+            runtime = time.time() - self.start_time
+
+        fields = [datetime.datetime.now().strftime("%T"),
+                  ' Running ', str(self.states[self.RUNNING]), ' ',
+                  ('task ' if self.states[self.RUNNING] == 1 else 'tasks '),
+                  'using ~%i of max %i threads; ' % (self.threads,
+                                                     self.max_threads)]
+
+        if self.states[self.OUTDATED]:
+            fields.append('%i outdated, ' % (self.states[self.OUTDATED],))
+
+        if self.states[self.ERROR]:
+            fields.append('%i failed, ' % (self.states[self.ERROR],))
+
+        fields.extend(('%i done of %i tasks' % (self.states[self.DONE],
+                                                sum(self.states),),
+                       ' in ', _fmt_runtime(runtime),
+                       '; press \'h\' for help.'))
+
+        return ''.join(fields)
+
+    # Aliases for the NodeGraph state constants, used to index `states`.
+    DONE = paleomix.nodegraph.NodeGraph.DONE
+    RUNNING = paleomix.nodegraph.NodeGraph.RUNNING
+    RUNABLE = paleomix.nodegraph.NodeGraph.RUNABLE
+    QUEUED = paleomix.nodegraph.NodeGraph.QUEUED
+    OUTDATED = paleomix.nodegraph.NodeGraph.OUTDATED
+    ERROR = paleomix.nodegraph.NodeGraph.ERROR
+
+
+class RunningUI(BaseUI):
+    """Prints a summary, and the list of running nodes every
+    time 'flush' is called."""
+
+    def __init__(self):
+        BaseUI.__init__(self)
+        # Nodes currently in the RUNNING state; kept in sync via
+        # refresh() and state_changed().
+        self._running_nodes = []
+
+    def flush(self):
+        """See BaseUI.flush."""
+        if BaseUI.flush(self) and self._running_nodes:
+            self._print_header()
+            # Nodes are listed sorted by their string representation.
+            for node in sorted(map(str, self._running_nodes)):
+                print_info("  - %s" % (node,), file=sys.stdout)
+            print_info(file=sys.stdout)
+
+    def refresh(self, nodegraph):
+        """See BaseUI.refresh."""
+        BaseUI.refresh(self, nodegraph)
+        # Rebuild the running-node list from scratch on a full refresh.
+        self._running_nodes = []
+        for node in nodegraph.iterflat():
+            if nodegraph.get_node_state(node) == self.RUNNING:
+                self._running_nodes.append(node)
+
+    def state_changed(self, node, old_state, new_state, is_primary):
+        """See BaseUI.state_changed."""
+        BaseUI.state_changed(self, node, old_state, new_state, is_primary)
+
+        if old_state == self.RUNNING:
+            self._running_nodes.remove(node)
+        elif new_state == self.RUNNING:
+            self._running_nodes.append(node)
+
+    def _print_header(self):
+        # One-line state summary followed by the log-file location (if any).
+        print_msg('\n%s' % (self._describe_state(),), file=sys.stdout)
+
+        logfile = paleomix.logger.get_logfile()
+        if logfile:
+            print_debug("  Log-file located at %r" % (logfile,),
+                        file=sys.stdout)
+
+
+class ProgressUI(BaseUI):
+    """Progress based UI: Prints nodes when they start running; they finish
+    running; or when they fail running. Changes to state resulting from the
+    above is not printed. Every 20th update is followed by a summary of the
+    current total progress when flush is called."""
+
+    # Print a summary of the current state every N events
+    _SUMMARY_EVERY = 20
+
+    def __init__(self):
+        # Countdown to the next summary; starts at the full interval.
+        self._refresh_count = ProgressUI._SUMMARY_EVERY
+        # Maps node -> start time (time.time()); used to report runtimes.
+        self._runtimes = {}
+        BaseUI.__init__(self)
+
+    def refresh(self, nodegraph):
+        """See BaseUI.refresh."""
+        BaseUI.refresh(self, nodegraph)
+        self._print_summary()
+
+    def state_changed(self, node, old_state, new_state, is_primary):
+        """See BaseUI.state_changed."""
+        BaseUI.state_changed(self, node, old_state, new_state, is_primary)
+        # Only report transitions with a description (started / finished /
+        # failed), and only for primary state changes.
+        if is_primary and (new_state in self._DESCRIPTIONS):
+            self._print_state(node, new_state)
+
+        self._refresh_count -= 1
+        if new_state == self.ERROR:
+            # Force a summary at the next flush following a failure.
+            self._refresh_count = 0
+
+    def flush(self):
+        """See BaseUI.flush."""
+        if (self._refresh_count <= 0):
+            self._refresh_count = ProgressUI._SUMMARY_EVERY
+            self._print_summary()
+
+    def _print_summary(self):
+        """Prints a summary of the pipeline progress."""
+        print_msg()
+        print_msg(self._describe_state(), file=sys.stdout)
+
+        logfile = paleomix.logger.get_logfile()
+        if logfile:
+            print_debug("Log-file located at %r" % (logfile,), file=sys.stdout)
+
+    def _print_state(self, node, new_state):
+        state_label, print_func = self._DESCRIPTIONS[new_state]
+        if new_state == self.RUNNING:
+            # Record the start time; consumed by _get_runtime below.
+            self._runtimes[node] = time.time()
+        elif new_state in (self.RUNNING, self.DONE, self.ERROR):
+            # NOTE(review): RUNNING is unreachable in this tuple, since it is
+            # caught by the branch above; only DONE / ERROR apply here.
+            state_label = "%s (%s)" % (state_label, self._get_runtime(node))
+
+        time_label = datetime.datetime.now().strftime("%T")
+        print_func("%s %s: %s" % (time_label, state_label, node),
+                   file=sys.stdout)
+
+    def _get_runtime(self, node):
+        # Pop so each start-time is used once; defaults to 'now' (i.e. a
+        # runtime of 0) if the node's start was never observed.
+        current_time = time.time()
+        runtime = current_time - self._runtimes.pop(node, current_time)
+        return _fmt_runtime(runtime)
+
+    # Maps states to a (label, print-function) pair; states not listed
+    # here are not reported by 'state_changed'.
+    _DESCRIPTIONS = {
+        BaseUI.DONE: ("Finished", print_disabled),
+        BaseUI.RUNNING: ("Started", print_info),
+        BaseUI.ERROR: ("Failed", print_err),
+    }
+
+
+class SummaryUI(BaseUI):
+    """Single status-line UI: repeatedly overwrites one line (using '\\r'
+    and padding) with the current summary; new failures additionally print
+    the location of the log-file."""
+
+    def __init__(self):
+        # Longest summary printed so far; used to blank out left-over
+        # characters when a shorter summary overwrites a longer one.
+        self._max_len = 0
+        # Set when a (primary) node fails; cleared once reported.
+        self._new_error = False
+        BaseUI.__init__(self)
+
+    def state_changed(self, node, old_state, new_state, is_primary):
+        BaseUI.state_changed(self, node, old_state, new_state, is_primary)
+        self._new_error |= (new_state == self.ERROR and is_primary)
+
+    def flush(self):
+        if BaseUI.flush(self):
+            description = self._describe_state()
+
+            # Pad to the longest line seen, so old text is fully overwritten.
+            self._max_len = max(len(description), self._max_len)
+            print_msg("\r%s" % (description.ljust(self._max_len),), end="",
+                      file=sys.stdout)
+
+            logfile = paleomix.logger.get_logfile()
+            if logfile and self._new_error:
+                print_debug("\nLog-file located at %r" % (logfile,),
+                            file=sys.stdout)
+                self._new_error = False
+            sys.stdout.flush()
+
+    def finalize(self):
+        # Terminate the status-line with a newline before any final output.
+        print_msg(file=sys.stdout)
+        BaseUI.finalize(self)
+
+
+def _fmt_runtime(runtime):
+    """Formats a runtime (in seconds) as a human-readable string, e.g.
+    '5s', '3:07s', or '1:02:03s', depending on the magnitude."""
+    runtime = int(round(runtime))
+
+    if runtime >= 3600:
+        fmt = "{hours}:{mins:02}:{secs:02}s"
+    elif runtime >= 60:
+        fmt = "{mins}:{secs:02}s"
+    else:
+        fmt = "{secs}s"
+
+    # NOTE(review): the int() calls below are redundant; 'runtime' is
+    # already an int at this point.
+    return fmt.format(hours=int(runtime) // 3600,
+                      mins=(int(runtime) // 60) % 60,
+                      secs=(runtime % 60))
+
+
+# No longer provided as distinct implementations; kept as aliases of
+# RunningUI, presumably for backwards compatibility with existing settings.
+VerboseUI = RunningUI
+QuietUI = RunningUI
+
+# Different types of UIs; maps user-selectable names to UI classes.
+UI_TYPES = {
+    "Verbose": VerboseUI,
+    "Quiet": RunningUI,
+    "Running": RunningUI,
+    "Progress": ProgressUI,
+    "Summary": SummaryUI,
+}
diff --git a/paleomix/yaml/CHANGES b/paleomix/yaml/CHANGES
new file mode 100644
index 0000000..8bc18b2
--- /dev/null
+++ b/paleomix/yaml/CHANGES
@@ -0,0 +1,140 @@
+
+For a complete Subversion changelog, see 'http://pyyaml.org/log/pyyaml'.
+
+3.10 (2011-05-30)
+-----------------
+
+* Do not try to build LibYAML bindings on platforms other than CPython
+ (Thank to olt(at)bogosoft(dot)com).
+* Clear cyclic references in the parser and the emitter
+ (Thank to kristjan(at)ccpgames(dot)com).
+* Dropped support for Python 2.3 and 2.4.
+
+3.09 (2009-08-31)
+-----------------
+
+* Fixed an obscure scanner error not reported when there is
+ no line break at the end of the stream (Thank to Ingy).
+* Fixed use of uninitialized memory when emitting anchors with
+ LibYAML bindings (Thank to cegner(at)yahoo-inc(dot)com).
+* Fixed emitting incorrect BOM characters for UTF-16 (Thank to
+ Valentin Nechayev)
+* Fixed the emitter for folded scalars not respecting the preferred
+ line width (Thank to Ingy).
+* Fixed a subtle ordering issue with emitting '%TAG' directives
+ (Thank to Andrey Somov).
+* Fixed performance regression with LibYAML bindings.
+
+
+3.08 (2008-12-31)
+-----------------
+
+* Python 3 support (Thank to Erick Tryzelaar).
+* Use Cython instead of Pyrex to build LibYAML bindings.
+* Refactored support for unicode and byte input/output streams.
+
+
+3.07 (2008-12-29)
+-----------------
+
+* The emitter learned to use an optional indentation indicator
+ for block scalar; thus scalars with leading whitespaces
+ could now be represented in a literal or folded style.
+* The test suite is now included in the source distribution.
+ To run the tests, type 'python setup.py test'.
+* Refactored the test suite: dropped unittest in favor of
+ a custom test appliance.
+* Fixed the path resolver in CDumper.
+* Forced an explicit document end indicator when there is
+ a possibility of parsing ambiguity.
+* More setup.py improvements: the package should be usable
+ when any combination of setuptools, Pyrex and LibYAML
+ is installed.
+* Windows binary packages are built against LibYAML-0.1.2.
+* Minor typos and corrections (Thank to Ingy dot Net
+ and Andrey Somov).
+
+
+3.06 (2008-10-03)
+-----------------
+
+* setup.py checks whether LibYAML is installed and if so, builds
+ and installs LibYAML bindings. To force or disable installation
+ of LibYAML bindings, use '--with-libyaml' or '--without-libyaml'
+ respectively.
+* The source distribution includes compiled Pyrex sources so
+ building LibYAML bindings no longer requires Pyrex installed.
+* 'yaml.load()' raises an exception if the input stream contains
+ more than one YAML document.
+* Fixed exceptions produced by LibYAML bindings.
+* Fixed a dot '.' character being recognized as !!float.
+* Fixed Python 2.3 compatibility issue in constructing !!timestamp values.
+* Windows binary packages are built against the LibYAML stable branch.
+* Added attributes 'yaml.__version__' and 'yaml.__with_libyaml__'.
+
+
+3.05 (2007-05-13)
+-----------------
+
+* Windows binary packages were built with LibYAML trunk.
+* Fixed a bug that prevented processing a live stream of YAML documents in
+  a timely manner (Thanks edward(at)sweetbytes(dot)net).
+* Fixed a bug when the path in add_path_resolver contains boolean values
+ (Thanks jstroud(at)mbi(dot)ucla(dot)edu).
+* Fixed loss of microsecond precision in timestamps
+ (Thanks edemaine(at)mit(dot)edu).
+* Fixed loading an empty YAML stream.
+* Allowed immutable subclasses of YAMLObject.
+* Made the encoding of the unicode->str conversion explicit so that
+ the conversion does not depend on the default Python encoding.
+* Forced emitting float values in a YAML compatible form.
+
+
+3.04 (2006-08-20)
+-----------------
+
+* Include experimental LibYAML bindings.
+* Fully support recursive structures.
+* Sort dictionary keys. Mapping node values are now represented
+ as lists of pairs instead of dictionaries. No longer check
+ for duplicate mapping keys as it didn't work correctly anyway.
+* Fix invalid output of single-quoted scalars in cases when a single
+  quote is not escaped when preceded by whitespaces or line breaks.
+* To make porting easier, rewrite Parser not using generators.
+* Fix handling of unexpected block mapping values.
+* Fix a bug in Representer.represent_object: copy_reg.dispatch_table
+ was not correctly handled.
+* Fix a bug when a block scalar is incorrectly emitted in the simple
+ key context.
+* Hold references to the objects being represented.
+* Make Representer not try to guess !!pairs when a list is represented.
+* Fix timestamp constructing and representing.
+* Fix the 'N' plain scalar being incorrectly recognized as !!bool.
+
+
+3.03 (2006-06-19)
+-----------------
+
+* Fix Python 2.5 compatibility issues.
+* Fix numerous bugs in the float handling.
+* Fix scanning some ill-formed documents.
+* Other minor fixes.
+
+
+3.02 (2006-05-15)
+-----------------
+
+* Fix win32 installer. Apparently bdist_wininst does not work well
+ under Linux.
+* Fix a bug in add_path_resolver.
+* Add the yaml-highlight example. Try to run on a color terminal:
+ `python yaml_hl.py <any_document.yaml`.
+
+
+3.01 (2006-05-07)
+-----------------
+
+* Initial release. The version number reflects the codename
+  of the project (PyYAML 3000) and differentiates it from
+ the abandoned PyYaml module.
+
diff --git a/paleomix/yaml/LICENSE b/paleomix/yaml/LICENSE
new file mode 100644
index 0000000..050ced2
--- /dev/null
+++ b/paleomix/yaml/LICENSE
@@ -0,0 +1,19 @@
+Copyright (c) 2006 Kirill Simonov
+
+Permission is hereby granted, free of charge, to any person obtaining a copy of
+this software and associated documentation files (the "Software"), to deal in
+the Software without restriction, including without limitation the rights to
+use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies
+of the Software, and to permit persons to whom the Software is furnished to do
+so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/paleomix/yaml/PKG-INFO b/paleomix/yaml/PKG-INFO
new file mode 100644
index 0000000..b75c574
--- /dev/null
+++ b/paleomix/yaml/PKG-INFO
@@ -0,0 +1,38 @@
+Metadata-Version: 1.0
+Name: PyYAML
+Version: 3.10
+Summary: YAML parser and emitter for Python
+Home-page: http://pyyaml.org/wiki/PyYAML
+Author: Kirill Simonov
+Author-email: xi at resolvent.net
+License: MIT
+Download-URL: http://pyyaml.org/download/pyyaml/PyYAML-3.10.tar.gz
+Description: YAML is a data serialization format designed for human readability
+ and interaction with scripting languages. PyYAML is a YAML parser
+ and emitter for Python.
+
+ PyYAML features a complete YAML 1.1 parser, Unicode support, pickle
+ support, capable extension API, and sensible error messages. PyYAML
+ supports standard YAML tags and provides Python-specific tags that
+ allow to represent an arbitrary Python object.
+
+ PyYAML is applicable for a broad range of tasks from complex
+        configuration files to object serialization and persistence.
+Platform: Any
+Classifier: Development Status :: 5 - Production/Stable
+Classifier: Intended Audience :: Developers
+Classifier: License :: OSI Approved :: MIT License
+Classifier: Operating System :: OS Independent
+Classifier: Programming Language :: Python
+Classifier: Programming Language :: Python :: 2
+Classifier: Programming Language :: Python :: 2.3
+Classifier: Programming Language :: Python :: 2.4
+Classifier: Programming Language :: Python :: 2.5
+Classifier: Programming Language :: Python :: 2.6
+Classifier: Programming Language :: Python :: 2.7
+Classifier: Programming Language :: Python :: 3
+Classifier: Programming Language :: Python :: 3.0
+Classifier: Programming Language :: Python :: 3.1
+Classifier: Programming Language :: Python :: 3.2
+Classifier: Topic :: Software Development :: Libraries :: Python Modules
+Classifier: Topic :: Text Processing :: Markup
diff --git a/paleomix/yaml/README b/paleomix/yaml/README
new file mode 100644
index 0000000..c1edf13
--- /dev/null
+++ b/paleomix/yaml/README
@@ -0,0 +1,35 @@
+PyYAML - The next generation YAML parser and emitter for Python.
+
+To install, type 'python setup.py install'.
+
+By default, the setup.py script checks whether LibYAML is installed
+and if so, builds and installs LibYAML bindings. To skip the check
+and force installation of LibYAML bindings, use the option '--with-libyaml':
+'python setup.py --with-libyaml install'. To disable the check and
+skip building and installing LibYAML bindings, use '--without-libyaml':
+'python setup.py --without-libyaml install'.
+
+When LibYAML bindings are installed, you may use fast LibYAML-based
+parser and emitter as follows:
+
+ >>> yaml.load(stream, Loader=yaml.CLoader)
+ >>> yaml.dump(data, Dumper=yaml.CDumper)
+
+PyYAML includes a comprehensive test suite. To run the tests,
+type 'python setup.py test'.
+
+For more information, check the PyYAML homepage:
+'http://pyyaml.org/wiki/PyYAML'.
+
+For PyYAML tutorial and reference, see:
+'http://pyyaml.org/wiki/PyYAMLDocumentation'.
+
+Post your questions and opinions to the YAML-Core mailing list:
+'http://lists.sourceforge.net/lists/listinfo/yaml-core'.
+
+Submit bug reports and feature requests to the PyYAML bug tracker:
+'http://pyyaml.org/newticket?component=pyyaml'.
+
+PyYAML is written by Kirill Simonov <xi at resolvent.net>. It is released
+under the MIT license. See the file LICENSE for more details.
+
diff --git a/paleomix/yaml/__init__.py b/paleomix/yaml/__init__.py
new file mode 100644
index 0000000..d34edf4
--- /dev/null
+++ b/paleomix/yaml/__init__.py
@@ -0,0 +1,31 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import sys
+
+# The following import is done to allow (at some point) for the inclusion of
+# the python 3.x version of PyYAML, and the automatic selection of the correct
+# version. Currently the rest of the pipeline only supports Python 2.x.
+# NOTE(review): 'sys.version_info.major' requires Python 2.7+; the named
+# attributes are not available on 2.6.
+if sys.version_info.major == 2:
+    from paleomix.yaml.lib2 import *
+else:
+    raise NotImplementedError("Python 3.x version of PyYAML not bundled yet")
diff --git a/paleomix/yaml/lib2/__init__.py b/paleomix/yaml/lib2/__init__.py
new file mode 100644
index 0000000..77a583d
--- /dev/null
+++ b/paleomix/yaml/lib2/__init__.py
@@ -0,0 +1,310 @@
+
+from error import *
+
+from tokens import *
+from events import *
+from nodes import *
+
+from loader import *
+from dumper import *
+
+# Version of the bundled PyYAML, suffixed to mark the PALEOMIX copy; this
+# bundle does not include the optional LibYAML (C) bindings.
+__version__ = '3.10-paleomix'
+__with_libyaml__ = False
+
+# The scan/parse/compose/load functions below share one pattern: wrap the
+# stream in a Loader, drive it to completion (or yield results lazily), and
+# always call dispose() in a finally-block so the loader releases internal
+# state even if iteration is abandoned or an error occurs.
+def scan(stream, Loader=Loader):
+    """
+    Scan a YAML stream and produce scanning tokens.
+    """
+    loader = Loader(stream)
+    try:
+        while loader.check_token():
+            yield loader.get_token()
+    finally:
+        loader.dispose()
+
+def parse(stream, Loader=Loader):
+    """
+    Parse a YAML stream and produce parsing events.
+    """
+    loader = Loader(stream)
+    try:
+        while loader.check_event():
+            yield loader.get_event()
+    finally:
+        loader.dispose()
+
+def compose(stream, Loader=Loader):
+    """
+    Parse the first YAML document in a stream
+    and produce the corresponding representation tree.
+    """
+    loader = Loader(stream)
+    try:
+        return loader.get_single_node()
+    finally:
+        loader.dispose()
+
+def compose_all(stream, Loader=Loader):
+    """
+    Parse all YAML documents in a stream
+    and produce corresponding representation trees.
+    """
+    loader = Loader(stream)
+    try:
+        while loader.check_node():
+            yield loader.get_node()
+    finally:
+        loader.dispose()
+
+def load(stream, Loader=Loader):
+    """
+    Parse the first YAML document in a stream
+    and produce the corresponding Python object.
+    """
+    loader = Loader(stream)
+    try:
+        return loader.get_single_data()
+    finally:
+        loader.dispose()
+
+def load_all(stream, Loader=Loader):
+    """
+    Parse all YAML documents in a stream
+    and produce corresponding Python objects.
+    """
+    loader = Loader(stream)
+    try:
+        while loader.check_data():
+            yield loader.get_data()
+    finally:
+        loader.dispose()
+
+# The safe_* variants use SafeLoader, which resolves only basic YAML tags
+# (per the docstrings below), rather than the full Loader.
+def safe_load(stream):
+    """
+    Parse the first YAML document in a stream
+    and produce the corresponding Python object.
+    Resolve only basic YAML tags.
+    """
+    return load(stream, SafeLoader)
+
+def safe_load_all(stream):
+    """
+    Parse all YAML documents in a stream
+    and produce corresponding Python objects.
+    Resolve only basic YAML tags.
+    """
+    return load_all(stream, SafeLoader)
+
+def emit(events, stream=None, Dumper=Dumper,
+        canonical=None, indent=None, width=None,
+        allow_unicode=None, line_break=None):
+    """
+    Emit YAML parsing events into a stream.
+    If stream is None, return the produced string instead.
+    """
+    getvalue = None
+    if stream is None:
+        # No output stream given; emit into an in-memory buffer and return
+        # its contents at the end.
+        from StringIO import StringIO
+        stream = StringIO()
+        getvalue = stream.getvalue
+    dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
+            allow_unicode=allow_unicode, line_break=line_break)
+    try:
+        for event in events:
+            dumper.emit(event)
+    finally:
+        # Always release the dumper's internal state.
+        dumper.dispose()
+    if getvalue:
+        return getvalue()
+
+def serialize_all(nodes, stream=None, Dumper=Dumper,
+        canonical=None, indent=None, width=None,
+        allow_unicode=None, line_break=None,
+        encoding='utf-8', explicit_start=None, explicit_end=None,
+        version=None, tags=None):
+    """
+    Serialize a sequence of representation trees into a YAML stream.
+    If stream is None, return the produced string instead.
+    """
+    getvalue = None
+    if stream is None:
+        # Buffer in memory and return the result; cStringIO (bytes only)
+        # is used for encoded output, StringIO for unicode output.
+        if encoding is None:
+            from StringIO import StringIO
+        else:
+            from cStringIO import StringIO
+        stream = StringIO()
+        getvalue = stream.getvalue
+    dumper = Dumper(stream, canonical=canonical, indent=indent, width=width,
+            allow_unicode=allow_unicode, line_break=line_break,
+            encoding=encoding, version=version, tags=tags,
+            explicit_start=explicit_start, explicit_end=explicit_end)
+    try:
+        # open()/close() emit the stream start/end around the documents.
+        dumper.open()
+        for node in nodes:
+            dumper.serialize(node)
+        dumper.close()
+    finally:
+        dumper.dispose()
+    if getvalue:
+        return getvalue()
+
+def serialize(node, stream=None, Dumper=Dumper, **kwds):
+    """
+    Serialize a representation tree into a YAML stream.
+    If stream is None, return the produced string instead.
+    """
+    return serialize_all([node], stream, Dumper=Dumper, **kwds)
+
+def dump_all(documents, stream=None, Dumper=Dumper,
+        default_style=None, default_flow_style=None,
+        canonical=None, indent=None, width=None,
+        allow_unicode=None, line_break=None,
+        encoding='utf-8', explicit_start=None, explicit_end=None,
+        version=None, tags=None):
+    """
+    Serialize a sequence of Python objects into a YAML stream.
+    If stream is None, return the produced string instead.
+    """
+    getvalue = None
+    if stream is None:
+        # Buffer in memory and return the result; cStringIO (bytes only)
+        # is used for encoded output, StringIO for unicode output.
+        if encoding is None:
+            from StringIO import StringIO
+        else:
+            from cStringIO import StringIO
+        stream = StringIO()
+        getvalue = stream.getvalue
+    dumper = Dumper(stream, default_style=default_style,
+            default_flow_style=default_flow_style,
+            canonical=canonical, indent=indent, width=width,
+            allow_unicode=allow_unicode, line_break=line_break,
+            encoding=encoding, version=version, tags=tags,
+            explicit_start=explicit_start, explicit_end=explicit_end)
+    try:
+        # open()/close() emit the stream start/end around the documents.
+        dumper.open()
+        for data in documents:
+            dumper.represent(data)
+        dumper.close()
+    finally:
+        dumper.dispose()
+    if getvalue:
+        return getvalue()
+
+def dump(data, stream=None, Dumper=Dumper, **kwds):
+    """
+    Serialize a Python object into a YAML stream.
+    If stream is None, return the produced string instead.
+    """
+    return dump_all([data], stream, Dumper=Dumper, **kwds)
+
+# The safe_* variants use SafeDumper, producing only basic YAML tags.
+def safe_dump_all(documents, stream=None, **kwds):
+    """
+    Serialize a sequence of Python objects into a YAML stream.
+    Produce only basic YAML tags.
+    If stream is None, return the produced string instead.
+    """
+    return dump_all(documents, stream, Dumper=SafeDumper, **kwds)
+
+def safe_dump(data, stream=None, **kwds):
+    """
+    Serialize a Python object into a YAML stream.
+    Produce only basic YAML tags.
+    If stream is None, return the produced string instead.
+    """
+    return dump_all([data], stream, Dumper=SafeDumper, **kwds)
+
+# The add_* helpers below register resolvers, constructors and representers
+# on the given (or default) Loader / Dumper classes themselves.
+def add_implicit_resolver(tag, regexp, first=None,
+        Loader=Loader, Dumper=Dumper):
+    """
+    Add an implicit scalar detector.
+    If an implicit scalar value matches the given regexp,
+    the corresponding tag is assigned to the scalar.
+    first is a sequence of possible initial characters or None.
+    """
+    Loader.add_implicit_resolver(tag, regexp, first)
+    Dumper.add_implicit_resolver(tag, regexp, first)
+
+def add_path_resolver(tag, path, kind=None, Loader=Loader, Dumper=Dumper):
+    """
+    Add a path based resolver for the given tag.
+    A path is a list of keys that forms a path
+    to a node in the representation tree.
+    Keys can be string values, integers, or None.
+    """
+    Loader.add_path_resolver(tag, path, kind)
+    Dumper.add_path_resolver(tag, path, kind)
+
+def add_constructor(tag, constructor, Loader=Loader):
+    """
+    Add a constructor for the given tag.
+    Constructor is a function that accepts a Loader instance
+    and a node object and produces the corresponding Python object.
+    """
+    Loader.add_constructor(tag, constructor)
+
+def add_multi_constructor(tag_prefix, multi_constructor, Loader=Loader):
+    """
+    Add a multi-constructor for the given tag prefix.
+    Multi-constructor is called for a node if its tag starts with tag_prefix.
+    Multi-constructor accepts a Loader instance, a tag suffix,
+    and a node object and produces the corresponding Python object.
+    """
+    Loader.add_multi_constructor(tag_prefix, multi_constructor)
+
+def add_representer(data_type, representer, Dumper=Dumper):
+    """
+    Add a representer for the given type.
+    Representer is a function accepting a Dumper instance
+    and an instance of the given data type
+    and producing the corresponding representation node.
+    """
+    Dumper.add_representer(data_type, representer)
+
+def add_multi_representer(data_type, multi_representer, Dumper=Dumper):
+    """
+    Add a representer for the given type.
+    Multi-representer is a function accepting a Dumper instance
+    and an instance of the given data type or subtype
+    and producing the corresponding representation node.
+    """
+    Dumper.add_multi_representer(data_type, multi_representer)
+
+class YAMLObjectMetaclass(type):
+    """
+    The metaclass for YAMLObject.
+    """
+    def __init__(cls, name, bases, kwds):
+        super(YAMLObjectMetaclass, cls).__init__(name, bases, kwds)
+        # Automatically register subclasses that define a non-None
+        # 'yaml_tag', so they can be loaded/dumped without explicit
+        # add_constructor / add_representer calls.
+        if 'yaml_tag' in kwds and kwds['yaml_tag'] is not None:
+            cls.yaml_loader.add_constructor(cls.yaml_tag, cls.from_yaml)
+            cls.yaml_dumper.add_representer(cls, cls.to_yaml)
+
+class YAMLObject(object):
+    """
+    An object that can dump itself to a YAML stream
+    and load itself from a YAML stream.
+    """
+
+    __metaclass__ = YAMLObjectMetaclass
+    __slots__ = ()  # no direct instantiation, so allow immutable subclasses
+
+    # Loader/Dumper classes that subclasses are registered with; see
+    # YAMLObjectMetaclass.__init__ above.
+    yaml_loader = Loader
+    yaml_dumper = Dumper
+
+    # Subclasses must set 'yaml_tag' for registration to take place.
+    yaml_tag = None
+    yaml_flow_style = None
+
+    def from_yaml(cls, loader, node):
+        """
+        Convert a representation node to a Python object.
+        """
+        return loader.construct_yaml_object(node, cls)
+    from_yaml = classmethod(from_yaml)
+
+    def to_yaml(cls, dumper, data):
+        """
+        Convert a Python object to a representation node.
+        """
+        return dumper.represent_yaml_object(cls.yaml_tag, data, cls,
+                flow_style=cls.yaml_flow_style)
+    to_yaml = classmethod(to_yaml)
+
diff --git a/paleomix/yaml/lib2/composer.py b/paleomix/yaml/lib2/composer.py
new file mode 100644
index 0000000..06e5ac7
--- /dev/null
+++ b/paleomix/yaml/lib2/composer.py
@@ -0,0 +1,139 @@
+
+__all__ = ['Composer', 'ComposerError']
+
+from error import MarkedYAMLError
+from events import *
+from nodes import *
+
+class ComposerError(MarkedYAMLError):
+    """Raised for structural errors while composing the node tree, e.g.
+    undefined aliases, duplicate anchors, or multiple documents where a
+    single document was expected."""
+    pass
+
+class Composer(object):
+    """Composes the node tree (representation graph) from parser events.
+
+    Used as a mixin: event access (check_event, peek_event, get_event) and
+    tag resolution (descend_resolver, ascend_resolver, resolve) are
+    presumably provided by the other Loader base classes."""
+
+    def __init__(self):
+        # Maps anchor names to already-composed nodes, so aliases (*name)
+        # can refer back to anchored (&name) nodes.
+        self.anchors = {}
+
+    def check_node(self):
+        # Drop the STREAM-START event.
+        if self.check_event(StreamStartEvent):
+            self.get_event()
+
+        # If there are more documents available?
+        return not self.check_event(StreamEndEvent)
+
+    def get_node(self):
+        # Get the root node of the next document.
+        if not self.check_event(StreamEndEvent):
+            return self.compose_document()
+
+    def get_single_node(self):
+        """Composes and returns the root node of the only document in the
+        stream (None for an empty stream); raises ComposerError if the
+        stream contains more than one document."""
+        # Drop the STREAM-START event.
+        self.get_event()
+
+        # Compose a document if the stream is not empty.
+        document = None
+        if not self.check_event(StreamEndEvent):
+            document = self.compose_document()
+
+        # Ensure that the stream contains no more documents.
+        if not self.check_event(StreamEndEvent):
+            event = self.get_event()
+            raise ComposerError("expected a single document in the stream",
+                    document.start_mark, "but found another document",
+                    event.start_mark)
+
+        # Drop the STREAM-END event.
+        self.get_event()
+
+        return document
+
+    def compose_document(self):
+        # Drop the DOCUMENT-START event.
+        self.get_event()
+
+        # Compose the root node.
+        node = self.compose_node(None, None)
+
+        # Drop the DOCUMENT-END event.
+        self.get_event()
+
+        # Anchors are document-scoped; reset them for the next document.
+        self.anchors = {}
+        return node
+
+    def compose_node(self, parent, index):
+        # An alias simply resolves to the previously anchored node.
+        if self.check_event(AliasEvent):
+            event = self.get_event()
+            anchor = event.anchor
+            if anchor not in self.anchors:
+                raise ComposerError(None, None, "found undefined alias %r"
+                        % anchor.encode('utf-8'), event.start_mark)
+            return self.anchors[anchor]
+        event = self.peek_event()
+        anchor = event.anchor
+        if anchor is not None:
+            if anchor in self.anchors:
+                raise ComposerError("found duplicate anchor %r; first occurence"
+                        % anchor.encode('utf-8'), self.anchors[anchor].start_mark,
+                        "second occurence", event.start_mark)
+        # Inform the resolver of our position in the tree, then dispatch
+        # on the event type (scalar / sequence / mapping).
+        self.descend_resolver(parent, index)
+        if self.check_event(ScalarEvent):
+            node = self.compose_scalar_node(anchor)
+        elif self.check_event(SequenceStartEvent):
+            node = self.compose_sequence_node(anchor)
+        elif self.check_event(MappingStartEvent):
+            node = self.compose_mapping_node(anchor)
+        self.ascend_resolver()
+        return node
+
+    def compose_scalar_node(self, anchor):
+        event = self.get_event()
+        tag = event.tag
+        # A missing tag or the non-specific '!' tag must be resolved.
+        if tag is None or tag == u'!':
+            tag = self.resolve(ScalarNode, event.value, event.implicit)
+        node = ScalarNode(tag, event.value,
+                event.start_mark, event.end_mark, style=event.style)
+        if anchor is not None:
+            self.anchors[anchor] = node
+        return node
+
+    def compose_sequence_node(self, anchor):
+        start_event = self.get_event()
+        tag = start_event.tag
+        if tag is None or tag == u'!':
+            tag = self.resolve(SequenceNode, None, start_event.implicit)
+        node = SequenceNode(tag, [],
+                start_event.start_mark, None,
+                flow_style=start_event.flow_style)
+        # Anchor the node *before* composing children, so aliases inside
+        # the sequence may refer back to it (recursive structures).
+        if anchor is not None:
+            self.anchors[anchor] = node
+        index = 0
+        while not self.check_event(SequenceEndEvent):
+            node.value.append(self.compose_node(node, index))
+            index += 1
+        end_event = self.get_event()
+        node.end_mark = end_event.end_mark
+        return node
+
+    def compose_mapping_node(self, anchor):
+        start_event = self.get_event()
+        tag = start_event.tag
+        if tag is None or tag == u'!':
+            tag = self.resolve(MappingNode, None, start_event.implicit)
+        node = MappingNode(tag, [],
+                start_event.start_mark, None,
+                flow_style=start_event.flow_style)
+        if anchor is not None:
+            self.anchors[anchor] = node
+        # Mapping values are stored as a list of (key, value) node pairs
+        # rather than a dict; duplicate keys are deliberately not checked
+        # (see the commented-out code below).
+        while not self.check_event(MappingEndEvent):
+            #key_event = self.peek_event()
+            item_key = self.compose_node(node, None)
+            #if item_key in node.value:
+            #    raise ComposerError("while composing a mapping", start_event.start_mark,
+            #            "found duplicate key", key_event.start_mark)
+            item_value = self.compose_node(node, item_key)
+            #node.value[item_key] = item_value
+            node.value.append((item_key, item_value))
+        end_event = self.get_event()
+        node.end_mark = end_event.end_mark
+        return node
+
diff --git a/paleomix/yaml/lib2/constructor.py b/paleomix/yaml/lib2/constructor.py
new file mode 100644
index 0000000..af231b1
--- /dev/null
+++ b/paleomix/yaml/lib2/constructor.py
@@ -0,0 +1,678 @@
+
+__all__ = ['BaseConstructor', 'SafeConstructor', 'Constructor',
+ 'ConstructorError']
+
+from error import *
+from nodes import *
+
+import datetime
+
+import binascii, re, sys, types
+
class ConstructorError(MarkedYAMLError):
    """Raised when a YAML node cannot be converted to a native object."""
    pass
+
class BaseConstructor(object):
    """Turns a composed node graph into native objects.

    Construction functions are registered per tag (or per tag prefix)
    via the add_constructor()/add_multi_constructor() classmethods.
    Generator-based constructors allow two-step construction of
    recursive structures: the object is yielded first, then populated.
    """

    # Class-level registries; copied on first per-subclass registration
    # so that subclasses do not mutate their parent's tables.
    yaml_constructors = {}
    yaml_multi_constructors = {}

    def __init__(self):
        self.constructed_objects = {}   # node -> finished object (memo)
        self.recursive_objects = {}     # nodes currently under construction
        self.state_generators = []      # pending two-step constructors
        self.deep_construct = False

    def check_data(self):
        # If there are more documents available?
        return self.check_node()

    def get_data(self):
        # Construct and return the next document.
        if self.check_node():
            return self.construct_document(self.get_node())

    def get_single_data(self):
        # Ensure that the stream contains a single document and construct it.
        node = self.get_single_node()
        if node is not None:
            return self.construct_document(node)
        return None

    def construct_document(self, node):
        """Construct one full document and drain all deferred
        (generator-based) constructors before resetting state."""
        data = self.construct_object(node)
        while self.state_generators:
            state_generators = self.state_generators
            self.state_generators = []
            for generator in state_generators:
                # Exhausting the generator runs its post-yield
                # population step; new generators may be queued.
                for dummy in generator:
                    pass
        self.constructed_objects = {}
        self.recursive_objects = {}
        self.deep_construct = False
        return data

    def construct_object(self, node, deep=False):
        """Construct a native object for *node*, dispatching on tag.

        With deep=True, generator constructors are exhausted
        immediately instead of being deferred.
        """
        if node in self.constructed_objects:
            return self.constructed_objects[node]
        if deep:
            old_deep = self.deep_construct
            self.deep_construct = True
        if node in self.recursive_objects:
            # The node is being constructed further up the stack and its
            # constructor cannot handle recursion (non-generator).
            raise ConstructorError(None, None,
                    "found unconstructable recursive node", node.start_mark)
        self.recursive_objects[node] = None
        constructor = None
        tag_suffix = None
        if node.tag in self.yaml_constructors:
            constructor = self.yaml_constructors[node.tag]
        else:
            # Multi-constructors match on tag prefix; the remainder of
            # the tag is passed to the constructor as a suffix.
            for tag_prefix in self.yaml_multi_constructors:
                if node.tag.startswith(tag_prefix):
                    tag_suffix = node.tag[len(tag_prefix):]
                    constructor = self.yaml_multi_constructors[tag_prefix]
                    break
            else:
                # Fallbacks: catch-all multi/plain constructors, then
                # construction by node kind.
                if None in self.yaml_multi_constructors:
                    tag_suffix = node.tag
                    constructor = self.yaml_multi_constructors[None]
                elif None in self.yaml_constructors:
                    constructor = self.yaml_constructors[None]
                elif isinstance(node, ScalarNode):
                    constructor = self.__class__.construct_scalar
                elif isinstance(node, SequenceNode):
                    constructor = self.__class__.construct_sequence
                elif isinstance(node, MappingNode):
                    constructor = self.__class__.construct_mapping
        if tag_suffix is None:
            data = constructor(self, node)
        else:
            data = constructor(self, tag_suffix, node)
        if isinstance(data, types.GeneratorType):
            # Two-step construction: first yield gives the (empty)
            # object; the rest populates it, now or deferred.
            generator = data
            data = generator.next()
            if self.deep_construct:
                for dummy in generator:
                    pass
            else:
                self.state_generators.append(generator)
        self.constructed_objects[node] = data
        del self.recursive_objects[node]
        if deep:
            self.deep_construct = old_deep
        return data

    def construct_scalar(self, node):
        """Default scalar constructor: return the raw scalar value."""
        if not isinstance(node, ScalarNode):
            raise ConstructorError(None, None,
                    "expected a scalar node, but found %s" % node.id,
                    node.start_mark)
        return node.value

    def construct_sequence(self, node, deep=False):
        """Default sequence constructor: a list of constructed items."""
        if not isinstance(node, SequenceNode):
            raise ConstructorError(None, None,
                    "expected a sequence node, but found %s" % node.id,
                    node.start_mark)
        return [self.construct_object(child, deep=deep)
                for child in node.value]

    def construct_mapping(self, node, deep=False):
        """Default mapping constructor: a dict; rejects unhashable and
        duplicate keys."""
        if not isinstance(node, MappingNode):
            raise ConstructorError(None, None,
                    "expected a mapping node, but found %s" % node.id,
                    node.start_mark)
        mapping = {}
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            try:
                hash(key)
            except TypeError, exc:
                raise ConstructorError("while constructing a mapping", node.start_mark,
                        "found unacceptable key (%s)" % exc, key_node.start_mark)
            if key in mapping:
                raise ConstructorError("while constructing a mapping", node.start_mark,
                        "found duplicate key (%s)" % key, key_node.start_mark)
            value = self.construct_object(value_node, deep=deep)
            mapping[key] = value
        return mapping

    def construct_pairs(self, node, deep=False):
        """Like construct_mapping but keeps order and duplicates:
        returns a list of (key, value) pairs."""
        if not isinstance(node, MappingNode):
            raise ConstructorError(None, None,
                    "expected a mapping node, but found %s" % node.id,
                    node.start_mark)
        pairs = []
        for key_node, value_node in node.value:
            key = self.construct_object(key_node, deep=deep)
            value = self.construct_object(value_node, deep=deep)
            pairs.append((key, value))
        return pairs

    def add_constructor(cls, tag, constructor):
        # Copy-on-write so registration on a subclass does not leak
        # into the parent class registry.
        if not 'yaml_constructors' in cls.__dict__:
            cls.yaml_constructors = cls.yaml_constructors.copy()
        cls.yaml_constructors[tag] = constructor
    add_constructor = classmethod(add_constructor)

    def add_multi_constructor(cls, tag_prefix, multi_constructor):
        # Same copy-on-write pattern as add_constructor.
        if not 'yaml_multi_constructors' in cls.__dict__:
            cls.yaml_multi_constructors = cls.yaml_multi_constructors.copy()
        cls.yaml_multi_constructors[tag_prefix] = multi_constructor
    add_multi_constructor = classmethod(add_multi_constructor)
+
class SafeConstructor(BaseConstructor):
    """Constructs only the standard YAML 1.1 types; safe on untrusted
    input (no arbitrary Python object construction)."""

    def construct_scalar(self, node):
        # A mapping may contain the '=' (value) key, whose value then
        # stands in for the scalar as a whole.
        if isinstance(node, MappingNode):
            for key_node, value_node in node.value:
                if key_node.tag == u'tag:yaml.org,2002:value':
                    return self.construct_scalar(value_node)
        return BaseConstructor.construct_scalar(self, node)

    def flatten_mapping(self, node):
        """Expand '<<' merge keys in place.

        Merged entries are prepended so that the mapping's own keys
        take precedence over merged ones.
        """
        merge = []
        index = 0
        while index < len(node.value):
            key_node, value_node = node.value[index]
            if key_node.tag == u'tag:yaml.org,2002:merge':
                del node.value[index]
                if isinstance(value_node, MappingNode):
                    self.flatten_mapping(value_node)
                    merge.extend(value_node.value)
                elif isinstance(value_node, SequenceNode):
                    # A sequence of mappings: earlier mappings win, so
                    # reverse before extending.
                    submerge = []
                    for subnode in value_node.value:
                        if not isinstance(subnode, MappingNode):
                            raise ConstructorError("while constructing a mapping",
                                    node.start_mark,
                                    "expected a mapping for merging, but found %s"
                                    % subnode.id, subnode.start_mark)
                        self.flatten_mapping(subnode)
                        submerge.append(subnode.value)
                    submerge.reverse()
                    for value in submerge:
                        merge.extend(value)
                else:
                    raise ConstructorError("while constructing a mapping", node.start_mark,
                            "expected a mapping or list of mappings for merging, but found %s"
                            % value_node.id, value_node.start_mark)
            elif key_node.tag == u'tag:yaml.org,2002:value':
                # '=' key: treat as a plain string key.
                key_node.tag = u'tag:yaml.org,2002:str'
                index += 1
            else:
                index += 1
        if merge:
            node.value = merge + node.value

    def construct_mapping(self, node, deep=False):
        # Resolve merge keys before the generic dict construction.
        if isinstance(node, MappingNode):
            self.flatten_mapping(node)
        return BaseConstructor.construct_mapping(self, node, deep=deep)

    def construct_yaml_null(self, node):
        # Validate the node shape; the value itself is always None.
        self.construct_scalar(node)
        return None

    # YAML 1.1 boolean spellings (matched case-insensitively).
    bool_values = {
        u'yes':     True,
        u'no':      False,
        u'true':    True,
        u'false':   False,
        u'on':      True,
        u'off':     False,
    }

    def construct_yaml_bool(self, node):
        value = self.construct_scalar(node)
        return self.bool_values[value.lower()]

    def construct_yaml_int(self, node):
        """Parse YAML 1.1 integers: sign, '_' separators, binary (0b),
        hex (0x), octal (leading 0), and base-60 ('1:30') forms."""
        value = str(self.construct_scalar(node))
        value = value.replace('_', '')
        sign = +1
        if value[0] == '-':
            sign = -1
        if value[0] in '+-':
            value = value[1:]
        if value == '0':
            return 0
        elif value.startswith('0b'):
            return sign*int(value[2:], 2)
        elif value.startswith('0x'):
            return sign*int(value[2:], 16)
        elif value[0] == '0':
            return sign*int(value, 8)
        elif ':' in value:
            # Sexagesimal, e.g. 1:30 == 90.
            digits = [int(part) for part in value.split(':')]
            digits.reverse()
            base = 1
            value = 0
            for digit in digits:
                value += digit*base
                base *= 60
            return sign*value
        else:
            return sign*int(value)

    # Compute IEEE infinity/NaN portably (pre float('inf') idiom).
    inf_value = 1e300
    while inf_value != inf_value*inf_value:
        inf_value *= inf_value
    nan_value = -inf_value/inf_value   # Trying to make a quiet NaN (like C99).

    def construct_yaml_float(self, node):
        """Parse YAML 1.1 floats, including .inf/.nan and base-60."""
        value = str(self.construct_scalar(node))
        value = value.replace('_', '').lower()
        sign = +1
        if value[0] == '-':
            sign = -1
        if value[0] in '+-':
            value = value[1:]
        if value == '.inf':
            return sign*self.inf_value
        elif value == '.nan':
            return self.nan_value
        elif ':' in value:
            # Sexagesimal float, e.g. 1:30.5.
            digits = [float(part) for part in value.split(':')]
            digits.reverse()
            base = 1
            value = 0.0
            for digit in digits:
                value += digit*base
                base *= 60
            return sign*value
        else:
            return sign*float(value)

    def construct_yaml_binary(self, node):
        """Decode a !!binary scalar from base64 to a byte string."""
        value = self.construct_scalar(node)
        try:
            return str(value).decode('base64')
        except (binascii.Error, UnicodeEncodeError), exc:
            raise ConstructorError(None, None,
                    "failed to decode base64 data: %s" % exc, node.start_mark)

    # ISO 8601-style timestamp, per the YAML timestamp tag spec;
    # re.X makes the literal whitespace insignificant.
    timestamp_regexp = re.compile(
            ur'''^(?P<year>[0-9][0-9][0-9][0-9])
                -(?P<month>[0-9][0-9]?)
                -(?P<day>[0-9][0-9]?)
                (?:(?:[Tt]|[ \t]+)
                (?P<hour>[0-9][0-9]?)
                :(?P<minute>[0-9][0-9])
                :(?P<second>[0-9][0-9])
                (?:\.(?P<fraction>[0-9]*))?
                (?:[ \t]*(?P<tz>Z|(?P<tz_sign>[-+])(?P<tz_hour>[0-9][0-9]?)
                (?::(?P<tz_minute>[0-9][0-9]))?))?)?$''', re.X)

    def construct_yaml_timestamp(self, node):
        """Build a datetime.date (date-only) or naive datetime.datetime
        (normalized to UTC when a timezone offset is present)."""
        value = self.construct_scalar(node)
        match = self.timestamp_regexp.match(node.value)
        values = match.groupdict()
        year = int(values['year'])
        month = int(values['month'])
        day = int(values['day'])
        if not values['hour']:
            return datetime.date(year, month, day)
        hour = int(values['hour'])
        minute = int(values['minute'])
        second = int(values['second'])
        fraction = 0
        if values['fraction']:
            # Truncate/pad to microseconds.
            fraction = values['fraction'][:6]
            while len(fraction) < 6:
                fraction += '0'
            fraction = int(fraction)
        delta = None
        if values['tz_sign']:
            tz_hour = int(values['tz_hour'])
            tz_minute = int(values['tz_minute'] or 0)
            delta = datetime.timedelta(hours=tz_hour, minutes=tz_minute)
            if values['tz_sign'] == '-':
                delta = -delta
        data = datetime.datetime(year, month, day, hour, minute, second, fraction)
        if delta:
            # Shift to UTC; the result carries no tzinfo.
            data -= delta
        return data

    def construct_yaml_omap(self, node):
        # Note: we do not check for duplicate keys, because it's too
        # CPU-expensive.
        # Generator constructor: yield the list first so recursive
        # references resolve, then populate it.
        omap = []
        yield omap
        if not isinstance(node, SequenceNode):
            raise ConstructorError("while constructing an ordered map", node.start_mark,
                    "expected a sequence, but found %s" % node.id, node.start_mark)
        for subnode in node.value:
            if not isinstance(subnode, MappingNode):
                raise ConstructorError("while constructing an ordered map", node.start_mark,
                        "expected a mapping of length 1, but found %s" % subnode.id,
                        subnode.start_mark)
            if len(subnode.value) != 1:
                raise ConstructorError("while constructing an ordered map", node.start_mark,
                        "expected a single mapping item, but found %d items" % len(subnode.value),
                        subnode.start_mark)
            key_node, value_node = subnode.value[0]
            key = self.construct_object(key_node)
            value = self.construct_object(value_node)
            omap.append((key, value))

    def construct_yaml_pairs(self, node):
        # Note: the same code as `construct_yaml_omap`.
        pairs = []
        yield pairs
        if not isinstance(node, SequenceNode):
            raise ConstructorError("while constructing pairs", node.start_mark,
                    "expected a sequence, but found %s" % node.id, node.start_mark)
        for subnode in node.value:
            if not isinstance(subnode, MappingNode):
                raise ConstructorError("while constructing pairs", node.start_mark,
                        "expected a mapping of length 1, but found %s" % subnode.id,
                        subnode.start_mark)
            if len(subnode.value) != 1:
                raise ConstructorError("while constructing pairs", node.start_mark,
                        "expected a single mapping item, but found %d items" % len(subnode.value),
                        subnode.start_mark)
            key_node, value_node = subnode.value[0]
            key = self.construct_object(key_node)
            value = self.construct_object(value_node)
            pairs.append((key, value))

    def construct_yaml_set(self, node):
        # !!set is represented as a mapping with null values.
        data = set()
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_yaml_str(self, node):
        value = self.construct_scalar(node)
        try:
            # Prefer a plain (byte) str when the text is pure ASCII.
            return value.encode('ascii')
        except UnicodeEncodeError:
            return value

    def construct_yaml_seq(self, node):
        data = []
        yield data
        data.extend(self.construct_sequence(node))

    def construct_yaml_map(self, node):
        data = {}
        yield data
        value = self.construct_mapping(node)
        data.update(value)

    def construct_yaml_object(self, node, cls):
        """Generic object construction: bypass __init__, then restore
        state via __setstate__ or __dict__.update."""
        data = cls.__new__(cls)
        yield data
        if hasattr(data, '__setstate__'):
            state = self.construct_mapping(node, deep=True)
            data.__setstate__(state)
        else:
            state = self.construct_mapping(node)
            data.__dict__.update(state)

    def construct_undefined(self, node):
        # Fallback for any tag without a registered constructor.
        raise ConstructorError(None, None,
                "could not determine a constructor for the tag %r" % node.tag.encode('utf-8'),
                node.start_mark)
+
# Register constructors for all standard YAML 1.1 tags on
# SafeConstructor, plus the catch-all (None) fallback that rejects
# unknown tags.
SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:null',
        SafeConstructor.construct_yaml_null)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:bool',
        SafeConstructor.construct_yaml_bool)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:int',
        SafeConstructor.construct_yaml_int)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:float',
        SafeConstructor.construct_yaml_float)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:binary',
        SafeConstructor.construct_yaml_binary)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:timestamp',
        SafeConstructor.construct_yaml_timestamp)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:omap',
        SafeConstructor.construct_yaml_omap)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:pairs',
        SafeConstructor.construct_yaml_pairs)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:set',
        SafeConstructor.construct_yaml_set)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:str',
        SafeConstructor.construct_yaml_str)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:seq',
        SafeConstructor.construct_yaml_seq)

SafeConstructor.add_constructor(
        u'tag:yaml.org,2002:map',
        SafeConstructor.construct_yaml_map)

# Unknown tags are an error for the safe constructor.
SafeConstructor.add_constructor(None,
        SafeConstructor.construct_undefined)
+
+class Constructor(SafeConstructor):
+
+ def construct_python_str(self, node):
+ return self.construct_scalar(node).encode('utf-8')
+
+ def construct_python_unicode(self, node):
+ return self.construct_scalar(node)
+
+ def construct_python_long(self, node):
+ return long(self.construct_yaml_int(node))
+
+ def construct_python_complex(self, node):
+ return complex(self.construct_scalar(node))
+
+ def construct_python_tuple(self, node):
+ return tuple(self.construct_sequence(node))
+
+ def find_python_module(self, name, mark):
+ if not name:
+ raise ConstructorError("while constructing a Python module", mark,
+ "expected non-empty name appended to the tag", mark)
+ try:
+ __import__(name)
+ except ImportError, exc:
+ raise ConstructorError("while constructing a Python module", mark,
+ "cannot find module %r (%s)" % (name.encode('utf-8'), exc), mark)
+ return sys.modules[name]
+
+ def find_python_name(self, name, mark):
+ if not name:
+ raise ConstructorError("while constructing a Python object", mark,
+ "expected non-empty name appended to the tag", mark)
+ if u'.' in name:
+ module_name, object_name = name.rsplit('.', 1)
+ else:
+ module_name = '__builtin__'
+ object_name = name
+ try:
+ __import__(module_name)
+ except ImportError, exc:
+ raise ConstructorError("while constructing a Python object", mark,
+ "cannot find module %r (%s)" % (module_name.encode('utf-8'), exc), mark)
+ module = sys.modules[module_name]
+ if not hasattr(module, object_name):
+ raise ConstructorError("while constructing a Python object", mark,
+ "cannot find %r in the module %r" % (object_name.encode('utf-8'),
+ module.__name__), mark)
+ return getattr(module, object_name)
+
+ def construct_python_name(self, suffix, node):
+ value = self.construct_scalar(node)
+ if value:
+ raise ConstructorError("while constructing a Python name", node.start_mark,
+ "expected the empty value, but found %r" % value.encode('utf-8'),
+ node.start_mark)
+ return self.find_python_name(suffix, node.start_mark)
+
+ def construct_python_module(self, suffix, node):
+ value = self.construct_scalar(node)
+ if value:
+ raise ConstructorError("while constructing a Python module", node.start_mark,
+ "expected the empty value, but found %r" % value.encode('utf-8'),
+ node.start_mark)
+ return self.find_python_module(suffix, node.start_mark)
+
+ class classobj: pass
+
+ def make_python_instance(self, suffix, node,
+ args=None, kwds=None, newobj=False):
+ if not args:
+ args = []
+ if not kwds:
+ kwds = {}
+ cls = self.find_python_name(suffix, node.start_mark)
+ if newobj and isinstance(cls, type(self.classobj)) \
+ and not args and not kwds:
+ instance = self.classobj()
+ instance.__class__ = cls
+ return instance
+ elif newobj and isinstance(cls, type):
+ return cls.__new__(cls, *args, **kwds)
+ else:
+ return cls(*args, **kwds)
+
+ def set_python_instance_state(self, instance, state):
+ if hasattr(instance, '__setstate__'):
+ instance.__setstate__(state)
+ else:
+ slotstate = {}
+ if isinstance(state, tuple) and len(state) == 2:
+ state, slotstate = state
+ if hasattr(instance, '__dict__'):
+ instance.__dict__.update(state)
+ elif state:
+ slotstate.update(state)
+ for key, value in slotstate.items():
+ setattr(object, key, value)
+
+ def construct_python_object(self, suffix, node):
+ # Format:
+ # !!python/object:module.name { ... state ... }
+ instance = self.make_python_instance(suffix, node, newobj=True)
+ yield instance
+ deep = hasattr(instance, '__setstate__')
+ state = self.construct_mapping(node, deep=deep)
+ self.set_python_instance_state(instance, state)
+
+ def construct_python_object_apply(self, suffix, node, newobj=False):
+ # Format:
+ # !!python/object/apply # (or !!python/object/new)
+ # args: [ ... arguments ... ]
+ # kwds: { ... keywords ... }
+ # state: ... state ...
+ # listitems: [ ... listitems ... ]
+ # dictitems: { ... dictitems ... }
+ # or short format:
+ # !!python/object/apply [ ... arguments ... ]
+ # The difference between !!python/object/apply and !!python/object/new
+ # is how an object is created, check make_python_instance for details.
+ if isinstance(node, SequenceNode):
+ args = self.construct_sequence(node, deep=True)
+ kwds = {}
+ state = {}
+ listitems = []
+ dictitems = {}
+ else:
+ value = self.construct_mapping(node, deep=True)
+ args = value.get('args', [])
+ kwds = value.get('kwds', {})
+ state = value.get('state', {})
+ listitems = value.get('listitems', [])
+ dictitems = value.get('dictitems', {})
+ instance = self.make_python_instance(suffix, node, args, kwds, newobj)
+ if state:
+ self.set_python_instance_state(instance, state)
+ if listitems:
+ instance.extend(listitems)
+ if dictitems:
+ for key in dictitems:
+ instance[key] = dictitems[key]
+ return instance
+
+ def construct_python_object_new(self, suffix, node):
+ return self.construct_python_object_apply(suffix, node, newobj=True)
+
# Register the !!python/* tag constructors on the (unsafe) full
# Constructor; prefix tags use multi-constructors so the class/module
# name travels in the tag suffix.
Constructor.add_constructor(
        u'tag:yaml.org,2002:python/none',
        Constructor.construct_yaml_null)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/bool',
        Constructor.construct_yaml_bool)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/str',
        Constructor.construct_python_str)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/unicode',
        Constructor.construct_python_unicode)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/int',
        Constructor.construct_yaml_int)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/long',
        Constructor.construct_python_long)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/float',
        Constructor.construct_yaml_float)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/complex',
        Constructor.construct_python_complex)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/list',
        Constructor.construct_yaml_seq)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/tuple',
        Constructor.construct_python_tuple)

Constructor.add_constructor(
        u'tag:yaml.org,2002:python/dict',
        Constructor.construct_yaml_map)

Constructor.add_multi_constructor(
        u'tag:yaml.org,2002:python/name:',
        Constructor.construct_python_name)

Constructor.add_multi_constructor(
        u'tag:yaml.org,2002:python/module:',
        Constructor.construct_python_module)

Constructor.add_multi_constructor(
        u'tag:yaml.org,2002:python/object:',
        Constructor.construct_python_object)

Constructor.add_multi_constructor(
        u'tag:yaml.org,2002:python/object/apply:',
        Constructor.construct_python_object_apply)

Constructor.add_multi_constructor(
        u'tag:yaml.org,2002:python/object/new:',
        Constructor.construct_python_object_new)
+
diff --git a/paleomix/yaml/lib2/dumper.py b/paleomix/yaml/lib2/dumper.py
new file mode 100644
index 0000000..f811d2c
--- /dev/null
+++ b/paleomix/yaml/lib2/dumper.py
@@ -0,0 +1,62 @@
+
+__all__ = ['BaseDumper', 'SafeDumper', 'Dumper']
+
+from emitter import *
+from serializer import *
+from representer import *
+from resolver import *
+
class BaseDumper(Emitter, Serializer, BaseRepresenter, BaseResolver):
    """Dumper using only the base representer/resolver (no implicit
    tag resolution, no standard-type representers).

    BUG FIX: the original initialized Representer/Resolver although
    this class inherits BaseRepresenter/BaseResolver.  In Python 2 an
    unbound method must be called with an instance of its class, so
    instantiating BaseDumper raised TypeError; initialize the actual
    base classes instead.
    """

    def __init__(self, stream,
            default_style=None, default_flow_style=None,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None):
        Emitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width,
                allow_unicode=allow_unicode, line_break=line_break)
        Serializer.__init__(self, encoding=encoding,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        BaseRepresenter.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style)
        BaseResolver.__init__(self)
+
class SafeDumper(Emitter, Serializer, SafeRepresenter, Resolver):
    """Dumper restricted to standard YAML types (safe counterpart of
    Dumper); forwards all formatting options to its four mixins."""

    def __init__(self, stream,
            default_style=None, default_flow_style=None,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None):
        # Initialize each mixin explicitly (they do not cooperate via
        # super()); the split of keyword arguments mirrors the bases.
        Emitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width,
                allow_unicode=allow_unicode, line_break=line_break)
        Serializer.__init__(self, encoding=encoding,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        SafeRepresenter.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style)
        Resolver.__init__(self)
+
class Dumper(Emitter, Serializer, Representer, Resolver):
    """Full-featured dumper: standard types plus Python-specific
    representers; forwards all formatting options to its mixins."""

    def __init__(self, stream,
            default_style=None, default_flow_style=None,
            canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None,
            encoding=None, explicit_start=None, explicit_end=None,
            version=None, tags=None):
        # Initialize each mixin explicitly (they do not cooperate via
        # super()); the split of keyword arguments mirrors the bases.
        Emitter.__init__(self, stream, canonical=canonical,
                indent=indent, width=width,
                allow_unicode=allow_unicode, line_break=line_break)
        Serializer.__init__(self, encoding=encoding,
                explicit_start=explicit_start, explicit_end=explicit_end,
                version=version, tags=tags)
        Representer.__init__(self, default_style=default_style,
                default_flow_style=default_flow_style)
        Resolver.__init__(self)
+
diff --git a/paleomix/yaml/lib2/emitter.py b/paleomix/yaml/lib2/emitter.py
new file mode 100644
index 0000000..e5bcdcc
--- /dev/null
+++ b/paleomix/yaml/lib2/emitter.py
@@ -0,0 +1,1140 @@
+
+# Emitter expects events obeying the following grammar:
+# stream ::= STREAM-START document* STREAM-END
+# document ::= DOCUMENT-START node DOCUMENT-END
+# node ::= SCALAR | sequence | mapping
+# sequence ::= SEQUENCE-START node* SEQUENCE-END
+# mapping ::= MAPPING-START (node node)* MAPPING-END
+
+__all__ = ['Emitter', 'EmitterError']
+
+from error import YAMLError
+from events import *
+
class EmitterError(YAMLError):
    """Raised when the emitter receives an event it cannot handle in
    its current state."""
    pass
+
class ScalarAnalysis(object):
    """Plain record of scalar-formatting properties: which output
    styles (plain/quoted/block) a given scalar text permits."""
    def __init__(self, scalar, empty, multiline,
            allow_flow_plain, allow_block_plain,
            allow_single_quoted, allow_double_quoted,
            allow_block):
        self.scalar = scalar                        # the analyzed text
        self.empty = empty                          # zero-length scalar?
        self.multiline = multiline                  # contains line breaks?
        self.allow_flow_plain = allow_flow_plain
        self.allow_block_plain = allow_block_plain
        self.allow_single_quoted = allow_single_quoted
        self.allow_double_quoted = allow_double_quoted
        self.allow_block = allow_block
+
+class Emitter(object):
+
    # Built-in tag handles: '!' for local tags, '!!' for the standard
    # YAML tag namespace.
    DEFAULT_TAG_PREFIXES = {
        u'!' : u'!',
        u'tag:yaml.org,2002:' : u'!!',
    }
+
    def __init__(self, stream, canonical=None, indent=None, width=None,
            allow_unicode=None, line_break=None):
        """Set up the emitter state machine over *stream*.

        indent must be in (1, 10) and width greater than twice the
        indent to override the defaults (2 and 80 respectively).
        """

        # The stream should have the methods `write` and possibly `flush`.
        self.stream = stream

        # Encoding can be overriden by STREAM-START.
        self.encoding = None

        # Emitter is a state machine with a stack of states to handle nested
        # structures.
        self.states = []
        self.state = self.expect_stream_start

        # Current event and the event queue.
        self.events = []
        self.event = None

        # The current indentation level and the stack of previous indents.
        self.indents = []
        self.indent = None

        # Flow level.
        self.flow_level = 0

        # Contexts.
        self.root_context = False
        self.sequence_context = False
        self.mapping_context = False
        self.simple_key_context = False

        # Characteristics of the last emitted character:
        #  - current position.
        #  - is it a whitespace?
        #  - is it an indention character
        #    (indentation space, '-', '?', or ':')?
        self.line = 0
        self.column = 0
        self.whitespace = True
        self.indention = True

        # Whether the document requires an explicit document indicator
        self.open_ended = False

        # Formatting details.
        self.canonical = canonical
        self.allow_unicode = allow_unicode
        self.best_indent = 2
        if indent and 1 < indent < 10:
            self.best_indent = indent
        self.best_width = 80
        if width and width > self.best_indent*2:
            self.best_width = width
        self.best_line_break = u'\n'
        if line_break in [u'\r', u'\n', u'\r\n']:
            self.best_line_break = line_break

        # Tag prefixes.
        self.tag_prefixes = None

        # Prepared anchor and tag.
        self.prepared_anchor = None
        self.prepared_tag = None

        # Scalar analysis and style.
        self.analysis = None
        self.style = None
+
    def dispose(self):
        # Reset the state attributes (to clear self-references)
        # so the emitter can be garbage-collected promptly.
        self.states = []
        self.state = None
+
+ def emit(self, event):
+ self.events.append(event)
+ while not self.need_more_events():
+ self.event = self.events.pop(0)
+ self.state()
+ self.event = None
+
+ # In some cases, we wait for a few next events before emitting.
+
    def need_more_events(self):
        """Whether processing must wait for more queued events.

        Document/sequence/mapping starts need 1/2/3 events of
        lookahead (e.g. to decide on empty-collection formatting).
        """
        if not self.events:
            return True
        event = self.events[0]
        if isinstance(event, DocumentStartEvent):
            return self.need_events(1)
        elif isinstance(event, SequenceStartEvent):
            return self.need_events(2)
        elif isinstance(event, MappingStartEvent):
            return self.need_events(3)
        else:
            return False
+
    def need_events(self, count):
        """True if fewer than *count* events beyond the head are
        buffered, ignoring anything nested inside sub-collections."""
        level = 0
        for event in self.events[1:]:
            if isinstance(event, (DocumentStartEvent, CollectionStartEvent)):
                level += 1
            elif isinstance(event, (DocumentEndEvent, CollectionEndEvent)):
                level -= 1
            elif isinstance(event, StreamEndEvent):
                level = -1
            if level < 0:
                return False
        return (len(self.events) < count+1)
+
    def increase_indent(self, flow=False, indentless=False):
        """Push the current indent and step one level deeper (unless
        indentless, e.g. for block sequences inside mappings)."""
        self.indents.append(self.indent)
        if self.indent is None:
            # First level: flow context starts indented, block at 0.
            if flow:
                self.indent = self.best_indent
            else:
                self.indent = 0
        elif not indentless:
            self.indent += self.best_indent
+
+ # States.
+
+ # Stream handlers.
+
    def expect_stream_start(self):
        """Initial state: consume STREAM-START and set the encoding."""
        if isinstance(self.event, StreamStartEvent):
            # Only adopt the event's encoding if the stream does not
            # already encode for us.
            if self.event.encoding and not getattr(self.stream, 'encoding', None):
                self.encoding = self.event.encoding
            self.write_stream_start()
            self.state = self.expect_first_document_start
        else:
            raise EmitterError("expected StreamStartEvent, but got %s"
                    % self.event)
+
    def expect_nothing(self):
        # Terminal state after STREAM-END; any further event is an error.
        raise EmitterError("expected nothing, but got %s" % self.event)
+
+ # Document handlers.
+
    def expect_first_document_start(self):
        # Same as expect_document_start, but the first document may be
        # implicit (no '---' marker).
        return self.expect_document_start(first=True)
+
    def expect_document_start(self, first=False):
        """Emit document prologue: '%YAML'/'%TAG' directives and the
        '---' marker (omitted for a plain first document); STREAM-END
        closes the stream instead."""
        if isinstance(self.event, DocumentStartEvent):
            if (self.event.version or self.event.tags) and self.open_ended:
                # Close the previous open-ended document before
                # emitting directives.
                self.write_indicator(u'...', True)
                self.write_indent()
            if self.event.version:
                version_text = self.prepare_version(self.event.version)
                self.write_version_directive(version_text)
            self.tag_prefixes = self.DEFAULT_TAG_PREFIXES.copy()
            if self.event.tags:
                handles = self.event.tags.keys()
                handles.sort()
                for handle in handles:
                    prefix = self.event.tags[handle]
                    self.tag_prefixes[prefix] = handle
                    handle_text = self.prepare_tag_handle(handle)
                    prefix_text = self.prepare_tag_prefix(prefix)
                    self.write_tag_directive(handle_text, prefix_text)
            # '---' may be omitted only for a first, unmarked document
            # with no directives and a non-empty root.
            implicit = (first and not self.event.explicit and not self.canonical
                    and not self.event.version and not self.event.tags
                    and not self.check_empty_document())
            if not implicit:
                self.write_indent()
                self.write_indicator(u'---', True)
                if self.canonical:
                    self.write_indent()
            self.state = self.expect_document_root
        elif isinstance(self.event, StreamEndEvent):
            if self.open_ended:
                self.write_indicator(u'...', True)
                self.write_indent()
            self.write_stream_end()
            self.state = self.expect_nothing
        else:
            raise EmitterError("expected DocumentStartEvent, but got %s"
                    % self.event)
+
    def expect_document_end(self):
        """Emit the document epilogue ('...' when explicit) and flush."""
        if isinstance(self.event, DocumentEndEvent):
            self.write_indent()
            if self.event.explicit:
                self.write_indicator(u'...', True)
                self.write_indent()
            self.flush_stream()
            self.state = self.expect_document_start
        else:
            raise EmitterError("expected DocumentEndEvent, but got %s"
                    % self.event)
+
    def expect_document_root(self):
        # The document body is a single node; return to
        # expect_document_end when it is done.
        self.states.append(self.expect_document_end)
        self.expect_node(root=True)
+
+ # Node handlers.
+
    def expect_node(self, root=False, sequence=False, mapping=False,
            simple_key=False):
        """Dispatch on the current event to emit one node (alias,
        scalar, sequence or mapping), remembering the context flags
        used by style decisions downstream."""
        self.root_context = root
        self.sequence_context = sequence
        self.mapping_context = mapping
        self.simple_key_context = simple_key
        if isinstance(self.event, AliasEvent):
            self.expect_alias()
        elif isinstance(self.event, (ScalarEvent, CollectionStartEvent)):
            self.process_anchor(u'&')
            self.process_tag()
            if isinstance(self.event, ScalarEvent):
                self.expect_scalar()
            elif isinstance(self.event, SequenceStartEvent):
                # Flow style is forced inside flow context, in
                # canonical mode, and for empty collections.
                if self.flow_level or self.canonical or self.event.flow_style \
                        or self.check_empty_sequence():
                    self.expect_flow_sequence()
                else:
                    self.expect_block_sequence()
            elif isinstance(self.event, MappingStartEvent):
                if self.flow_level or self.canonical or self.event.flow_style \
                        or self.check_empty_mapping():
                    self.expect_flow_mapping()
                else:
                    self.expect_block_mapping()
        else:
            raise EmitterError("expected NodeEvent, but got %s" % self.event)
+
    def expect_alias(self):
        """Emit an alias reference ('*anchor') and pop the state."""
        if self.event.anchor is None:
            raise EmitterError("anchor is not specified for alias")
        self.process_anchor(u'*')
        self.state = self.states.pop()
+
    def expect_scalar(self):
        """Emit a scalar; indent is pushed/popped around it because
        scalars never change the block indentation."""
        self.increase_indent(flow=True)
        self.process_scalar()
        self.indent = self.indents.pop()
        self.state = self.states.pop()
+
+ # Flow sequence handlers.
+
    def expect_flow_sequence(self):
        # Open '[' and descend one flow level.
        self.write_indicator(u'[', True, whitespace=True)
        self.flow_level += 1
        self.increase_indent(flow=True)
        self.state = self.expect_first_flow_sequence_item
+
    def expect_first_flow_sequence_item(self):
        """First flow-sequence item: no leading ',' is written; an
        immediate SequenceEndEvent closes an empty '[]'."""
        if isinstance(self.event, SequenceEndEvent):
            self.indent = self.indents.pop()
            self.flow_level -= 1
            self.write_indicator(u']', False)
            self.state = self.states.pop()
        else:
            if self.canonical or self.column > self.best_width:
                self.write_indent()
            self.states.append(self.expect_flow_sequence_item)
            self.expect_node(sequence=True)
+
    def expect_flow_sequence_item(self):
        """Subsequent flow-sequence items: write ',' separators and
        close with ']' on SequenceEndEvent."""
        if isinstance(self.event, SequenceEndEvent):
            self.indent = self.indents.pop()
            self.flow_level -= 1
            if self.canonical:
                # Canonical form keeps a trailing comma and newline.
                self.write_indicator(u',', False)
                self.write_indent()
            self.write_indicator(u']', False)
            self.state = self.states.pop()
        else:
            self.write_indicator(u',', False)
            if self.canonical or self.column > self.best_width:
                self.write_indent()
            self.states.append(self.expect_flow_sequence_item)
            self.expect_node(sequence=True)
+
+ # Flow mapping handlers.
+
def expect_flow_mapping(self):
    """Open a flow mapping ('{') and expect its first key."""
    self.write_indicator(u'{', True, whitespace=True)
    self.flow_level += 1
    self.increase_indent(flow=True)
    self.state = self.expect_first_flow_mapping_key
+
def expect_first_flow_mapping_key(self):
    """Emit the first key of a flow mapping, or close it if empty."""
    if isinstance(self.event, MappingEndEvent):
        # Empty mapping: '{}'.
        self.indent = self.indents.pop()
        self.flow_level -= 1
        self.write_indicator(u'}', False)
        self.state = self.states.pop()
    else:
        if self.canonical or self.column > self.best_width:
            self.write_indent()
        if not self.canonical and self.check_simple_key():
            # 'key: value' form; short keys only.
            self.states.append(self.expect_flow_mapping_simple_value)
            self.expect_node(mapping=True, simple_key=True)
        else:
            # Explicit '? key' form for long/complex keys or canonical mode.
            self.write_indicator(u'?', True)
            self.states.append(self.expect_flow_mapping_value)
            self.expect_node(mapping=True)
+
def expect_flow_mapping_key(self):
    """Emit a subsequent flow-mapping key (preceded by ','), or close it."""
    if isinstance(self.event, MappingEndEvent):
        self.indent = self.indents.pop()
        self.flow_level -= 1
        if self.canonical:
            # Canonical output keeps a trailing comma and a line break.
            self.write_indicator(u',', False)
            self.write_indent()
        self.write_indicator(u'}', False)
        self.state = self.states.pop()
    else:
        self.write_indicator(u',', False)
        if self.canonical or self.column > self.best_width:
            self.write_indent()
        if not self.canonical and self.check_simple_key():
            self.states.append(self.expect_flow_mapping_simple_value)
            self.expect_node(mapping=True, simple_key=True)
        else:
            self.write_indicator(u'?', True)
            self.states.append(self.expect_flow_mapping_value)
            self.expect_node(mapping=True)
+
def expect_flow_mapping_simple_value(self):
    """Emit the value of a simple ('key: value') flow-mapping entry."""
    self.write_indicator(u':', False)
    self.states.append(self.expect_flow_mapping_key)
    self.expect_node(mapping=True)
+
def expect_flow_mapping_value(self):
    """Emit the value of an explicit ('? key') flow-mapping entry."""
    if self.canonical or self.column > self.best_width:
        self.write_indent()
    self.write_indicator(u':', True)
    self.states.append(self.expect_flow_mapping_key)
    self.expect_node(mapping=True)
+
+ # Block sequence handlers.
+
def expect_block_sequence(self):
    """Open a block sequence and expect its first item."""
    # A sequence nested directly in a mapping value shares the mapping's
    # indentation level ('indentless' sequence).
    indentless = (self.mapping_context and not self.indention)
    self.increase_indent(flow=False, indentless=indentless)
    self.state = self.expect_first_block_sequence_item
+
def expect_first_block_sequence_item(self):
    """Delegate to expect_block_sequence_item with first=True."""
    return self.expect_block_sequence_item(first=True)
+
+ def expect_block_sequence_item(self, first=False):
+ if not first and isinstance(self.event, SequenceEndEvent):
+ self.indent = self.indents.pop()
+ self.state = self.states.pop()
+ else:
+ self.write_indent()
+ self.write_indicator(u'-', True, indention=True)
+ self.states.append(self.expect_block_sequence_item)
+ self.expect_node(sequence=True)
+
+ # Block mapping handlers.
+
def expect_block_mapping(self):
    """Open a block mapping and expect its first key."""
    self.increase_indent(flow=False)
    self.state = self.expect_first_block_mapping_key
+
def expect_first_block_mapping_key(self):
    """Delegate to expect_block_mapping_key with first=True."""
    return self.expect_block_mapping_key(first=True)
+
+ def expect_block_mapping_key(self, first=False):
+ if not first and isinstance(self.event, MappingEndEvent):
+ self.indent = self.indents.pop()
+ self.state = self.states.pop()
+ else:
+ self.write_indent()
+ if self.check_simple_key():
+ self.states.append(self.expect_block_mapping_simple_value)
+ self.expect_node(mapping=True, simple_key=True)
+ else:
+ self.write_indicator(u'?', True, indention=True)
+ self.states.append(self.expect_block_mapping_value)
+ self.expect_node(mapping=True)
+
def expect_block_mapping_simple_value(self):
    """Emit the value of a simple ('key: value') block-mapping entry."""
    self.write_indicator(u':', False)
    self.states.append(self.expect_block_mapping_key)
    self.expect_node(mapping=True)
+
def expect_block_mapping_value(self):
    """Emit the value of an explicit ('? key') block-mapping entry."""
    self.write_indent()
    self.write_indicator(u':', True, indention=True)
    self.states.append(self.expect_block_mapping_key)
    self.expect_node(mapping=True)
+
+ # Checkers.
+
def check_empty_sequence(self):
    """Return True if the current event starts a sequence that ends immediately."""
    return (isinstance(self.event, SequenceStartEvent) and self.events
            and isinstance(self.events[0], SequenceEndEvent))
+
def check_empty_mapping(self):
    """Return True if the current event starts a mapping that ends immediately."""
    return (isinstance(self.event, MappingStartEvent) and self.events
            and isinstance(self.events[0], MappingEndEvent))
+
def check_empty_document(self):
    """Return True if the next document consists of a single empty scalar."""
    if not isinstance(self.event, DocumentStartEvent) or not self.events:
        return False
    event = self.events[0]
    return (isinstance(event, ScalarEvent) and event.anchor is None
            and event.tag is None and event.implicit and event.value == u'')
+
def check_simple_key(self):
    """Return True if the current node may be emitted as a simple key.

    A simple key must render in fewer than 128 characters and be an alias,
    a non-empty single-line scalar, or an empty collection.  As a side
    effect this caches prepared_anchor/prepared_tag/analysis, which the
    process_* methods reuse.
    """
    length = 0
    if isinstance(self.event, NodeEvent) and self.event.anchor is not None:
        if self.prepared_anchor is None:
            self.prepared_anchor = self.prepare_anchor(self.event.anchor)
        length += len(self.prepared_anchor)
    if isinstance(self.event, (ScalarEvent, CollectionStartEvent)) \
            and self.event.tag is not None:
        if self.prepared_tag is None:
            self.prepared_tag = self.prepare_tag(self.event.tag)
        length += len(self.prepared_tag)
    if isinstance(self.event, ScalarEvent):
        if self.analysis is None:
            self.analysis = self.analyze_scalar(self.event.value)
        length += len(self.analysis.scalar)
    return (length < 128 and (isinstance(self.event, AliasEvent)
        or (isinstance(self.event, ScalarEvent)
            and not self.analysis.empty and not self.analysis.multiline)
        or self.check_empty_sequence() or self.check_empty_mapping()))
+
+ # Anchor, Tag, and Scalar processors.
+
def process_anchor(self, indicator):
    """Write the current event's anchor with *indicator* ('&' or '*'), if any.

    Consumes the cached prepared_anchor (possibly set by check_simple_key).
    """
    if self.event.anchor is None:
        self.prepared_anchor = None
        return
    if self.prepared_anchor is None:
        self.prepared_anchor = self.prepare_anchor(self.event.anchor)
    if self.prepared_anchor:
        self.write_indicator(indicator+self.prepared_anchor, True)
    self.prepared_anchor = None
+
def process_tag(self):
    """Write the current event's tag, omitting it when it can be implied.

    For scalars, implicit[0] means the tag is resolvable from a plain
    scalar and implicit[1] from a quoted one; the tag is skipped when the
    chosen style makes it redundant (unless canonical output forces it).
    """
    tag = self.event.tag
    if isinstance(self.event, ScalarEvent):
        if self.style is None:
            self.style = self.choose_scalar_style()
        if ((not self.canonical or tag is None) and
            ((self.style == '' and self.event.implicit[0])
                or (self.style != '' and self.event.implicit[1]))):
            self.prepared_tag = None
            return
        if self.event.implicit[0] and tag is None:
            # Plain-implicit tag but a non-plain style was chosen: emit the
            # non-specific tag '!' so the parser resolves it the same way.
            tag = u'!'
            self.prepared_tag = None
    else:
        if (not self.canonical or tag is None) and self.event.implicit:
            self.prepared_tag = None
            return
    if tag is None:
        raise EmitterError("tag is not specified")
    if self.prepared_tag is None:
        self.prepared_tag = self.prepare_tag(tag)
    if self.prepared_tag:
        self.write_indicator(self.prepared_tag, True)
    self.prepared_tag = None
+
def choose_scalar_style(self):
    """Pick the output style for the current scalar.

    Returns one of '' (plain), '\\'' (single-quoted), '"' (double-quoted),
    '|' or '>' (block), honoring the event's requested style when the
    scalar analysis allows it; '"' is the universal fallback.
    """
    if self.analysis is None:
        self.analysis = self.analyze_scalar(self.event.value)
    if self.event.style == '"' or self.canonical:
        return '"'
    if not self.event.style and self.event.implicit[0]:
        if (not (self.simple_key_context and
                (self.analysis.empty or self.analysis.multiline))
            and (self.flow_level and self.analysis.allow_flow_plain
                or (not self.flow_level and self.analysis.allow_block_plain))):
            return ''
    if self.event.style and self.event.style in '|>':
        # Block styles are not allowed in flow context or in simple keys.
        if (not self.flow_level and not self.simple_key_context
                and self.analysis.allow_block):
            return self.event.style
    if not self.event.style or self.event.style == '\'':
        if (self.analysis.allow_single_quoted and
                not (self.simple_key_context and self.analysis.multiline)):
            return '\''
    return '"'
+
def process_scalar(self):
    """Write the current scalar in its chosen style and clear cached state."""
    if self.analysis is None:
        self.analysis = self.analyze_scalar(self.event.value)
    if self.style is None:
        self.style = self.choose_scalar_style()
    # Simple keys must stay on one line, so splitting is disabled there.
    split = (not self.simple_key_context)
    #if self.analysis.multiline and split    \
    #        and (not self.style or self.style in '\'\"'):
    #    self.write_indent()
    if self.style == '"':
        self.write_double_quoted(self.analysis.scalar, split)
    elif self.style == '\'':
        self.write_single_quoted(self.analysis.scalar, split)
    elif self.style == '>':
        self.write_folded(self.analysis.scalar)
    elif self.style == '|':
        self.write_literal(self.analysis.scalar)
    else:
        self.write_plain(self.analysis.scalar, split)
    # Analysis and style are per-scalar caches; reset for the next event.
    self.analysis = None
    self.style = None
+
+ # Analyzers.
+
def prepare_version(self, version):
    """Render a (major, minor) YAML version tuple as text; only 1.x is supported."""
    major, minor = version
    if major == 1:
        return u'%d.%d' % (major, minor)
    raise EmitterError("unsupported YAML version: %d.%d" % (major, minor))
+
def prepare_tag_handle(self, handle):
    """Validate a tag handle ('!', '!!' or '!name!') and return it unchanged."""
    if not handle:
        raise EmitterError("tag handle must not be empty")
    if not (handle.startswith(u'!') and handle.endswith(u'!')):
        raise EmitterError("tag handle must start and end with '!': %r"
                % (handle.encode('utf-8')))
    for ch in handle[1:-1]:
        # Only ASCII word characters plus '-' and '_' are permitted.
        is_word = (u'0' <= ch <= u'9' or u'A' <= ch <= u'Z'
                   or u'a' <= ch <= u'z' or ch in u'-_')
        if not is_word:
            raise EmitterError("invalid character %r in the tag handle: %r"
                    % (ch.encode('utf-8'), handle.encode('utf-8')))
    return handle
+
def prepare_tag_prefix(self, prefix):
    """Escape a tag prefix for use in a %TAG directive.

    URI-safe characters are copied through verbatim; any other character
    is UTF-8 percent-encoded.  A leading '!' is always copied as-is.

    Raises EmitterError if *prefix* is empty.
    """
    if not prefix:
        raise EmitterError("tag prefix must not be empty")
    chunks = []
    start = end = 0
    if prefix[0] == u'!':
        end = 1
    while end < len(prefix):
        ch = prefix[end]
        if u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-;/?!:@&=+$,_.~*\'()[]':
            end += 1
        else:
            if start < end:
                chunks.append(prefix[start:end])
            start = end = end+1
            # Iterate via bytearray so each element is an int on both
            # Python 2 and 3; iterating bytes and calling ord() breaks
            # under Python 3 (bytes yield ints, ord(int) raises).
            for byte in bytearray(ch.encode('utf-8')):
                chunks.append(u'%%%02X' % byte)
    if start < end:
        chunks.append(prefix[start:end])
    return u''.join(chunks)
+
def prepare_tag(self, tag):
    """Encode a node tag, using a registered %TAG handle when possible.

    The longest registered prefix of *tag* is replaced by its handle;
    otherwise the tag is emitted verbatim as '!<...>'.  Unsafe characters
    in the suffix are UTF-8 percent-encoded.

    Raises EmitterError if *tag* is empty.
    """
    if not tag:
        raise EmitterError("tag must not be empty")
    if tag == u'!':
        return tag
    handle = None
    suffix = tag
    # sorted() works on both Python 2 and 3; dict.keys() has no .sort()
    # method under Python 3.  Scanning in ascending order makes the
    # longest matching prefix win (any shorter match is overwritten).
    for prefix in sorted(self.tag_prefixes):
        if tag.startswith(prefix) \
                and (prefix == u'!' or len(prefix) < len(tag)):
            handle = self.tag_prefixes[prefix]
            suffix = tag[len(prefix):]
    chunks = []
    start = end = 0
    while end < len(suffix):
        ch = suffix[end]
        if u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-;/?:@&=+$,_.~*\'()[]' \
                or (ch == u'!' and handle != u'!'):
            end += 1
        else:
            if start < end:
                chunks.append(suffix[start:end])
            start = end = end+1
            # bytearray iteration yields ints on both Python 2 and 3.
            for byte in bytearray(ch.encode('utf-8')):
                chunks.append(u'%%%02X' % byte)
    if start < end:
        chunks.append(suffix[start:end])
    suffix_text = u''.join(chunks)
    if handle:
        return u'%s%s' % (handle, suffix_text)
    else:
        return u'!<%s>' % suffix_text
+
def prepare_anchor(self, anchor):
    """Validate an anchor name (alphanumeric, '-' and '_') and return it."""
    if not anchor:
        raise EmitterError("anchor must not be empty")
    for ch in anchor:
        is_word = (u'0' <= ch <= u'9' or u'A' <= ch <= u'Z'
                   or u'a' <= ch <= u'z' or ch in u'-_')
        if not is_word:
            raise EmitterError("invalid character %r in the anchor: %r"
                    % (ch.encode('utf-8'), anchor.encode('utf-8')))
    return anchor
+
def analyze_scalar(self, scalar):
    """Scan *scalar* once and return a ScalarAnalysis describing which
    output styles (plain/single/double-quoted/block) may represent it.
    """

    # Empty scalar is a special case.
    if not scalar:
        return ScalarAnalysis(scalar=scalar, empty=True, multiline=False,
                allow_flow_plain=False, allow_block_plain=True,
                allow_single_quoted=True, allow_double_quoted=True,
                allow_block=False)

    # Indicators and special characters.
    block_indicators = False
    flow_indicators = False
    line_breaks = False
    special_characters = False

    # Important whitespace combinations.
    leading_space = False
    leading_break = False
    trailing_space = False
    trailing_break = False
    break_space = False
    space_break = False

    # Check document indicators.
    if scalar.startswith(u'---') or scalar.startswith(u'...'):
        block_indicators = True
        flow_indicators = True

    # First character or preceded by a whitespace.
    preceeded_by_whitespace = True

    # Last character or followed by a whitespace.
    followed_by_whitespace = (len(scalar) == 1 or
            scalar[1] in u'\0 \t\r\n\x85\u2028\u2029')

    # The previous character is a space.
    previous_space = False

    # The previous character is a break.
    previous_break = False

    index = 0
    while index < len(scalar):
        ch = scalar[index]

        # Check for indicators.
        if index == 0:
            # Leading indicators are special characters.
            if ch in u'#,[]{}&*!|>\'\"%@`':
                flow_indicators = True
                block_indicators = True
            if ch in u'?:':
                flow_indicators = True
                if followed_by_whitespace:
                    block_indicators = True
            if ch == u'-' and followed_by_whitespace:
                flow_indicators = True
                block_indicators = True
        else:
            # Some indicators cannot appear within a scalar as well.
            if ch in u',?[]{}':
                flow_indicators = True
            if ch == u':':
                flow_indicators = True
                if followed_by_whitespace:
                    block_indicators = True
            if ch == u'#' and preceeded_by_whitespace:
                flow_indicators = True
                block_indicators = True

        # Check for line breaks, special, and unicode characters.
        if ch in u'\n\x85\u2028\u2029':
            line_breaks = True
        if not (ch == u'\n' or u'\x20' <= ch <= u'\x7E'):
            if (ch == u'\x85' or u'\xA0' <= ch <= u'\uD7FF'
                    or u'\uE000' <= ch <= u'\uFFFD') and ch != u'\uFEFF':
                # NOTE(review): unicode_characters is assigned but never
                # read anywhere in this method (dead store; kept as-is).
                unicode_characters = True
                if not self.allow_unicode:
                    special_characters = True
            else:
                special_characters = True

        # Detect important whitespace combinations.
        if ch == u' ':
            if index == 0:
                leading_space = True
            if index == len(scalar)-1:
                trailing_space = True
            if previous_break:
                break_space = True
            previous_space = True
            previous_break = False
        elif ch in u'\n\x85\u2028\u2029':
            if index == 0:
                leading_break = True
            if index == len(scalar)-1:
                trailing_break = True
            if previous_space:
                space_break = True
            previous_space = False
            previous_break = True
        else:
            previous_space = False
            previous_break = False

        # Prepare for the next character.
        index += 1
        preceeded_by_whitespace = (ch in u'\0 \t\r\n\x85\u2028\u2029')
        followed_by_whitespace = (index+1 >= len(scalar) or
                scalar[index+1] in u'\0 \t\r\n\x85\u2028\u2029')

    # Let's decide what styles are allowed.
    allow_flow_plain = True
    allow_block_plain = True
    allow_single_quoted = True
    allow_double_quoted = True
    allow_block = True

    # Leading and trailing whitespaces are bad for plain scalars.
    if (leading_space or leading_break
            or trailing_space or trailing_break):
        allow_flow_plain = allow_block_plain = False

    # We do not permit trailing spaces for block scalars.
    if trailing_space:
        allow_block = False

    # Spaces at the beginning of a new line are only acceptable for block
    # scalars.
    if break_space:
        allow_flow_plain = allow_block_plain = allow_single_quoted = False

    # Spaces followed by breaks, as well as special character are only
    # allowed for double quoted scalars.
    if space_break or special_characters:
        allow_flow_plain = allow_block_plain =  \
        allow_single_quoted = allow_block = False

    # Although the plain scalar writer supports breaks, we never emit
    # multiline plain scalars.
    if line_breaks:
        allow_flow_plain = allow_block_plain = False

    # Flow indicators are forbidden for flow plain scalars.
    if flow_indicators:
        allow_flow_plain = False

    # Block indicators are forbidden for block plain scalars.
    if block_indicators:
        allow_block_plain = False

    return ScalarAnalysis(scalar=scalar,
            empty=False, multiline=line_breaks,
            allow_flow_plain=allow_flow_plain,
            allow_block_plain=allow_block_plain,
            allow_single_quoted=allow_single_quoted,
            allow_double_quoted=allow_double_quoted,
            allow_block=allow_block)
+
+ # Writers.
+
def flush_stream(self):
    """Flush the output stream if it supports flushing; otherwise a no-op."""
    flush = getattr(self.stream, 'flush', None)
    if flush is not None:
        flush()
+
def write_stream_start(self):
    """Begin the output stream, writing a BOM for UTF-16 encodings."""
    # Write BOM if needed.
    if self.encoding and self.encoding.startswith('utf-16'):
        self.stream.write(u'\uFEFF'.encode(self.encoding))
+
def write_stream_end(self):
    """Finish the output stream by flushing it."""
    self.flush_stream()
+
def write_indicator(self, indicator, need_whitespace,
        whitespace=False, indention=False):
    """Write a syntax indicator (e.g. '-', ':', '[', a tag or anchor).

    A separating space is inserted when *need_whitespace* is set and the
    previous character was not whitespace.  *whitespace*/*indention*
    record whether the indicator itself ends in whitespace / preserves
    the indentation state for what follows.
    """
    if self.whitespace or not need_whitespace:
        data = indicator
    else:
        data = u' '+indicator
    self.whitespace = whitespace
    self.indention = self.indention and indention
    self.column += len(data)
    self.open_ended = False
    if self.encoding:
        data = data.encode(self.encoding)
    self.stream.write(data)
+
def write_indent(self):
    """Break the line if needed and pad with spaces to the current indent."""
    indent = self.indent or 0
    if not self.indention or self.column > indent \
            or (self.column == indent and not self.whitespace):
        self.write_line_break()
    if self.column < indent:
        self.whitespace = True
        data = u' '*(indent-self.column)
        self.column = indent
        if self.encoding:
            data = data.encode(self.encoding)
        self.stream.write(data)
+
def write_line_break(self, data=None):
    """Write a line break (*data*, defaulting to the configured break)."""
    if data is None:
        data = self.best_line_break
    self.whitespace = True
    self.indention = True
    self.line += 1
    self.column = 0
    if self.encoding:
        data = data.encode(self.encoding)
    self.stream.write(data)
+
def write_version_directive(self, version_text):
    """Write a '%YAML <version>' directive line."""
    data = u'%%YAML %s' % version_text
    if self.encoding:
        data = data.encode(self.encoding)
    self.stream.write(data)
    self.write_line_break()
+
def write_tag_directive(self, handle_text, prefix_text):
    """Write a '%TAG <handle> <prefix>' directive line."""
    data = u'%%TAG %s %s' % (handle_text, prefix_text)
    if self.encoding:
        data = data.encode(self.encoding)
    self.stream.write(data)
    self.write_line_break()
+
+ # Scalar streams.
+
def write_single_quoted(self, text, split=True):
    """Write *text* as a single-quoted scalar.

    Quotes are doubled (''); runs of spaces may be folded into line
    breaks when *split* is set and the line exceeds best_width.
    """
    self.write_indicator(u'\'', True)
    spaces = False
    breaks = False
    start = end = 0
    # One pass over the text; chunks are flushed at space/break/quote
    # boundaries.  ch is None on the final, flushing iteration.
    while end <= len(text):
        ch = None
        if end < len(text):
            ch = text[end]
        if spaces:
            if ch is None or ch != u' ':
                if start+1 == end and self.column > self.best_width and split   \
                        and start != 0 and end != len(text):
                    # A single space at a fold point becomes a line break.
                    self.write_indent()
                else:
                    data = text[start:end]
                    self.column += len(data)
                    if self.encoding:
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                start = end
        elif breaks:
            if ch is None or ch not in u'\n\x85\u2028\u2029':
                if text[start] == u'\n':
                    self.write_line_break()
                for br in text[start:end]:
                    if br == u'\n':
                        self.write_line_break()
                    else:
                        self.write_line_break(br)
                self.write_indent()
                start = end
        else:
            if ch is None or ch in u' \n\x85\u2028\u2029' or ch == u'\'':
                if start < end:
                    data = text[start:end]
                    self.column += len(data)
                    if self.encoding:
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                    start = end
        if ch == u'\'':
            # Escape a quote by doubling it.
            data = u'\'\''
            self.column += 2
            if self.encoding:
                data = data.encode(self.encoding)
            self.stream.write(data)
            start = end + 1
        if ch is not None:
            spaces = (ch == u' ')
            breaks = (ch in u'\n\x85\u2028\u2029')
        end += 1
    self.write_indicator(u'\'', False)
+
# Characters with a short named escape in double-quoted scalars,
# mapped to the letter following the backslash (e.g. '\n' -> '\\n').
# Anything else unprintable falls back to \xXX / \uXXXX / \UXXXXXXXX.
ESCAPE_REPLACEMENTS = {
    u'\0':      u'0',
    u'\x07':    u'a',
    u'\x08':    u'b',
    u'\x09':    u't',
    u'\x0A':    u'n',
    u'\x0B':    u'v',
    u'\x0C':    u'f',
    u'\x0D':    u'r',
    u'\x1B':    u'e',
    u'\"':      u'\"',
    u'\\':      u'\\',
    u'\x85':    u'N',
    u'\xA0':    u'_',
    u'\u2028':  u'L',
    u'\u2029':  u'P',
}
+
def write_double_quoted(self, text, split=True):
    """Write *text* as a double-quoted scalar.

    Non-printable and disallowed characters are backslash-escaped; long
    lines may be folded with a trailing '\\' when *split* is set.
    """
    self.write_indicator(u'"', True)
    start = end = 0
    while end <= len(text):
        ch = None
        if end < len(text):
            ch = text[end]
        # Flush plain text and emit an escape whenever the character is
        # not directly representable (or at the end of the text).
        if ch is None or ch in u'"\\\x85\u2028\u2029\uFEFF' \
                or not (u'\x20' <= ch <= u'\x7E'
                    or (self.allow_unicode
                        and (u'\xA0' <= ch <= u'\uD7FF'
                            or u'\uE000' <= ch <= u'\uFFFD'))):
            if start < end:
                data = text[start:end]
                self.column += len(data)
                if self.encoding:
                    data = data.encode(self.encoding)
                self.stream.write(data)
                start = end
            if ch is not None:
                if ch in self.ESCAPE_REPLACEMENTS:
                    data = u'\\'+self.ESCAPE_REPLACEMENTS[ch]
                elif ch <= u'\xFF':
                    data = u'\\x%02X' % ord(ch)
                elif ch <= u'\uFFFF':
                    data = u'\\u%04X' % ord(ch)
                else:
                    data = u'\\U%08X' % ord(ch)
                self.column += len(data)
                if self.encoding:
                    data = data.encode(self.encoding)
                self.stream.write(data)
                start = end+1
        # Fold an over-long line with a trailing backslash; a leading
        # space on the continuation line must itself be escaped.
        if 0 < end < len(text)-1 and (ch == u' ' or start >= end)   \
                and self.column+(end-start) > self.best_width and split:
            data = text[start:end]+u'\\'
            if start < end:
                start = end
            self.column += len(data)
            if self.encoding:
                data = data.encode(self.encoding)
            self.stream.write(data)
            self.write_indent()
            self.whitespace = False
            self.indention = False
            if text[start] == u' ':
                data = u'\\'
                self.column += len(data)
                if self.encoding:
                    data = data.encode(self.encoding)
                self.stream.write(data)
        end += 1
    self.write_indicator(u'"', False)
+
def determine_block_hints(self, text):
    """Compute the header hints for a block scalar ('|' or '>').

    Returns an explicit indentation indicator when *text* begins with a
    space or break (a parser could not auto-detect the indent), plus a
    chomping indicator: '-' when the text lacks a final break, '+' when
    it ends with more than one break.
    """
    hints = u''
    if text:
        if text[0] in u' \n\x85\u2028\u2029':
            # u'%d' is portable; the Python-2-only unicode() builtin
            # raises NameError when this module runs under Python 3.
            hints += u'%d' % self.best_indent
        if text[-1] not in u'\n\x85\u2028\u2029':
            hints += u'-'
        elif len(text) == 1 or text[-2] in u'\n\x85\u2028\u2029':
            hints += u'+'
    return hints
+
def write_folded(self, text):
    """Write *text* as a folded block scalar ('>')."""
    hints = self.determine_block_hints(text)
    self.write_indicator(u'>'+hints, True)
    if hints[-1:] == u'+':
        # '+' chomping keeps trailing breaks; the document stays open.
        self.open_ended = True
    self.write_line_break()
    leading_space = True
    spaces = False
    breaks = True
    start = end = 0
    # Single pass; chunks flushed at space/break boundaries, ch is None
    # on the final, flushing iteration.
    while end <= len(text):
        ch = None
        if end < len(text):
            ch = text[end]
        if breaks:
            if ch is None or ch not in u'\n\x85\u2028\u2029':
                # An extra break separates folded lines that must not be
                # joined by a space (line starting with a space).
                if not leading_space and ch is not None and ch != u' ' \
                        and text[start] == u'\n':
                    self.write_line_break()
                leading_space = (ch == u' ')
                for br in text[start:end]:
                    if br == u'\n':
                        self.write_line_break()
                    else:
                        self.write_line_break(br)
                if ch is not None:
                    self.write_indent()
                start = end
        elif spaces:
            if ch != u' ':
                if start+1 == end and self.column > self.best_width:
                    # Fold a single space into a line break.
                    self.write_indent()
                else:
                    data = text[start:end]
                    self.column += len(data)
                    if self.encoding:
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                start = end
        else:
            if ch is None or ch in u' \n\x85\u2028\u2029':
                data = text[start:end]
                self.column += len(data)
                if self.encoding:
                    data = data.encode(self.encoding)
                self.stream.write(data)
                if ch is None:
                    self.write_line_break()
                start = end
        if ch is not None:
            breaks = (ch in u'\n\x85\u2028\u2029')
            spaces = (ch == u' ')
        end += 1
+
def write_literal(self, text):
    """Write *text* as a literal block scalar ('|'); content is verbatim."""
    hints = self.determine_block_hints(text)
    self.write_indicator(u'|'+hints, True)
    if hints[-1:] == u'+':
        # '+' chomping keeps trailing breaks; the document stays open.
        self.open_ended = True
    self.write_line_break()
    breaks = True
    start = end = 0
    while end <= len(text):
        ch = None
        if end < len(text):
            ch = text[end]
        if breaks:
            if ch is None or ch not in u'\n\x85\u2028\u2029':
                for br in text[start:end]:
                    if br == u'\n':
                        self.write_line_break()
                    else:
                        self.write_line_break(br)
                if ch is not None:
                    self.write_indent()
                start = end
        else:
            if ch is None or ch in u'\n\x85\u2028\u2029':
                data = text[start:end]
                if self.encoding:
                    data = data.encode(self.encoding)
                self.stream.write(data)
                if ch is None:
                    self.write_line_break()
                start = end
        if ch is not None:
            breaks = (ch in u'\n\x85\u2028\u2029')
        end += 1
+
def write_plain(self, text, split=True):
    """Write *text* as a plain (unquoted) scalar.

    A separating space is inserted if the previous character was not
    whitespace; runs of spaces may be folded into line breaks when
    *split* is set and the line exceeds best_width.
    """
    if self.root_context:
        # A plain scalar at the document root may require '...' before
        # the next document.
        self.open_ended = True
    if not text:
        return
    if not self.whitespace:
        data = u' '
        self.column += len(data)
        if self.encoding:
            data = data.encode(self.encoding)
        self.stream.write(data)
    self.whitespace = False
    self.indention = False
    spaces = False
    breaks = False
    start = end = 0
    while end <= len(text):
        ch = None
        if end < len(text):
            ch = text[end]
        if spaces:
            if ch != u' ':
                if start+1 == end and self.column > self.best_width and split:
                    # Fold a single space into a line break.
                    self.write_indent()
                    self.whitespace = False
                    self.indention = False
                else:
                    data = text[start:end]
                    self.column += len(data)
                    if self.encoding:
                        data = data.encode(self.encoding)
                    self.stream.write(data)
                start = end
        elif breaks:
            if ch not in u'\n\x85\u2028\u2029':
                if text[start] == u'\n':
                    self.write_line_break()
                for br in text[start:end]:
                    if br == u'\n':
                        self.write_line_break()
                    else:
                        self.write_line_break(br)
                self.write_indent()
                self.whitespace = False
                self.indention = False
                start = end
        else:
            if ch is None or ch in u' \n\x85\u2028\u2029':
                data = text[start:end]
                self.column += len(data)
                if self.encoding:
                    data = data.encode(self.encoding)
                self.stream.write(data)
                start = end
        if ch is not None:
            spaces = (ch == u' ')
            breaks = (ch in u'\n\x85\u2028\u2029')
        end += 1
+
diff --git a/paleomix/yaml/lib2/error.py b/paleomix/yaml/lib2/error.py
new file mode 100644
index 0000000..046659c
--- /dev/null
+++ b/paleomix/yaml/lib2/error.py
@@ -0,0 +1,75 @@
+
__all__ = ['Mark', 'YAMLError', 'MarkedYAMLError']

class Mark(object):
    """A position (line, column, character index) within an input buffer."""

    def __init__(self, name, index, line, column, buffer, pointer):
        self.name = name
        self.index = index
        self.line = line
        self.column = column
        self.buffer = buffer
        self.pointer = pointer

    def get_snippet(self, indent=4, max_length=75):
        """Return an excerpt around the mark with a '^' pointer line,
        or None when no buffer is attached."""
        if self.buffer is None:
            return None
        breaks = u'\0\r\n\x85\u2028\u2029'
        head = ''
        begin = self.pointer
        # Walk back to the start of the line, truncating with ' ... '
        # once half of max_length is consumed.
        while begin > 0 and self.buffer[begin-1] not in breaks:
            begin -= 1
            if self.pointer-begin > max_length//2-1:
                head = ' ... '
                begin += 5
                break
        tail = ''
        finish = self.pointer
        # Walk forward to the end of the line, truncating symmetrically.
        while finish < len(self.buffer) and self.buffer[finish] not in breaks:
            finish += 1
            if finish-self.pointer > max_length//2-1:
                tail = ' ... '
                finish -= 5
                break
        snippet = self.buffer[begin:finish].encode('utf-8')
        return ' '*indent + head + snippet + tail + '\n'  \
                + ' '*(indent+self.pointer-begin+len(head)) + '^'

    def __str__(self):
        where = " in \"%s\", line %d, column %d"   \
                % (self.name, self.line+1, self.column+1)
        snippet = self.get_snippet()
        if snippet is not None:
            where += ":\n"+snippet
        return where

class YAMLError(Exception):
    """Base class for all errors raised by this YAML package."""

class MarkedYAMLError(YAMLError):
    """A YAML error annotated with context/problem descriptions and marks."""

    def __init__(self, context=None, context_mark=None,
            problem=None, problem_mark=None, note=None):
        self.context = context
        self.context_mark = context_mark
        self.problem = problem
        self.problem_mark = problem_mark
        self.note = note

    def __str__(self):
        parts = []
        if self.context is not None:
            parts.append(self.context)
        # Show the context mark only when it adds information beyond the
        # problem mark (different file, line or column).
        redundant_mark = not (self.problem is None or self.problem_mark is None
                or self.context_mark is None
                or self.context_mark.name != self.problem_mark.name
                or self.context_mark.line != self.problem_mark.line
                or self.context_mark.column != self.problem_mark.column)
        if self.context_mark is not None and not redundant_mark:
            parts.append(str(self.context_mark))
        if self.problem is not None:
            parts.append(self.problem)
        if self.problem_mark is not None:
            parts.append(str(self.problem_mark))
        if self.note is not None:
            parts.append(self.note)
        return '\n'.join(parts)
+
diff --git a/paleomix/yaml/lib2/events.py b/paleomix/yaml/lib2/events.py
new file mode 100644
index 0000000..f79ad38
--- /dev/null
+++ b/paleomix/yaml/lib2/events.py
@@ -0,0 +1,86 @@
+
# Abstract classes.

class Event(object):
    """Base class for all parsing/emitting events."""

    def __init__(self, start_mark=None, end_mark=None):
        self.start_mark = start_mark
        self.end_mark = end_mark

    def __repr__(self):
        # Show only the core node attributes that the subclass actually
        # defines, in a fixed, predictable order.
        parts = ['%s=%r' % (name, getattr(self, name))
                 for name in ('anchor', 'tag', 'implicit', 'value')
                 if hasattr(self, name)]
        return '%s(%s)' % (self.__class__.__name__, ', '.join(parts))

class NodeEvent(Event):
    """An event that corresponds to a YAML node; carries an optional anchor."""

    def __init__(self, anchor, start_mark=None, end_mark=None):
        Event.__init__(self, start_mark, end_mark)
        self.anchor = anchor

class CollectionStartEvent(NodeEvent):
    """Start of a sequence or mapping, with tag/implicit/flow-style info."""

    def __init__(self, anchor, tag, implicit, start_mark=None, end_mark=None,
            flow_style=None):
        NodeEvent.__init__(self, anchor, start_mark, end_mark)
        self.tag = tag
        self.implicit = implicit
        self.flow_style = flow_style

class CollectionEndEvent(Event):
    """End of a sequence or mapping."""

# Implementations.

class StreamStartEvent(Event):
    """Start of the character stream; may carry the detected encoding."""

    def __init__(self, start_mark=None, end_mark=None, encoding=None):
        Event.__init__(self, start_mark, end_mark)
        self.encoding = encoding

class StreamEndEvent(Event):
    """End of the character stream."""

class DocumentStartEvent(Event):
    """Start of a document; records directives and whether '---' appeared."""

    def __init__(self, start_mark=None, end_mark=None,
            explicit=None, version=None, tags=None):
        Event.__init__(self, start_mark, end_mark)
        self.explicit = explicit
        self.version = version
        self.tags = tags

class DocumentEndEvent(Event):
    """End of a document; records whether '...' appeared."""

    def __init__(self, start_mark=None, end_mark=None,
            explicit=None):
        Event.__init__(self, start_mark, end_mark)
        self.explicit = explicit

class AliasEvent(NodeEvent):
    """A reference ('*anchor') to a previously anchored node."""

class ScalarEvent(NodeEvent):
    """A scalar value with its tag, implicit-resolution flags and style."""

    def __init__(self, anchor, tag, implicit, value,
            start_mark=None, end_mark=None, style=None):
        NodeEvent.__init__(self, anchor, start_mark, end_mark)
        self.tag = tag
        self.implicit = implicit
        self.value = value
        self.style = style

class SequenceStartEvent(CollectionStartEvent):
    """Start of a sequence."""

class SequenceEndEvent(CollectionEndEvent):
    """End of a sequence."""

class MappingStartEvent(CollectionStartEvent):
    """Start of a mapping."""

class MappingEndEvent(CollectionEndEvent):
    """End of a mapping."""
+
diff --git a/paleomix/yaml/lib2/loader.py b/paleomix/yaml/lib2/loader.py
new file mode 100644
index 0000000..293ff46
--- /dev/null
+++ b/paleomix/yaml/lib2/loader.py
@@ -0,0 +1,40 @@
+
__all__ = ['BaseLoader', 'SafeLoader', 'Loader']

# NOTE(review): implicit relative imports -- this module targets
# Python 2 only (the package directory is named lib2).
from reader import *
from scanner import *
from parser import *
from composer import *
from constructor import *
from resolver import *

class BaseLoader(Reader, Scanner, Parser, Composer, BaseConstructor, BaseResolver):
    """Loader producing only basic Python objects (no tag resolution)."""

    def __init__(self, stream):
        Reader.__init__(self, stream)
        Scanner.__init__(self)
        Parser.__init__(self)
        Composer.__init__(self)
        BaseConstructor.__init__(self)
        BaseResolver.__init__(self)

class SafeLoader(Reader, Scanner, Parser, Composer, SafeConstructor, Resolver):
    """Loader restricted to standard YAML tags; safe for untrusted input."""

    def __init__(self, stream):
        Reader.__init__(self, stream)
        Scanner.__init__(self)
        Parser.__init__(self)
        Composer.__init__(self)
        SafeConstructor.__init__(self)
        Resolver.__init__(self)

class Loader(Reader, Scanner, Parser, Composer, Constructor, Resolver):
    """Full loader; can construct arbitrary Python objects, so it must not
    be used on untrusted input."""

    def __init__(self, stream):
        Reader.__init__(self, stream)
        Scanner.__init__(self)
        Parser.__init__(self)
        Composer.__init__(self)
        Constructor.__init__(self)
        Resolver.__init__(self)
+
diff --git a/paleomix/yaml/lib2/nodes.py b/paleomix/yaml/lib2/nodes.py
new file mode 100644
index 0000000..c4f070c
--- /dev/null
+++ b/paleomix/yaml/lib2/nodes.py
@@ -0,0 +1,49 @@
+
class Node(object):
    """Base class for nodes of the representation graph."""

    def __init__(self, tag, value, start_mark, end_mark):
        self.tag = tag
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark

    def __repr__(self):
        # Commented-out value-truncation logic removed (dead code); the
        # full value is shown.
        value = repr(self.value)
        return '%s(tag=%r, value=%s)' % (self.__class__.__name__,
                                         self.tag, value)

class ScalarNode(Node):
    """A scalar node; 'style' records the preferred output style."""

    id = 'scalar'

    def __init__(self, tag, value,
            start_mark=None, end_mark=None, style=None):
        self.tag = tag
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.style = style

class CollectionNode(Node):
    """Base for sequence/mapping nodes; 'flow_style' records block vs flow."""

    def __init__(self, tag, value,
            start_mark=None, end_mark=None, flow_style=None):
        self.tag = tag
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.flow_style = flow_style

class SequenceNode(CollectionNode):
    id = 'sequence'

class MappingNode(CollectionNode):
    id = 'mapping'
+
diff --git a/paleomix/yaml/lib2/parser.py b/paleomix/yaml/lib2/parser.py
new file mode 100644
index 0000000..f9e3057
--- /dev/null
+++ b/paleomix/yaml/lib2/parser.py
@@ -0,0 +1,589 @@
+
+# The following YAML grammar is LL(1) and is parsed by a recursive descent
+# parser.
+#
+# stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
+# implicit_document ::= block_node DOCUMENT-END*
+# explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*
+# block_node_or_indentless_sequence ::=
+# ALIAS
+# | properties (block_content | indentless_block_sequence)?
+# | block_content
+# | indentless_block_sequence
+# block_node ::= ALIAS
+# | properties block_content?
+# | block_content
+# flow_node ::= ALIAS
+# | properties flow_content?
+# | flow_content
+# properties ::= TAG ANCHOR? | ANCHOR TAG?
+# block_content ::= block_collection | flow_collection | SCALAR
+# flow_content ::= flow_collection | SCALAR
+# block_collection ::= block_sequence | block_mapping
+# flow_collection ::= flow_sequence | flow_mapping
+# block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END
+# indentless_sequence ::= (BLOCK-ENTRY block_node?)+
+# block_mapping ::= BLOCK-MAPPING_START
+# ((KEY block_node_or_indentless_sequence?)?
+# (VALUE block_node_or_indentless_sequence?)?)*
+# BLOCK-END
+# flow_sequence ::= FLOW-SEQUENCE-START
+# (flow_sequence_entry FLOW-ENTRY)*
+# flow_sequence_entry?
+# FLOW-SEQUENCE-END
+# flow_sequence_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+# flow_mapping ::= FLOW-MAPPING-START
+# (flow_mapping_entry FLOW-ENTRY)*
+# flow_mapping_entry?
+# FLOW-MAPPING-END
+# flow_mapping_entry ::= flow_node | KEY flow_node? (VALUE flow_node?)?
+#
+# FIRST sets:
+#
+# stream: { STREAM-START }
+# explicit_document: { DIRECTIVE DOCUMENT-START }
+# implicit_document: FIRST(block_node)
+# block_node: { ALIAS TAG ANCHOR SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START }
+# flow_node: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START }
+# block_content: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
+# flow_content: { FLOW-SEQUENCE-START FLOW-MAPPING-START SCALAR }
+# block_collection: { BLOCK-SEQUENCE-START BLOCK-MAPPING-START }
+# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
+# block_sequence: { BLOCK-SEQUENCE-START }
+# block_mapping: { BLOCK-MAPPING-START }
+# block_node_or_indentless_sequence: { ALIAS ANCHOR TAG SCALAR BLOCK-SEQUENCE-START BLOCK-MAPPING-START FLOW-SEQUENCE-START FLOW-MAPPING-START BLOCK-ENTRY }
+# indentless_sequence: { ENTRY }
+# flow_collection: { FLOW-SEQUENCE-START FLOW-MAPPING-START }
+# flow_sequence: { FLOW-SEQUENCE-START }
+# flow_mapping: { FLOW-MAPPING-START }
+# flow_sequence_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
+# flow_mapping_entry: { ALIAS ANCHOR TAG SCALAR FLOW-SEQUENCE-START FLOW-MAPPING-START KEY }
+
+__all__ = ['Parser', 'ParserError']
+
+from error import MarkedYAMLError
+from tokens import *
+from events import *
+from scanner import *
+
class ParserError(MarkedYAMLError):
    """Raised when the token stream violates the YAML grammar."""
    pass
+
class Parser(object):
    """Recursive-descent parser producing parsing events from the token
    stream (the grammar is given in the module header comment).

    The parser is a state machine: ``self.state`` holds the bound method
    that will produce the next event, and ``self.states`` is a stack of
    states to resume once a nested construct closes.  ``self.marks``
    stacks the start marks of open collections for error messages.
    Token access (check_token / peek_token / get_token) is provided by
    the Scanner this class is mixed with.
    """
    # Since writing a recursive-descent parser is a straightforward task, we
    # do not give many comments here.

    # Tag handles that are available even without explicit %TAG directives.
    DEFAULT_TAGS = {
        u'!': u'!',
        u'!!': u'tag:yaml.org,2002:',
    }

    def __init__(self):
        self.current_event = None   # one-event lookahead buffer
        self.yaml_version = None
        self.tag_handles = {}
        self.states = []            # stack of resumption states (bound methods)
        self.marks = []             # start marks of currently open collections
        self.state = self.parse_stream_start

    def dispose(self):
        # Reset the state attributes (to clear self-references)
        self.states = []
        self.state = None

    def check_event(self, *choices):
        # Check the type of the next event.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        if self.current_event is not None:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.current_event, choice):
                    return True
        return False

    def peek_event(self):
        # Get the next event without consuming it.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        return self.current_event

    def get_event(self):
        # Get the next event and proceed further.
        if self.current_event is None:
            if self.state:
                self.current_event = self.state()
        value = self.current_event
        self.current_event = None
        return value

    # stream ::= STREAM-START implicit_document? explicit_document* STREAM-END
    # implicit_document ::= block_node DOCUMENT-END*
    # explicit_document ::= DIRECTIVE* DOCUMENT-START block_node? DOCUMENT-END*

    def parse_stream_start(self):
        """Emit StreamStartEvent for the initial stream-start token."""

        # Parse the stream start.
        token = self.get_token()
        event = StreamStartEvent(token.start_mark, token.end_mark,
                encoding=token.encoding)

        # Prepare the next state.
        self.state = self.parse_implicit_document_start

        return event

    def parse_implicit_document_start(self):
        """Emit DocumentStartEvent for a document with no '---' marker,
        or defer to parse_document_start for an explicit one."""

        # Parse an implicit document.
        if not self.check_token(DirectiveToken, DocumentStartToken,
                StreamEndToken):
            self.tag_handles = self.DEFAULT_TAGS
            token = self.peek_token()
            start_mark = end_mark = token.start_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=False)

            # Prepare the next state.
            self.states.append(self.parse_document_end)
            self.state = self.parse_block_node

            return event

        else:
            return self.parse_document_start()

    def parse_document_start(self):
        """Emit DocumentStartEvent for an explicit ('---') document, or
        StreamEndEvent once the input is exhausted."""

        # Parse any extra document end indicators.
        while self.check_token(DocumentEndToken):
            self.get_token()

        # Parse an explicit document.
        if not self.check_token(StreamEndToken):
            token = self.peek_token()
            start_mark = token.start_mark
            version, tags = self.process_directives()
            if not self.check_token(DocumentStartToken):
                raise ParserError(None, None,
                        "expected '<document start>', but found %r"
                        % self.peek_token().id,
                        self.peek_token().start_mark)
            token = self.get_token()
            end_mark = token.end_mark
            event = DocumentStartEvent(start_mark, end_mark,
                    explicit=True, version=version, tags=tags)
            self.states.append(self.parse_document_end)
            self.state = self.parse_document_content
        else:
            # Parse the end of the stream.
            token = self.get_token()
            event = StreamEndEvent(token.start_mark, token.end_mark)
            assert not self.states
            assert not self.marks
            self.state = None
        return event

    def parse_document_end(self):
        """Emit DocumentEndEvent; ``explicit`` is True only when a '...'
        token is actually present."""

        # Parse the document end.
        token = self.peek_token()
        start_mark = end_mark = token.start_mark
        explicit = False
        if self.check_token(DocumentEndToken):
            token = self.get_token()
            end_mark = token.end_mark
            explicit = True
        event = DocumentEndEvent(start_mark, end_mark,
                explicit=explicit)

        # Prepare the next state.
        self.state = self.parse_document_start

        return event

    def parse_document_content(self):
        # An empty document body is represented as an empty scalar event.
        if self.check_token(DirectiveToken,
                DocumentStartToken, DocumentEndToken, StreamEndToken):
            event = self.process_empty_scalar(self.peek_token().start_mark)
            self.state = self.states.pop()
            return event
        else:
            return self.parse_block_node()

    def process_directives(self):
        """Consume %YAML/%TAG directive tokens.

        Returns a (version, tags) pair; ``tags`` is None when no %TAG
        directive was seen.  Raises ParserError on duplicate directives
        or an unsupported major version.
        """
        self.yaml_version = None
        self.tag_handles = {}
        while self.check_token(DirectiveToken):
            token = self.get_token()
            if token.name == u'YAML':
                if self.yaml_version is not None:
                    raise ParserError(None, None,
                            "found duplicate YAML directive", token.start_mark)
                major, minor = token.value
                if major != 1:
                    raise ParserError(None, None,
                            "found incompatible YAML document (version 1.* is required)",
                            token.start_mark)
                self.yaml_version = token.value
            elif token.name == u'TAG':
                handle, prefix = token.value
                if handle in self.tag_handles:
                    raise ParserError(None, None,
                            "duplicate tag handle %r" % handle.encode('utf-8'),
                            token.start_mark)
                self.tag_handles[handle] = prefix
        if self.tag_handles:
            value = self.yaml_version, self.tag_handles.copy()
        else:
            value = self.yaml_version, None
        # The default handles are always available, but are not reported
        # in the returned ``tags`` mapping.
        for key in self.DEFAULT_TAGS:
            if key not in self.tag_handles:
                self.tag_handles[key] = self.DEFAULT_TAGS[key]
        return value

    # block_node_or_indentless_sequence ::= ALIAS
    #               | properties (block_content | indentless_block_sequence)?
    #               | block_content
    #               | indentless_block_sequence
    # block_node    ::= ALIAS
    #                   | properties block_content?
    #                   | block_content
    # flow_node     ::= ALIAS
    #                   | properties flow_content?
    #                   | flow_content
    # properties    ::= TAG ANCHOR? | ANCHOR TAG?
    # block_content     ::= block_collection | flow_collection | SCALAR
    # flow_content      ::= flow_collection | SCALAR
    # block_collection  ::= block_sequence | block_mapping
    # flow_collection   ::= flow_sequence | flow_mapping

    def parse_block_node(self):
        return self.parse_node(block=True)

    def parse_flow_node(self):
        return self.parse_node()

    def parse_block_node_or_indentless_sequence(self):
        return self.parse_node(block=True, indentless_sequence=True)

    def parse_node(self, block=False, indentless_sequence=False):
        """Parse a single node: an alias, or optional anchor/tag
        properties followed by scalar or collection content.

        ``block`` permits block collections; ``indentless_sequence``
        additionally permits a sequence introduced directly by '- '
        entries without a preceding indentation increase.
        """
        if self.check_token(AliasToken):
            token = self.get_token()
            event = AliasEvent(token.value, token.start_mark, token.end_mark)
            self.state = self.states.pop()
        else:
            anchor = None
            tag = None
            start_mark = end_mark = tag_mark = None
            # Properties may appear in either order: ANCHOR TAG? or TAG ANCHOR?.
            if self.check_token(AnchorToken):
                token = self.get_token()
                start_mark = token.start_mark
                end_mark = token.end_mark
                anchor = token.value
                if self.check_token(TagToken):
                    token = self.get_token()
                    tag_mark = token.start_mark
                    end_mark = token.end_mark
                    tag = token.value
            elif self.check_token(TagToken):
                token = self.get_token()
                start_mark = tag_mark = token.start_mark
                end_mark = token.end_mark
                tag = token.value
                if self.check_token(AnchorToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    anchor = token.value
            if tag is not None:
                # Resolve the (handle, suffix) pair against declared handles.
                handle, suffix = tag
                if handle is not None:
                    if handle not in self.tag_handles:
                        raise ParserError("while parsing a node", start_mark,
                                "found undefined tag handle %r" % handle.encode('utf-8'),
                                tag_mark)
                    tag = self.tag_handles[handle]+suffix
                else:
                    tag = suffix
            #if tag == u'!':
            #    raise ParserError("while parsing a node", start_mark,
            #            "found non-specific tag '!'", tag_mark,
            #            "Please check 'http://pyyaml.org/wiki/YAMLNonSpecificTag' and share your opinion.")
            if start_mark is None:
                start_mark = end_mark = self.peek_token().start_mark
            event = None
            implicit = (tag is None or tag == u'!')
            if indentless_sequence and self.check_token(BlockEntryToken):
                end_mark = self.peek_token().end_mark
                event = SequenceStartEvent(anchor, tag, implicit,
                        start_mark, end_mark)
                self.state = self.parse_indentless_sequence_entry
            else:
                if self.check_token(ScalarToken):
                    token = self.get_token()
                    end_mark = token.end_mark
                    # ``implicit`` is a pair: (resolvable as plain,
                    # resolvable as non-plain).
                    if (token.plain and tag is None) or tag == u'!':
                        implicit = (True, False)
                    elif tag is None:
                        implicit = (False, True)
                    else:
                        implicit = (False, False)
                    event = ScalarEvent(anchor, tag, implicit, token.value,
                            start_mark, end_mark, style=token.style)
                    self.state = self.states.pop()
                elif self.check_token(FlowSequenceStartToken):
                    end_mark = self.peek_token().end_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_sequence_first_entry
                elif self.check_token(FlowMappingStartToken):
                    end_mark = self.peek_token().end_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=True)
                    self.state = self.parse_flow_mapping_first_key
                elif block and self.check_token(BlockSequenceStartToken):
                    end_mark = self.peek_token().start_mark
                    event = SequenceStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_sequence_first_entry
                elif block and self.check_token(BlockMappingStartToken):
                    end_mark = self.peek_token().start_mark
                    event = MappingStartEvent(anchor, tag, implicit,
                            start_mark, end_mark, flow_style=False)
                    self.state = self.parse_block_mapping_first_key
                elif anchor is not None or tag is not None:
                    # Empty scalars are allowed even if a tag or an anchor is
                    # specified.
                    event = ScalarEvent(anchor, tag, (implicit, False), u'',
                            start_mark, end_mark)
                    self.state = self.states.pop()
                else:
                    if block:
                        node = 'block'
                    else:
                        node = 'flow'
                    token = self.peek_token()
                    raise ParserError("while parsing a %s node" % node, start_mark,
                            "expected the node content, but found %r" % token.id,
                            token.start_mark)
        return event

    # block_sequence ::= BLOCK-SEQUENCE-START (BLOCK-ENTRY block_node?)* BLOCK-END

    def parse_block_sequence_first_entry(self):
        # Consume BLOCK-SEQUENCE-START and remember its mark for errors.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_sequence_entry()

    def parse_block_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken, BlockEndToken):
                self.states.append(self.parse_block_sequence_entry)
                return self.parse_block_node()
            else:
                # '- ' with no content: an empty scalar entry.
                self.state = self.parse_block_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block collection", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    # indentless_sequence ::= (BLOCK-ENTRY block_node?)+

    def parse_indentless_sequence_entry(self):
        if self.check_token(BlockEntryToken):
            token = self.get_token()
            if not self.check_token(BlockEntryToken,
                    KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_indentless_sequence_entry)
                return self.parse_block_node()
            else:
                self.state = self.parse_indentless_sequence_entry
                return self.process_empty_scalar(token.end_mark)
        # No BLOCK-END for indentless sequences; end at the next token.
        token = self.peek_token()
        event = SequenceEndEvent(token.start_mark, token.start_mark)
        self.state = self.states.pop()
        return event

    # block_mapping     ::= BLOCK-MAPPING_START
    #                       ((KEY block_node_or_indentless_sequence?)?
    #                       (VALUE block_node_or_indentless_sequence?)?)*
    #                       BLOCK-END

    def parse_block_mapping_first_key(self):
        # Consume BLOCK-MAPPING-START and remember its mark for errors.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_block_mapping_key()

    def parse_block_mapping_key(self):
        if self.check_token(KeyToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_value)
                return self.parse_block_node_or_indentless_sequence()
            else:
                # '? ' with no key content: an empty scalar key.
                self.state = self.parse_block_mapping_value
                return self.process_empty_scalar(token.end_mark)
        if not self.check_token(BlockEndToken):
            token = self.peek_token()
            raise ParserError("while parsing a block mapping", self.marks[-1],
                    "expected <block end>, but found %r" % token.id, token.start_mark)
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_block_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(KeyToken, ValueToken, BlockEndToken):
                self.states.append(self.parse_block_mapping_key)
                return self.parse_block_node_or_indentless_sequence()
            else:
                self.state = self.parse_block_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            # Key with no ': value': the value is an empty scalar.
            self.state = self.parse_block_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    # flow_sequence     ::= FLOW-SEQUENCE-START
    #                       (flow_sequence_entry FLOW-ENTRY)*
    #                       flow_sequence_entry?
    #                       FLOW-SEQUENCE-END
    # flow_sequence_entry   ::= flow_node | KEY flow_node? (VALUE flow_node?)?
    #
    # Note that while production rules for both flow_sequence_entry and
    # flow_mapping_entry are equal, their interpretations are different.
    # For `flow_sequence_entry`, the part `KEY flow_node? (VALUE flow_node?)?`
    # generates an inline mapping (set syntax).

    def parse_flow_sequence_first_entry(self):
        # Consume FLOW-SEQUENCE-START and remember its mark for errors.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_sequence_entry(first=True)

    def parse_flow_sequence_entry(self, first=False):
        if not self.check_token(FlowSequenceEndToken):
            if not first:
                # Entries after the first must be preceded by ','.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow sequence", self.marks[-1],
                            "expected ',' or ']', but got %r" % token.id, token.start_mark)

            if self.check_token(KeyToken):
                # 'key: value' inside a flow sequence produces an inline
                # single-pair mapping.
                token = self.peek_token()
                event = MappingStartEvent(None, None, True,
                        token.start_mark, token.end_mark,
                        flow_style=True)
                self.state = self.parse_flow_sequence_entry_mapping_key
                return event
            elif not self.check_token(FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry)
                return self.parse_flow_node()
        token = self.get_token()
        event = SequenceEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_sequence_entry_mapping_key(self):
        token = self.get_token()
        if not self.check_token(ValueToken,
                FlowEntryToken, FlowSequenceEndToken):
            self.states.append(self.parse_flow_sequence_entry_mapping_value)
            return self.parse_flow_node()
        else:
            self.state = self.parse_flow_sequence_entry_mapping_value
            return self.process_empty_scalar(token.end_mark)

    def parse_flow_sequence_entry_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowSequenceEndToken):
                self.states.append(self.parse_flow_sequence_entry_mapping_end)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_sequence_entry_mapping_end
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_sequence_entry_mapping_end
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_sequence_entry_mapping_end(self):
        # Close the inline single-pair mapping without consuming a token.
        self.state = self.parse_flow_sequence_entry
        token = self.peek_token()
        return MappingEndEvent(token.start_mark, token.start_mark)

    # flow_mapping  ::= FLOW-MAPPING-START
    #                   (flow_mapping_entry FLOW-ENTRY)*
    #                   flow_mapping_entry?
    #                   FLOW-MAPPING-END
    # flow_mapping_entry    ::= flow_node | KEY flow_node? (VALUE flow_node?)?

    def parse_flow_mapping_first_key(self):
        # Consume FLOW-MAPPING-START and remember its mark for errors.
        token = self.get_token()
        self.marks.append(token.start_mark)
        return self.parse_flow_mapping_key(first=True)

    def parse_flow_mapping_key(self, first=False):
        if not self.check_token(FlowMappingEndToken):
            if not first:
                # Entries after the first must be preceded by ','.
                if self.check_token(FlowEntryToken):
                    self.get_token()
                else:
                    token = self.peek_token()
                    raise ParserError("while parsing a flow mapping", self.marks[-1],
                            "expected ',' or '}', but got %r" % token.id, token.start_mark)
            if self.check_token(KeyToken):
                token = self.get_token()
                if not self.check_token(ValueToken,
                        FlowEntryToken, FlowMappingEndToken):
                    self.states.append(self.parse_flow_mapping_value)
                    return self.parse_flow_node()
                else:
                    self.state = self.parse_flow_mapping_value
                    return self.process_empty_scalar(token.end_mark)
            elif not self.check_token(FlowMappingEndToken):
                # A bare node in key position: its value is an empty scalar.
                self.states.append(self.parse_flow_mapping_empty_value)
                return self.parse_flow_node()
        token = self.get_token()
        event = MappingEndEvent(token.start_mark, token.end_mark)
        self.state = self.states.pop()
        self.marks.pop()
        return event

    def parse_flow_mapping_value(self):
        if self.check_token(ValueToken):
            token = self.get_token()
            if not self.check_token(FlowEntryToken, FlowMappingEndToken):
                self.states.append(self.parse_flow_mapping_key)
                return self.parse_flow_node()
            else:
                self.state = self.parse_flow_mapping_key
                return self.process_empty_scalar(token.end_mark)
        else:
            self.state = self.parse_flow_mapping_key
            token = self.peek_token()
            return self.process_empty_scalar(token.start_mark)

    def parse_flow_mapping_empty_value(self):
        self.state = self.parse_flow_mapping_key
        return self.process_empty_scalar(self.peek_token().start_mark)

    def process_empty_scalar(self, mark):
        # Synthesize an empty plain-scalar event at ``mark``.
        return ScalarEvent(None, None, (True, False), u'', mark, mark)
+
diff --git a/paleomix/yaml/lib2/reader.py b/paleomix/yaml/lib2/reader.py
new file mode 100644
index 0000000..3249e6b
--- /dev/null
+++ b/paleomix/yaml/lib2/reader.py
@@ -0,0 +1,190 @@
+# This module contains abstractions for the input stream. You don't have to
+# look further; there is no pretty code here.
+#
+# We define two classes here.
+#
+# Mark(source, line, column)
+# It's just a record and its only use is producing nice error messages.
+# Parser does not use it for any other purposes.
+#
+# Reader(source, data)
+# Reader determines the encoding of `data` and converts it to unicode.
+# Reader provides the following methods and attributes:
+# reader.peek(length=1) - return the next `length` characters
+# reader.forward(length=1) - move the current position by `length` characters.
+# reader.index - the number of the current character.
+# reader.line, stream.column - the line and the column of the current character.
+
+__all__ = ['Reader', 'ReaderError']
+
+from error import YAMLError, Mark
+
+import codecs, re
+
class ReaderError(YAMLError):
    """Raised when the input cannot be decoded, or when it contains
    characters that are not printable in YAML."""

    def __init__(self, name, position, character, encoding, reason):
        self.name = name
        self.character = character
        self.position = position
        self.encoding = encoding
        self.reason = reason

    def __str__(self):
        if isinstance(self.character, str):
            # A raw byte string that the codec failed to decode.
            return (
                "'%s' codec can't decode byte #x%02x: %s\n"
                "  in \"%s\", position %d"
                % (self.encoding, ord(self.character), self.reason,
                   self.name, self.position))
        # An already-decoded code point outside the printable range.
        return (
            "unacceptable character #x%04x: %s\n"
            "  in \"%s\", position %d"
            % (self.character, self.reason,
               self.name, self.position))
+
class Reader(object):
    # Reader:
    # - determines the data encoding and converts it to unicode,
    # - checks if characters are in allowed range,
    # - adds '\0' to the end.

    # Reader accepts
    #  - a `str` object,
    #  - a `unicode` object,
    #  - a file-like object with its `read` method returning `str`,
    #  - a file-like object with its `read` method returning `unicode`.

    # Yeah, it's ugly and slow.

    # NOTE: this is Python 2 code (`unicode` builtin, `except E, exc`
    # syntax); it will not compile under Python 3.

    def __init__(self, stream):
        """Wrap *stream* (unicode, byte str, or file-like object) and
        prime the decoded buffer."""
        self.name = None
        self.stream = None
        self.stream_pointer = 0   # bytes consumed from the raw stream
        self.eof = True
        self.buffer = u''         # decoded characters, '\0'-terminated at EOF
        self.pointer = 0          # current position within ``buffer``
        self.raw_buffer = None    # undecoded bytes read so far
        self.raw_decode = None    # incremental codec decode function
        self.encoding = None
        self.index = 0            # absolute character index
        self.line = 0
        self.column = 0
        if isinstance(stream, unicode):
            self.name = "<unicode string>"
            self.check_printable(stream)
            self.buffer = stream+u'\0'
        elif isinstance(stream, str):
            self.name = "<string>"
            self.raw_buffer = stream
            self.determine_encoding()
        else:
            self.stream = stream
            self.name = getattr(stream, 'name', "<file>")
            self.eof = False
            self.raw_buffer = ''
            self.determine_encoding()

    def peek(self, index=0):
        """Return the character ``index`` positions ahead without
        consuming it; decodes more input on demand."""
        try:
            return self.buffer[self.pointer+index]
        except IndexError:
            self.update(index+1)
            return self.buffer[self.pointer+index]

    def prefix(self, length=1):
        """Return (without consuming) the next ``length`` characters."""
        if self.pointer+length >= len(self.buffer):
            self.update(length)
        return self.buffer[self.pointer:self.pointer+length]

    def forward(self, length=1):
        """Advance the position by ``length`` characters, maintaining the
        line/column counters."""
        if self.pointer+length+1 >= len(self.buffer):
            self.update(length+1)
        while length:
            ch = self.buffer[self.pointer]
            self.pointer += 1
            self.index += 1
            # Any YAML line break starts a new line; a lone '\r' (not part
            # of '\r\n') also counts.
            if ch in u'\n\x85\u2028\u2029'  \
                    or (ch == u'\r' and self.buffer[self.pointer] != u'\n'):
                self.line += 1
                self.column = 0
            elif ch != u'\uFEFF':
                self.column += 1
            length -= 1

    def get_mark(self):
        """Return a Mark for the current position (with a snippet buffer
        only when reading from an in-memory string)."""
        if self.stream is None:
            return Mark(self.name, self.index, self.line, self.column,
                    self.buffer, self.pointer)
        else:
            return Mark(self.name, self.index, self.line, self.column,
                    None, None)

    def determine_encoding(self):
        # Detect UTF-16 via BOM; everything else is treated as UTF-8.
        while not self.eof and len(self.raw_buffer) < 2:
            self.update_raw()
        if not isinstance(self.raw_buffer, unicode):
            if self.raw_buffer.startswith(codecs.BOM_UTF16_LE):
                self.raw_decode = codecs.utf_16_le_decode
                self.encoding = 'utf-16-le'
            elif self.raw_buffer.startswith(codecs.BOM_UTF16_BE):
                self.raw_decode = codecs.utf_16_be_decode
                self.encoding = 'utf-16-be'
            else:
                self.raw_decode = codecs.utf_8_decode
                self.encoding = 'utf-8'
        self.update(1)

    # Characters allowed in a YAML stream; anything else is rejected.
    NON_PRINTABLE = re.compile(u'[^\x09\x0A\x0D\x20-\x7E\x85\xA0-\uD7FF\uE000-\uFFFD]')
    def check_printable(self, data):
        match = self.NON_PRINTABLE.search(data)
        if match:
            character = match.group()
            position = self.index+(len(self.buffer)-self.pointer)+match.start()
            raise ReaderError(self.name, position, ord(character),
                    'unicode', "special characters are not allowed")

    def update(self, length):
        """Decode raw input until at least ``length`` characters are
        available past the current pointer (or EOF)."""
        if self.raw_buffer is None:
            return
        # Drop already-consumed characters before growing the buffer.
        self.buffer = self.buffer[self.pointer:]
        self.pointer = 0
        while len(self.buffer) < length:
            if not self.eof:
                self.update_raw()
            if self.raw_decode is not None:
                try:
                    data, converted = self.raw_decode(self.raw_buffer,
                            'strict', self.eof)
                except UnicodeDecodeError, exc:
                    character = exc.object[exc.start]
                    if self.stream is not None:
                        position = self.stream_pointer-len(self.raw_buffer)+exc.start
                    else:
                        position = exc.start
                    raise ReaderError(self.name, position, character,
                            exc.encoding, exc.reason)
            else:
                data = self.raw_buffer
                converted = len(data)
            self.check_printable(data)
            self.buffer += data
            self.raw_buffer = self.raw_buffer[converted:]
            if self.eof:
                self.buffer += u'\0'
                self.raw_buffer = None
                break

    def update_raw(self, size=1024):
        # Read one more chunk of raw bytes; empty read means EOF.
        data = self.stream.read(size)
        if data:
            self.raw_buffer += data
            self.stream_pointer += len(data)
        else:
            self.eof = True
+
+#try:
+# import psyco
+# psyco.bind(Reader)
+#except ImportError:
+# pass
+
diff --git a/paleomix/yaml/lib2/representer.py b/paleomix/yaml/lib2/representer.py
new file mode 100644
index 0000000..5f4fc70
--- /dev/null
+++ b/paleomix/yaml/lib2/representer.py
@@ -0,0 +1,484 @@
+
+__all__ = ['BaseRepresenter', 'SafeRepresenter', 'Representer',
+ 'RepresenterError']
+
+from error import *
+from nodes import *
+
+import datetime
+
+import sys, copy_reg, types
+
class RepresenterError(YAMLError):
    """Raised when an object cannot be represented as a YAML node."""
    pass
+
class BaseRepresenter(object):
    """Converts native Python objects into YAML nodes.

    Representer functions are registered per type in two class-level
    registries: ``yaml_representers`` (exact-type dispatch) and
    ``yaml_multi_representers`` (matched against the MRO).  The
    registries are copy-on-write: add_representer copies the inherited
    dict before mutating so subclasses do not affect their parents.
    """

    yaml_representers = {}
    yaml_multi_representers = {}

    def __init__(self, default_style=None, default_flow_style=None):
        self.default_style = default_style
        self.default_flow_style = default_flow_style
        self.represented_objects = {}   # id(obj) -> node, for aliasing
        self.object_keeper = []         # keeps objects alive so ids stay unique
        self.alias_key = None

    def represent(self, data):
        """Represent *data* as a node and serialize it (``serialize`` is
        provided by the serializer this class is mixed with)."""
        node = self.represent_data(data)
        self.serialize(node)
        # Reset per-document aliasing state.
        self.represented_objects = {}
        self.object_keeper = []
        self.alias_key = None

    def get_classobj_bases(self, cls):
        # Flatten the base-class graph of an old-style (Python 2) class.
        bases = [cls]
        for base in cls.__bases__:
            bases.extend(self.get_classobj_bases(base))
        return bases

    def represent_data(self, data):
        """Dispatch *data* to the appropriate representer and return the
        resulting node, reusing nodes for already-seen aliasable objects."""
        if self.ignore_aliases(data):
            self.alias_key = None
        else:
            self.alias_key = id(data)
        if self.alias_key is not None:
            if self.alias_key in self.represented_objects:
                node = self.represented_objects[self.alias_key]
                #if node is None:
                #    raise RepresenterError("recursive objects are not allowed: %r" % data)
                return node
            #self.represented_objects[alias_key] = None
            self.object_keeper.append(data)
        data_types = type(data).__mro__
        # Old-style instances (Python 2 only) need their class bases
        # prepended, since their type is always InstanceType.
        if type(data) is types.InstanceType:
            data_types = self.get_classobj_bases(data.__class__)+list(data_types)
        if data_types[0] in self.yaml_representers:
            node = self.yaml_representers[data_types[0]](self, data)
        else:
            for data_type in data_types:
                if data_type in self.yaml_multi_representers:
                    node = self.yaml_multi_representers[data_type](self, data)
                    break
            else:
                # Fall back to the None-keyed catch-all representer, or a
                # bare scalar of the object's string form.
                if None in self.yaml_multi_representers:
                    node = self.yaml_multi_representers[None](self, data)
                elif None in self.yaml_representers:
                    node = self.yaml_representers[None](self, data)
                else:
                    node = ScalarNode(None, unicode(data))
        #if alias_key is not None:
        #    self.represented_objects[alias_key] = node
        return node

    def add_representer(cls, data_type, representer):
        """Register *representer* for exact type *data_type* (class method;
        copy-on-write so parent classes are unaffected)."""
        if not 'yaml_representers' in cls.__dict__:
            cls.yaml_representers = cls.yaml_representers.copy()
        cls.yaml_representers[data_type] = representer
    add_representer = classmethod(add_representer)

    def add_multi_representer(cls, data_type, representer):
        """Register *representer* for *data_type* and its subclasses."""
        if not 'yaml_multi_representers' in cls.__dict__:
            cls.yaml_multi_representers = cls.yaml_multi_representers.copy()
        cls.yaml_multi_representers[data_type] = representer
    add_multi_representer = classmethod(add_multi_representer)

    def represent_scalar(self, tag, value, style=None):
        if style is None:
            style = self.default_style
        node = ScalarNode(tag, value, style=style)
        if self.alias_key is not None:
            self.represented_objects[self.alias_key] = node
        return node

    def represent_sequence(self, tag, sequence, flow_style=None):
        value = []
        node = SequenceNode(tag, value, flow_style=flow_style)
        # Register the node before recursing so self-references alias back.
        if self.alias_key is not None:
            self.represented_objects[self.alias_key] = node
        best_style = True
        for item in sequence:
            node_item = self.represent_data(item)
            # Flow style is only "best" when every item is a plain scalar.
            if not (isinstance(node_item, ScalarNode) and not node_item.style):
                best_style = False
            value.append(node_item)
        if flow_style is None:
            if self.default_flow_style is not None:
                node.flow_style = self.default_flow_style
            else:
                node.flow_style = best_style
        return node

    def represent_mapping(self, tag, mapping, flow_style=None):
        value = []
        node = MappingNode(tag, value, flow_style=flow_style)
        # Register the node before recursing so self-references alias back.
        if self.alias_key is not None:
            self.represented_objects[self.alias_key] = node
        best_style = True
        if hasattr(mapping, 'items'):
            # Sort items for deterministic output.
            mapping = mapping.items()
            mapping.sort()
        for item_key, item_value in mapping:
            node_key = self.represent_data(item_key)
            node_value = self.represent_data(item_value)
            if not (isinstance(node_key, ScalarNode) and not node_key.style):
                best_style = False
            if not (isinstance(node_value, ScalarNode) and not node_value.style):
                best_style = False
            value.append((node_key, node_value))
        if flow_style is None:
            if self.default_flow_style is not None:
                node.flow_style = self.default_flow_style
            else:
                node.flow_style = best_style
        return node

    def ignore_aliases(self, data):
        # Subclasses return True for values that should never be anchored.
        return False
+
class SafeRepresenter(BaseRepresenter):
    """Representer for the standard, safe subset of Python types."""

    def ignore_aliases(self, data):
        # Immutable primitives never need anchors/aliases.
        if data in [None, ()]:
            return True
        if isinstance(data, (str, unicode, bool, int, float)):
            return True

    def represent_none(self, data):
        return self.represent_scalar(u'tag:yaml.org,2002:null',
                u'null')

    def represent_str(self, data):
        """Represent a byte string: as !!str if it decodes as ASCII or
        UTF-8, otherwise as base64-encoded !!binary."""
        tag = None
        style = None
        try:
            data = unicode(data, 'ascii')
            tag = u'tag:yaml.org,2002:str'
        except UnicodeDecodeError:
            try:
                data = unicode(data, 'utf-8')
                tag = u'tag:yaml.org,2002:str'
            except UnicodeDecodeError:
                data = data.encode('base64')
                tag = u'tag:yaml.org,2002:binary'
                style = '|'
        return self.represent_scalar(tag, data, style=style)

    def represent_unicode(self, data):
        return self.represent_scalar(u'tag:yaml.org,2002:str', data)

    def represent_bool(self, data):
        if data:
            value = u'true'
        else:
            value = u'false'
        return self.represent_scalar(u'tag:yaml.org,2002:bool', value)

    def represent_int(self, data):
        return self.represent_scalar(u'tag:yaml.org,2002:int', unicode(data))

    def represent_long(self, data):
        return self.represent_scalar(u'tag:yaml.org,2002:int', unicode(data))

    # Compute positive infinity by squaring until repr stabilizes
    # (portable across platforms without math.inf).
    inf_value = 1e300
    while repr(inf_value) != repr(inf_value*inf_value):
        inf_value *= inf_value

    def represent_float(self, data):
        # ``data != data`` detects NaN; the second clause presumably
        # guards platforms where NaN compares equal to everything.
        if data != data or (data == 0.0 and data == 1.0):
            value = u'.nan'
        elif data == self.inf_value:
            value = u'.inf'
        elif data == -self.inf_value:
            value = u'-.inf'
        else:
            value = unicode(repr(data)).lower()
            # Note that in some cases `repr(data)` represents a float number
            # without the decimal parts.  For instance:
            #   >>> repr(1e17)
            #   '1e17'
            # Unfortunately, this is not a valid float representation according
            # to the definition of the `!!float` tag.  We fix this by adding
            # '.0' before the 'e' symbol.
            if u'.' not in value and u'e' in value:
                value = value.replace(u'e', u'.0e', 1)
        return self.represent_scalar(u'tag:yaml.org,2002:float', value)

    def represent_list(self, data):
        #pairs = (len(data) > 0 and isinstance(data, list))
        #if pairs:
        #    for item in data:
        #        if not isinstance(item, tuple) or len(item) != 2:
        #            pairs = False
        #            break
        #if not pairs:
            return self.represent_sequence(u'tag:yaml.org,2002:seq', data)
        #value = []
        #for item_key, item_value in data:
        #    value.append(self.represent_mapping(u'tag:yaml.org,2002:map',
        #        [(item_key, item_value)]))
        #return SequenceNode(u'tag:yaml.org,2002:pairs', value)

    def represent_dict(self, data):
        return self.represent_mapping(u'tag:yaml.org,2002:map', data)

    def represent_set(self, data):
        # A set is represented as a !!set mapping with null values.
        value = {}
        for key in data:
            value[key] = None
        return self.represent_mapping(u'tag:yaml.org,2002:set', value)

    def represent_date(self, data):
        value = unicode(data.isoformat())
        return self.represent_scalar(u'tag:yaml.org,2002:timestamp', value)

    def represent_datetime(self, data):
        value = unicode(data.isoformat(' '))
        return self.represent_scalar(u'tag:yaml.org,2002:timestamp', value)

    def represent_yaml_object(self, tag, data, cls, flow_style=None):
        # Represent an object's state (via __getstate__ or __dict__) as a
        # mapping under the given tag.
        if hasattr(data, '__getstate__'):
            state = data.__getstate__()
        else:
            state = data.__dict__.copy()
        return self.represent_mapping(tag, state, flow_style=flow_style)

    def represent_undefined(self, data):
        raise RepresenterError("cannot represent an object: %s" % data)
+
# Register a representer for each safe Python type.  The final entry,
# keyed on None, installs represent_undefined as the fallback for any
# type without its own representer.
SafeRepresenter.add_representer(type(None),
        SafeRepresenter.represent_none)

SafeRepresenter.add_representer(str,
        SafeRepresenter.represent_str)

SafeRepresenter.add_representer(unicode,
        SafeRepresenter.represent_unicode)

SafeRepresenter.add_representer(bool,
        SafeRepresenter.represent_bool)

SafeRepresenter.add_representer(int,
        SafeRepresenter.represent_int)

SafeRepresenter.add_representer(long,
        SafeRepresenter.represent_long)

SafeRepresenter.add_representer(float,
        SafeRepresenter.represent_float)

SafeRepresenter.add_representer(list,
        SafeRepresenter.represent_list)

# Tuples are flattened to plain !!seq in the safe schema.
SafeRepresenter.add_representer(tuple,
        SafeRepresenter.represent_list)

SafeRepresenter.add_representer(dict,
        SafeRepresenter.represent_dict)

SafeRepresenter.add_representer(set,
        SafeRepresenter.represent_set)

SafeRepresenter.add_representer(datetime.date,
        SafeRepresenter.represent_date)

SafeRepresenter.add_representer(datetime.datetime,
        SafeRepresenter.represent_datetime)

SafeRepresenter.add_representer(None,
        SafeRepresenter.represent_undefined)
+
class Representer(SafeRepresenter):
    """Full (non-safe) representer.

    Extends SafeRepresenter with Python-specific tags
    (``!!python/str``, ``!!python/long``, ``!!python/complex``,
    ``!!python/tuple``, ``!!python/name``, ``!!python/module``,
    ``!!python/object...``) so that arbitrary Python objects round-trip.
    """

    def represent_str(self, data):
        # Non-ASCII byte strings get the !!python/str tag so the loader
        # can restore a byte string rather than unicode; undecodable data
        # falls back to base64 !!binary.
        tag = None
        style = None
        try:
            data = unicode(data, 'ascii')
            tag = u'tag:yaml.org,2002:str'
        except UnicodeDecodeError:
            try:
                data = unicode(data, 'utf-8')
                tag = u'tag:yaml.org,2002:python/str'
            except UnicodeDecodeError:
                data = data.encode('base64')
                tag = u'tag:yaml.org,2002:binary'
                style = '|'
        return self.represent_scalar(tag, data, style=style)

    def represent_unicode(self, data):
        # ASCII-only unicode is tagged !!python/unicode (so it loads back
        # as unicode, not str); otherwise plain !!str suffices.
        tag = None
        try:
            data.encode('ascii')
            tag = u'tag:yaml.org,2002:python/unicode'
        except UnicodeEncodeError:
            tag = u'tag:yaml.org,2002:str'
        return self.represent_scalar(tag, data)

    def represent_long(self, data):
        # Use !!python/long only when the value does not fit in an int.
        tag = u'tag:yaml.org,2002:int'
        if int(data) is not data:
            tag = u'tag:yaml.org,2002:python/long'
        return self.represent_scalar(tag, unicode(data))

    def represent_complex(self, data):
        # Emit the shortest textual form: real only, imaginary only, or
        # a signed a+bj / a-bj combination.
        if data.imag == 0.0:
            data = u'%r' % data.real
        elif data.real == 0.0:
            data = u'%rj' % data.imag
        elif data.imag > 0:
            data = u'%r+%rj' % (data.real, data.imag)
        else:
            data = u'%r%rj' % (data.real, data.imag)
        return self.represent_scalar(u'tag:yaml.org,2002:python/complex', data)

    def represent_tuple(self, data):
        return self.represent_sequence(u'tag:yaml.org,2002:python/tuple', data)

    def represent_name(self, data):
        # Classes and functions are stored by dotted import path only.
        name = u'%s.%s' % (data.__module__, data.__name__)
        return self.represent_scalar(u'tag:yaml.org,2002:python/name:'+name, u'')

    def represent_module(self, data):
        return self.represent_scalar(
                u'tag:yaml.org,2002:python/module:'+data.__name__, u'')

    def represent_instance(self, data):
        # For instances of classic classes, we use __getinitargs__ and
        # __getstate__ to serialize the data.

        # If data.__getinitargs__ exists, the object must be reconstructed
        # by calling cls(**args), where args is a tuple returned by
        # __getinitargs__.  Otherwise, the cls.__init__ method should never
        # be called and the class instance is created by instantiating a
        # trivial class and assigning to the instance's __class__ variable.

        # If data.__getstate__ exists, it returns the state of the object.
        # Otherwise, the state of the object is data.__dict__.

        # We produce either a !!python/object or !!python/object/new node.
        # If data.__getinitargs__ does not exist and state is a dictionary,
        # we produce a !!python/object node.  Otherwise we produce a
        # !!python/object/new node.

        cls = data.__class__
        class_name = u'%s.%s' % (cls.__module__, cls.__name__)
        args = None
        state = None
        if hasattr(data, '__getinitargs__'):
            args = list(data.__getinitargs__())
        if hasattr(data, '__getstate__'):
            state = data.__getstate__()
        else:
            state = data.__dict__
        if args is None and isinstance(state, dict):
            return self.represent_mapping(
                    u'tag:yaml.org,2002:python/object:'+class_name, state)
        if isinstance(state, dict) and not state:
            return self.represent_sequence(
                    u'tag:yaml.org,2002:python/object/new:'+class_name, args)
        value = {}
        if args:
            value['args'] = args
        value['state'] = state
        return self.represent_mapping(
                u'tag:yaml.org,2002:python/object/new:'+class_name, value)

    def represent_object(self, data):
        # We use __reduce__ API to save the data.  data.__reduce__ returns
        # a tuple of length 2-5:
        #   (function, args, state, listitems, dictitems)

        # For reconstructing, we call function(*args), then set its state,
        # listitems, and dictitems if they are not None.

        # A special case is when function.__name__ == '__newobj__'.  In
        # this case we create the object with args[0].__new__(*args).

        # Another special case is when __reduce__ returns a string - we
        # don't support it.

        # We produce a !!python/object, !!python/object/new or
        # !!python/object/apply node.

        cls = type(data)
        if cls in copy_reg.dispatch_table:
            reduce = copy_reg.dispatch_table[cls](data)
        elif hasattr(data, '__reduce_ex__'):
            reduce = data.__reduce_ex__(2)
        elif hasattr(data, '__reduce__'):
            reduce = data.__reduce__()
        else:
            raise RepresenterError("cannot represent object: %r" % data)
        # Pad the reduce tuple out to exactly five elements with None.
        reduce = (list(reduce)+[None]*5)[:5]
        function, args, state, listitems, dictitems = reduce
        args = list(args)
        if state is None:
            state = {}
        if listitems is not None:
            listitems = list(listitems)
        if dictitems is not None:
            dictitems = dict(dictitems)
        if function.__name__ == '__newobj__':
            function = args[0]
            args = args[1:]
            tag = u'tag:yaml.org,2002:python/object/new:'
            newobj = True
        else:
            tag = u'tag:yaml.org,2002:python/object/apply:'
            newobj = False
        function_name = u'%s.%s' % (function.__module__, function.__name__)
        # Prefer the most compact node form that loses no information.
        if not args and not listitems and not dictitems \
                and isinstance(state, dict) and newobj:
            return self.represent_mapping(
                    u'tag:yaml.org,2002:python/object:'+function_name, state)
        if not listitems and not dictitems  \
                and isinstance(state, dict) and not state:
            return self.represent_sequence(tag+function_name, args)
        value = {}
        if args:
            value['args'] = args
        if state or not isinstance(state, dict):
            value['state'] = state
        if listitems:
            value['listitems'] = listitems
        if dictitems:
            value['dictitems'] = dictitems
        return self.represent_mapping(tag+function_name, value)
+
# Override/extend the safe registrations with the Python-specific ones.
# The two multi-representers at the end catch classic-class instances and
# all remaining new-style objects respectively.
Representer.add_representer(str,
        Representer.represent_str)

Representer.add_representer(unicode,
        Representer.represent_unicode)

Representer.add_representer(long,
        Representer.represent_long)

Representer.add_representer(complex,
        Representer.represent_complex)

Representer.add_representer(tuple,
        Representer.represent_tuple)

Representer.add_representer(type,
        Representer.represent_name)

Representer.add_representer(types.ClassType,
        Representer.represent_name)

Representer.add_representer(types.FunctionType,
        Representer.represent_name)

Representer.add_representer(types.BuiltinFunctionType,
        Representer.represent_name)

Representer.add_representer(types.ModuleType,
        Representer.represent_module)

Representer.add_multi_representer(types.InstanceType,
        Representer.represent_instance)

Representer.add_multi_representer(object,
        Representer.represent_object)
+
diff --git a/paleomix/yaml/lib2/resolver.py b/paleomix/yaml/lib2/resolver.py
new file mode 100644
index 0000000..8f193ab
--- /dev/null
+++ b/paleomix/yaml/lib2/resolver.py
@@ -0,0 +1,225 @@
+
+__all__ = ['BaseResolver', 'Resolver']
+
+from error import *
+from nodes import *
+
+import re
+
class ResolverError(YAMLError):
    """Raised for invalid path-resolver specifications (bad path elements,
    node checkers, index checkers, or node kinds)."""
    pass
+
class BaseResolver(object):
    """Resolves the tag of a node from its kind, value, and position.

    Two mechanisms are combined: *implicit* resolvers match a scalar's
    text against registered regular expressions, and (experimental)
    *path* resolvers assign tags based on the node's position in the
    document tree.  When nothing matches, a kind-specific default tag
    is returned.
    """

    DEFAULT_SCALAR_TAG = u'tag:yaml.org,2002:str'
    DEFAULT_SEQUENCE_TAG = u'tag:yaml.org,2002:seq'
    DEFAULT_MAPPING_TAG = u'tag:yaml.org,2002:map'

    # Class-level registries, copied on first write per subclass so that
    # registrations on a subclass do not leak into its bases.
    yaml_implicit_resolvers = {}
    yaml_path_resolvers = {}

    def __init__(self):
        # Stacks tracking which path resolvers apply at the current depth;
        # maintained by descend_resolver/ascend_resolver.
        self.resolver_exact_paths = []
        self.resolver_prefix_paths = []

    def add_implicit_resolver(cls, tag, regexp, first):
        """Register `regexp` -> `tag`, indexed by the possible first
        characters (`first`) of a matching scalar; None matches any."""
        if not 'yaml_implicit_resolvers' in cls.__dict__:
            # Copy-on-write: give this class its own registry.
            cls.yaml_implicit_resolvers = cls.yaml_implicit_resolvers.copy()
        if first is None:
            first = [None]
        for ch in first:
            cls.yaml_implicit_resolvers.setdefault(ch, []).append((tag, regexp))
    add_implicit_resolver = classmethod(add_implicit_resolver)

    def add_path_resolver(cls, tag, path, kind=None):
        # Note: `add_path_resolver` is experimental.  The API could be changed.
        # `new_path` is a pattern that is matched against the path from the
        # root to the node that is being considered.  `node_path` elements are
        # tuples `(node_check, index_check)`.  `node_check` is a node class:
        # `ScalarNode`, `SequenceNode`, `MappingNode` or `None`.  `None`
        # matches any kind of a node.  `index_check` could be `None`, a boolean
        # value, a string value, or a number.  `None` and `False` match against
        # any _value_ of sequence and mapping nodes.  `True` matches against
        # any _key_ of a mapping node.  A string `index_check` matches against
        # a mapping value that corresponds to a scalar key which content is
        # equal to the `index_check` value.  An integer `index_check` matches
        # against a sequence value with the index equal to `index_check`.
        if not 'yaml_path_resolvers' in cls.__dict__:
            # Copy-on-write, as in add_implicit_resolver.
            cls.yaml_path_resolvers = cls.yaml_path_resolvers.copy()
        new_path = []
        for element in path:
            # Normalize each path element to a (node_check, index_check) pair.
            if isinstance(element, (list, tuple)):
                if len(element) == 2:
                    node_check, index_check = element
                elif len(element) == 1:
                    node_check = element[0]
                    index_check = True
                else:
                    raise ResolverError("Invalid path element: %s" % element)
            else:
                node_check = None
                index_check = element
            # Builtin types are shorthands for the corresponding node classes.
            if node_check is str:
                node_check = ScalarNode
            elif node_check is list:
                node_check = SequenceNode
            elif node_check is dict:
                node_check = MappingNode
            elif node_check not in [ScalarNode, SequenceNode, MappingNode] \
                    and not isinstance(node_check, basestring)  \
                    and node_check is not None:
                raise ResolverError("Invalid node checker: %s" % node_check)
            if not isinstance(index_check, (basestring, int))   \
                    and index_check is not None:
                raise ResolverError("Invalid index checker: %s" % index_check)
            new_path.append((node_check, index_check))
        # Normalize the target node kind the same way.
        if kind is str:
            kind = ScalarNode
        elif kind is list:
            kind = SequenceNode
        elif kind is dict:
            kind = MappingNode
        elif kind not in [ScalarNode, SequenceNode, MappingNode]    \
                and kind is not None:
            raise ResolverError("Invalid node kind: %s" % kind)
        cls.yaml_path_resolvers[tuple(new_path), kind] = tag
    add_path_resolver = classmethod(add_path_resolver)

    def descend_resolver(self, current_node, current_index):
        """Push resolver state when entering a child node.

        Splits the registered paths into those matched exactly at this
        depth and those that remain viable prefixes for deeper nodes.
        """
        if not self.yaml_path_resolvers:
            return
        exact_paths = {}
        prefix_paths = []
        if current_node:
            depth = len(self.resolver_prefix_paths)
            for path, kind in self.resolver_prefix_paths[-1]:
                if self.check_resolver_prefix(depth, path, kind,
                        current_node, current_index):
                    if len(path) > depth:
                        prefix_paths.append((path, kind))
                    else:
                        exact_paths[kind] = self.yaml_path_resolvers[path, kind]
        else:
            # At the root: every registered path starts as a candidate.
            for path, kind in self.yaml_path_resolvers:
                if not path:
                    exact_paths[kind] = self.yaml_path_resolvers[path, kind]
                else:
                    prefix_paths.append((path, kind))
        self.resolver_exact_paths.append(exact_paths)
        self.resolver_prefix_paths.append(prefix_paths)

    def ascend_resolver(self):
        """Pop the resolver state pushed by descend_resolver."""
        if not self.yaml_path_resolvers:
            return
        self.resolver_exact_paths.pop()
        self.resolver_prefix_paths.pop()

    def check_resolver_prefix(self, depth, path, kind,
            current_node, current_index):
        """Return True if `path[depth-1]` matches the current node/index.

        Falls through (returning None, i.e. falsy) on any mismatch.
        """
        node_check, index_check = path[depth-1]
        if isinstance(node_check, basestring):
            # A string node_check matches the node's tag exactly.
            if current_node.tag != node_check:
                return
        elif node_check is not None:
            if not isinstance(current_node, node_check):
                return
        # True matches keys only; False/None match values only.
        if index_check is True and current_index is not None:
            return
        if (index_check is False or index_check is None)    \
                and current_index is None:
            return
        if isinstance(index_check, basestring):
            if not (isinstance(current_index, ScalarNode)
                    and index_check == current_index.value):
                return
        elif isinstance(index_check, int) and not isinstance(index_check, bool):
            if index_check != current_index:
                return
        return True

    def resolve(self, kind, value, implicit):
        """Return the tag for a node of `kind` with scalar text `value`.

        `implicit` is a pair of flags: (resolvable in plain style,
        resolvable in quoted/non-plain style).
        """
        if kind is ScalarNode and implicit[0]:
            # Try implicit resolvers filed under the first character,
            # then the catch-all (None) bucket.
            if value == u'':
                resolvers = self.yaml_implicit_resolvers.get(u'', [])
            else:
                resolvers = self.yaml_implicit_resolvers.get(value[0], [])
            resolvers += self.yaml_implicit_resolvers.get(None, [])
            for tag, regexp in resolvers:
                if regexp.match(value):
                    return tag
            implicit = implicit[1]
        if self.yaml_path_resolvers:
            exact_paths = self.resolver_exact_paths[-1]
            if kind in exact_paths:
                return exact_paths[kind]
            if None in exact_paths:
                return exact_paths[None]
        # Nothing matched: fall back to the kind's default tag.
        if kind is ScalarNode:
            return self.DEFAULT_SCALAR_TAG
        elif kind is SequenceNode:
            return self.DEFAULT_SEQUENCE_TAG
        elif kind is MappingNode:
            return self.DEFAULT_MAPPING_TAG
+
class Resolver(BaseResolver):
    """BaseResolver preloaded (below) with the standard YAML 1.1 implicit
    resolvers for bool, float, int, merge, null, timestamp, and value."""
    pass
+
# Implicit resolvers for the core YAML 1.1 types.  The final list argument
# gives the characters a plain scalar may start with for the pattern to be
# consulted at all (a cheap first-character index).
Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:bool',
        re.compile(ur'''^(?:yes|Yes|YES|no|No|NO
                    |true|True|TRUE|false|False|FALSE
                    |on|On|ON|off|Off|OFF)$''', re.X),
        list(u'yYnNtTfFoO'))

Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:float',
        re.compile(ur'''^(?:[-+]?(?:[0-9][0-9_]*)\.[0-9_]*(?:[eE][-+][0-9]+)?
                    |[-+]?(?:[0-9][0-9_]*)[eE][-+][0-9]+
                    |\.[0-9_]+(?:[eE][-+][0-9]+)?
                    |[-+]?[0-9][0-9_]*(?::[0-5]?[0-9])+\.[0-9_]*
                    |[-+]?\.(?:inf|Inf|INF)
                    |\.(?:nan|NaN|NAN))$''', re.X),
        list(u'-+0123456789.'))

Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:int',
        re.compile(ur'''^(?:[-+]?0b[0-1_]+
                    |[-+]?0[0-7_]+
                    |[-+]?(?:0|[1-9][0-9_]*)
                    |[-+]?0x[0-9a-fA-F_]+
                    |[-+]?[1-9][0-9_]*(?::[0-5]?[0-9])+)$''', re.X),
        list(u'-+0123456789'))

Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:merge',
        re.compile(ur'^(?:<<)$'),
        [u'<'])

Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:null',
        re.compile(ur'''^(?: ~
                    |null|Null|NULL
                    | )$''', re.X),
        [u'~', u'n', u'N', u''])

Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:timestamp',
        re.compile(ur'''^(?:[0-9][0-9][0-9][0-9]-[0-9][0-9]-[0-9][0-9]
                    |[0-9][0-9][0-9][0-9] -[0-9][0-9]? -[0-9][0-9]?
                     (?:[Tt]|[ \t]+)[0-9][0-9]?
                     :[0-9][0-9] :[0-9][0-9] (?:\.[0-9]*)?
                     (?:[ \t]*(?:Z|[-+][0-9][0-9]?(?::[0-9][0-9])?))?)$''', re.X),
        list(u'0123456789'))

Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:value',
        re.compile(ur'^(?:=)$'),
        [u'='])

# The following resolver is only for documentation purposes.  It cannot work
# because plain scalars cannot start with '!', '&', or '*'.
Resolver.add_implicit_resolver(
        u'tag:yaml.org,2002:yaml',
        re.compile(ur'^(?:!|&|\*)$'),
        list(u'!&*'))
+
diff --git a/paleomix/yaml/lib2/scanner.py b/paleomix/yaml/lib2/scanner.py
new file mode 100644
index 0000000..5228fad
--- /dev/null
+++ b/paleomix/yaml/lib2/scanner.py
@@ -0,0 +1,1457 @@
+
+# Scanner produces tokens of the following types:
+# STREAM-START
+# STREAM-END
+# DIRECTIVE(name, value)
+# DOCUMENT-START
+# DOCUMENT-END
+# BLOCK-SEQUENCE-START
+# BLOCK-MAPPING-START
+# BLOCK-END
+# FLOW-SEQUENCE-START
+# FLOW-MAPPING-START
+# FLOW-SEQUENCE-END
+# FLOW-MAPPING-END
+# BLOCK-ENTRY
+# FLOW-ENTRY
+# KEY
+# VALUE
+# ALIAS(value)
+# ANCHOR(value)
+# TAG(value)
+# SCALAR(value, plain, style)
+#
+# Read comments in the Scanner code for more details.
+#
+
+__all__ = ['Scanner', 'ScannerError']
+
+from error import MarkedYAMLError
+from tokens import *
+
class ScannerError(MarkedYAMLError):
    """Raised on lexical errors; carries context/problem marks for
    user-friendly error reporting."""
    pass
+
class SimpleKey(object):
    """Record of a position where a simple key may start.

    See the simple keys treatment in Scanner: the scanner remembers the
    token-queue position and stream coordinates so it can retroactively
    insert a KEY token once the matching ':' is found.
    """

    def __init__(self, token_number, required, index, line, column, mark):
        self.token_number = token_number  # position in the emitted token stream
        self.required = required          # True if a key MUST occur here
        self.index = index                # absolute character index in the stream
        self.line = line
        self.column = column
        self.mark = mark                  # mark used for error reporting
+
+class Scanner(object):
+
    def __init__(self):
        """Initialize the scanner."""
        # It is assumed that Scanner and Reader will have a common descendant.
        # Reader does the dirty work of checking for BOM and converting the
        # input data to Unicode.  It also adds NUL to the end.
        #
        # Reader supports the following methods
        #   self.peek(i=0)    # peek the next i-th character
        #   self.prefix(l=1)  # peek the next l characters
        #   self.forward(l=1) # read the next l characters and move the pointer.

        # Have we reached the end of the stream?
        self.done = False

        # The number of unclosed '{' and '['.  `flow_level == 0` means block
        # context.
        self.flow_level = 0

        # List of processed tokens that are not yet emitted.
        self.tokens = []

        # Add the STREAM-START token.
        self.fetch_stream_start()

        # Number of tokens that were emitted through the `get_token` method.
        self.tokens_taken = 0

        # The current indentation level.
        self.indent = -1

        # Past indentation levels.
        self.indents = []

        # Variables related to simple keys treatment.

        # A simple key is a key that is not denoted by the '?' indicator.
        # Example of simple keys:
        #   ---
        #   block simple key: value
        #   ? not a simple key:
        #   : { flow simple key: value }
        # We emit the KEY token before all keys, so when we find a potential
        # simple key, we try to locate the corresponding ':' indicator.
        # Simple keys should be limited to a single line and 1024 characters.

        # Can a simple key start at the current position?  A simple key may
        # start:
        # - at the beginning of the line, not counting indentation spaces
        #       (in block context),
        # - after '{', '[', ',' (in the flow context),
        # - after '?', ':', '-' (in the block context).
        # In the block context, this flag also signifies if a block collection
        # may start at the current position.
        self.allow_simple_key = True

        # Keep track of possible simple keys.  This is a dictionary.  The key
        # is `flow_level`; there can be no more than one possible simple key
        # for each level.  The value is a SimpleKey record:
        #   (token_number, required, index, line, column, mark)
        # A simple key may start with ALIAS, ANCHOR, TAG, SCALAR(flow),
        # '[', or '{' tokens.
        self.possible_simple_keys = {}
+
+ # Public methods.
+
    def check_token(self, *choices):
        """Return True if the next token exists and is one of `choices`
        (or if any token exists, when no choices are given)."""
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            if not choices:
                return True
            for choice in choices:
                if isinstance(self.tokens[0], choice):
                    return True
        return False
+
    def peek_token(self):
        # Return the next token, but do not delete it from the queue.
        # (Returns None implicitly at end of stream.)
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            return self.tokens[0]
+
    def get_token(self):
        # Return the next token and remove it from the queue.
        # (Returns None implicitly at end of stream.)
        while self.need_more_tokens():
            self.fetch_more_tokens()
        if self.tokens:
            self.tokens_taken += 1
            return self.tokens.pop(0)
+
+ # Private methods.
+
    def need_more_tokens(self):
        """Return True when another token must be scanned before the queue
        head can be served (falls through, i.e. falsy, otherwise)."""
        if self.done:
            return False
        if not self.tokens:
            return True
        # The current token may be a potential simple key, so we
        # need to look further.
        self.stale_possible_simple_keys()
        if self.next_possible_simple_key() == self.tokens_taken:
            return True
+
    def fetch_more_tokens(self):
        """Scan one more token: skip whitespace/comments, maintain the
        indentation stack, then dispatch on the next character."""

        # Eat whitespaces and comments until we reach the next token.
        self.scan_to_next_token()

        # Remove obsolete possible simple keys.
        self.stale_possible_simple_keys()

        # Compare the current indentation and column.  It may add some tokens
        # and decrease the current indentation level.
        self.unwind_indent(self.column)

        # Peek the next character.
        ch = self.peek()

        # Is it the end of stream?
        if ch == u'\0':
            return self.fetch_stream_end()

        # Is it a directive?
        if ch == u'%' and self.check_directive():
            return self.fetch_directive()

        # Is it the document start?
        if ch == u'-' and self.check_document_start():
            return self.fetch_document_start()

        # Is it the document end?
        if ch == u'.' and self.check_document_end():
            return self.fetch_document_end()

        # TODO: support for BOM within a stream.
        #if ch == u'\uFEFF':
        #    return self.fetch_bom()    <-- issue BOMToken

        # Note: the order of the following checks is NOT significant.

        # Is it the flow sequence start indicator?
        if ch == u'[':
            return self.fetch_flow_sequence_start()

        # Is it the flow mapping start indicator?
        if ch == u'{':
            return self.fetch_flow_mapping_start()

        # Is it the flow sequence end indicator?
        if ch == u']':
            return self.fetch_flow_sequence_end()

        # Is it the flow mapping end indicator?
        if ch == u'}':
            return self.fetch_flow_mapping_end()

        # Is it the flow entry indicator?
        if ch == u',':
            return self.fetch_flow_entry()

        # Is it the block entry indicator?
        if ch == u'-' and self.check_block_entry():
            return self.fetch_block_entry()

        # Is it the key indicator?
        if ch == u'?' and self.check_key():
            return self.fetch_key()

        # Is it the value indicator?
        if ch == u':' and self.check_value():
            return self.fetch_value()

        # Is it an alias?
        if ch == u'*':
            return self.fetch_alias()

        # Is it an anchor?
        if ch == u'&':
            return self.fetch_anchor()

        # Is it a tag?
        if ch == u'!':
            return self.fetch_tag()

        # Is it a literal scalar?
        if ch == u'|' and not self.flow_level:
            return self.fetch_literal()

        # Is it a folded scalar?
        if ch == u'>' and not self.flow_level:
            return self.fetch_folded()

        # Is it a single quoted scalar?
        if ch == u'\'':
            return self.fetch_single()

        # Is it a double quoted scalar?
        if ch == u'\"':
            return self.fetch_double()

        # It must be a plain scalar then.
        if self.check_plain():
            return self.fetch_plain()

        # No?  It's an error.  Let's produce a nice error message.
        raise ScannerError("while scanning for the next token", None,
                "found character %r that cannot start any token"
                % ch.encode('utf-8'), self.get_mark())
+
+ # Simple keys treatment.
+
    def next_possible_simple_key(self):
        # Return the number of the nearest possible simple key (or None
        # when no candidates exist).  Actually we don't need to loop
        # through the whole dictionary.  We may replace it with the
        # following code:
        #   if not self.possible_simple_keys:
        #       return None
        #   return self.possible_simple_keys[
        #           min(self.possible_simple_keys.keys())].token_number
        min_token_number = None
        for level in self.possible_simple_keys:
            key = self.possible_simple_keys[level]
            if min_token_number is None or key.token_number < min_token_number:
                min_token_number = key.token_number
        return min_token_number
+
+ def stale_possible_simple_keys(self):
+ # Remove entries that are no longer possible simple keys. According to
+ # the YAML specification, simple keys
+ # - should be limited to a single line,
+ # - should be no longer than 1024 characters.
+ # Disabling this procedure will allow simple keys of any length and
+ # height (may cause problems if indentation is broken though).
+ for level in self.possible_simple_keys.keys():
+ key = self.possible_simple_keys[level]
+ if key.line != self.line \
+ or self.index-key.index > 1024:
+ if key.required:
+ raise ScannerError("while scanning a simple key", key.mark,
+ "could not found expected ':'", self.get_mark())
+ del self.possible_simple_keys[level]
+
    def save_possible_simple_key(self):
        # The next token may start a simple key.  We check if it's possible
        # and save its position.  This function is called for
        # ALIAS, ANCHOR, TAG, SCALAR(flow), '[', and '{'.

        # Check if a simple key is required at the current position.
        required = not self.flow_level and self.indent == self.column

        # A simple key is required only if it is the first token in the current
        # line.  Therefore it is always allowed.
        assert self.allow_simple_key or not required

        # The next token might be a simple key.  Let's save its number and
        # position.
        if self.allow_simple_key:
            self.remove_possible_simple_key()
            token_number = self.tokens_taken+len(self.tokens)
            key = SimpleKey(token_number, required,
                    self.index, self.line, self.column, self.get_mark())
            self.possible_simple_keys[self.flow_level] = key
+
+ def remove_possible_simple_key(self):
+ # Remove the saved possible key position at the current flow level.
+ if self.flow_level in self.possible_simple_keys:
+ key = self.possible_simple_keys[self.flow_level]
+
+ if key.required:
+ raise ScannerError("while scanning a simple key", key.mark,
+ "could not found expected ':'", self.get_mark())
+
+ del self.possible_simple_keys[self.flow_level]
+
+ # Indentation functions.
+
    def unwind_indent(self, column):
        """Pop indentation levels above `column`, emitting a BLOCK-END
        token for each level closed (no-op in flow context)."""

        ## In flow context, tokens should respect indentation.
        ## Actually the condition should be `self.indent >= column` according
        ## to the spec.  But this condition will prohibit intuitively correct
        ## constructions such as
        ##  key : {
        ##  }
        #if self.flow_level and self.indent > column:
        #    raise ScannerError(None, None,
        #            "invalid indentation or unclosed '[' or '{'",
        #            self.get_mark())

        # In the flow context, indentation is ignored.  We make the scanner
        # less restrictive than the specification requires.
        if self.flow_level:
            return

        # In block context, we may need to issue the BLOCK-END tokens.
        while self.indent > column:
            mark = self.get_mark()
            self.indent = self.indents.pop()
            self.tokens.append(BlockEndToken(mark, mark))
+
    def add_indent(self, column):
        # Check if we need to increase indentation; returns True when a
        # new level was pushed (the caller then emits a BLOCK-*-START).
        if self.indent < column:
            self.indents.append(self.indent)
            self.indent = column
            return True
        return False
+
+ # Fetchers.
+
    def fetch_stream_start(self):
        # We always add STREAM-START as the first token and STREAM-END as the
        # last token.

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-START.  `self.encoding` is provided by the Reader
        # mix-in (detected from the BOM).
        self.tokens.append(StreamStartToken(mark, mark,
            encoding=self.encoding))

+
    def fetch_stream_end(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False
        self.possible_simple_keys = {}

        # Read the token.
        mark = self.get_mark()

        # Add STREAM-END.
        self.tokens.append(StreamEndToken(mark, mark))

        # The stream is finished; no further tokens will be fetched.
        self.done = True
+
    def fetch_directive(self):

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Scan and add DIRECTIVE.
        self.tokens.append(self.scan_directive())
+
    def fetch_document_start(self):
        # '---' marker.
        self.fetch_document_indicator(DocumentStartToken)
+
    def fetch_document_end(self):
        # '...' marker.
        self.fetch_document_indicator(DocumentEndToken)
+
    def fetch_document_indicator(self, TokenClass):
        """Shared implementation for the three-character '---'/'...'
        document markers."""

        # Set the current indentation to -1.
        self.unwind_indent(-1)

        # Reset simple keys.  Note that there could not be a block collection
        # after '---'.
        self.remove_possible_simple_key()
        self.allow_simple_key = False

        # Add DOCUMENT-START or DOCUMENT-END.
        start_mark = self.get_mark()
        self.forward(3)
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
+
    def fetch_flow_sequence_start(self):
        # '[' indicator.
        self.fetch_flow_collection_start(FlowSequenceStartToken)
+
    def fetch_flow_mapping_start(self):
        # '{' indicator.
        self.fetch_flow_collection_start(FlowMappingStartToken)
+
    def fetch_flow_collection_start(self, TokenClass):
        """Shared implementation for '[' and '{'."""

        # '[' and '{' may start a simple key.
        self.save_possible_simple_key()

        # Increase the flow level.
        self.flow_level += 1

        # Simple keys are allowed after '[' and '{'.
        self.allow_simple_key = True

        # Add FLOW-SEQUENCE-START or FLOW-MAPPING-START.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
+
    def fetch_flow_sequence_end(self):
        # ']' indicator.
        self.fetch_flow_collection_end(FlowSequenceEndToken)
+
    def fetch_flow_mapping_end(self):
        # '}' indicator.
        self.fetch_flow_collection_end(FlowMappingEndToken)
+
    def fetch_flow_collection_end(self, TokenClass):
        """Shared implementation for ']' and '}'."""

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Decrease the flow level.
        self.flow_level -= 1

        # No simple keys after ']' or '}'.
        self.allow_simple_key = False

        # Add FLOW-SEQUENCE-END or FLOW-MAPPING-END.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(TokenClass(start_mark, end_mark))
+
    def fetch_flow_entry(self):
        # ',' separator inside a flow collection.

        # Simple keys are allowed after ','.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add FLOW-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(FlowEntryToken(start_mark, end_mark))
+
    def fetch_block_entry(self):
        # '-' sequence-entry indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a new entry?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "sequence entries are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-SEQUENCE-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockSequenceStartToken(mark, mark))

        # It's an error for the block entry to occur in the flow context,
        # but we let the parser detect this.
        else:
            pass

        # Simple keys are allowed after '-'.
        self.allow_simple_key = True

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add BLOCK-ENTRY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(BlockEntryToken(start_mark, end_mark))
+
    def fetch_key(self):
        # '?' complex-key indicator.

        # Block context needs additional checks.
        if not self.flow_level:

            # Are we allowed to start a key (not necessarily a simple one)?
            if not self.allow_simple_key:
                raise ScannerError(None, None,
                        "mapping keys are not allowed here",
                        self.get_mark())

            # We may need to add BLOCK-MAPPING-START.
            if self.add_indent(self.column):
                mark = self.get_mark()
                self.tokens.append(BlockMappingStartToken(mark, mark))

        # Simple keys are allowed after '?' in the block context.
        self.allow_simple_key = not self.flow_level

        # Reset possible simple key on the current level.
        self.remove_possible_simple_key()

        # Add KEY.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(KeyToken(start_mark, end_mark))
+
    def fetch_value(self):
        # ':' indicator.  If a simple-key candidate was saved for this
        # flow level, a KEY token is retroactively inserted before it.

        # Do we determine a simple key?
        if self.flow_level in self.possible_simple_keys:

            # Add KEY.
            key = self.possible_simple_keys[self.flow_level]
            del self.possible_simple_keys[self.flow_level]
            self.tokens.insert(key.token_number-self.tokens_taken,
                    KeyToken(key.mark, key.mark))

            # If this key starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.
            if not self.flow_level:
                if self.add_indent(key.column):
                    self.tokens.insert(key.token_number-self.tokens_taken,
                            BlockMappingStartToken(key.mark, key.mark))

            # There cannot be two simple keys one after another.
            self.allow_simple_key = False

        # It must be a part of a complex key.
        else:

            # Block context needs additional checks.
            # (Do we really need them?  They will be caught by the parser
            # anyway.)
            if not self.flow_level:

                # We are allowed to start a complex value if and only if
                # we can start a simple key.
                if not self.allow_simple_key:
                    raise ScannerError(None, None,
                            "mapping values are not allowed here",
                            self.get_mark())

            # If this value starts a new block mapping, we need to add
            # BLOCK-MAPPING-START.  It will be detected as an error later by
            # the parser.
            if not self.flow_level:
                if self.add_indent(self.column):
                    mark = self.get_mark()
                    self.tokens.append(BlockMappingStartToken(mark, mark))

            # Simple keys are allowed after ':' in the block context.
            self.allow_simple_key = not self.flow_level

            # Reset possible simple key on the current level.
            self.remove_possible_simple_key()

        # Add VALUE.
        start_mark = self.get_mark()
        self.forward()
        end_mark = self.get_mark()
        self.tokens.append(ValueToken(start_mark, end_mark))
+
+ def fetch_alias(self):
+
+ # ALIAS could be a simple key.
+ self.save_possible_simple_key()
+
+ # No simple keys after ALIAS.
+ self.allow_simple_key = False
+
+ # Scan and add ALIAS.
+ self.tokens.append(self.scan_anchor(AliasToken))
+
+ def fetch_anchor(self):
+
+ # ANCHOR could start a simple key.
+ self.save_possible_simple_key()
+
+ # No simple keys after ANCHOR.
+ self.allow_simple_key = False
+
+ # Scan and add ANCHOR.
+ self.tokens.append(self.scan_anchor(AnchorToken))
+
+ def fetch_tag(self):
+
+ # TAG could start a simple key.
+ self.save_possible_simple_key()
+
+ # No simple keys after TAG.
+ self.allow_simple_key = False
+
+ # Scan and add TAG.
+ self.tokens.append(self.scan_tag())
+
+ def fetch_literal(self):
+ self.fetch_block_scalar(style='|')
+
+ def fetch_folded(self):
+ self.fetch_block_scalar(style='>')
+
+ def fetch_block_scalar(self, style):
+
+ # A simple key may follow a block scalar.
+ self.allow_simple_key = True
+
+ # Reset possible simple key on the current level.
+ self.remove_possible_simple_key()
+
+ # Scan and add SCALAR.
+ self.tokens.append(self.scan_block_scalar(style))
+
+ def fetch_single(self):
+ self.fetch_flow_scalar(style='\'')
+
+ def fetch_double(self):
+ self.fetch_flow_scalar(style='"')
+
+ def fetch_flow_scalar(self, style):
+
+ # A flow scalar could be a simple key.
+ self.save_possible_simple_key()
+
+ # No simple keys after flow scalars.
+ self.allow_simple_key = False
+
+ # Scan and add SCALAR.
+ self.tokens.append(self.scan_flow_scalar(style))
+
+ def fetch_plain(self):
+
+ # A plain scalar could be a simple key.
+ self.save_possible_simple_key()
+
+ # No simple keys after plain scalars. But note that `scan_plain` will
+ # change this flag if the scan is finished at the beginning of the
+ # line.
+ self.allow_simple_key = False
+
+ # Scan and add SCALAR. May change `allow_simple_key`.
+ self.tokens.append(self.scan_plain())
+
+ # Checkers.
+
+ def check_directive(self):
+
+ # DIRECTIVE: ^ '%' ...
+ # The '%' indicator is already checked.
+ if self.column == 0:
+ return True
+
+ def check_document_start(self):
+
+ # DOCUMENT-START: ^ '---' (' '|'\n')
+ if self.column == 0:
+ if self.prefix(3) == u'---' \
+ and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+ return True
+
+ def check_document_end(self):
+
+ # DOCUMENT-END: ^ '...' (' '|'\n')
+ if self.column == 0:
+ if self.prefix(3) == u'...' \
+ and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
+ return True
+
+ def check_block_entry(self):
+
+ # BLOCK-ENTRY: '-' (' '|'\n')
+ return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+
+ def check_key(self):
+
+ # KEY(flow context): '?'
+ if self.flow_level:
+ return True
+
+ # KEY(block context): '?' (' '|'\n')
+ else:
+ return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+
+ def check_value(self):
+
+ # VALUE(flow context): ':'
+ if self.flow_level:
+ return True
+
+ # VALUE(block context): ':' (' '|'\n')
+ else:
+ return self.peek(1) in u'\0 \t\r\n\x85\u2028\u2029'
+
    def check_plain(self):
        """Return True if the current character may start a plain scalar."""
        # A plain scalar may start with any non-space character except:
        #   '-', '?', ':', ',', '[', ']', '{', '}',
        #   '#', '&', '*', '!', '|', '>', '\'', '\"',
        #   '%', '@', '`'.
        #
        # It may also start with
        #   '-', '?', ':'
        # if it is followed by a non-space character.
        #
        # Note that we limit the last rule to the block context (except the
        # '-' character) because we want the flow context to be space
        # independent.
        ch = self.peek()
        return ch not in u'\0 \t\r\n\x85\u2028\u2029-?:,[]{}#&*!|>\'\"%@`' \
                or (self.peek(1) not in u'\0 \t\r\n\x85\u2028\u2029'
                        and (ch == u'-' or (not self.flow_level and ch in u'?:')))
+
+ # Scanners.
+
    def scan_to_next_token(self):
        """Skip spaces, comments and line breaks up to the next token.

        A line break in the block context re-enables simple keys.  A BOM is
        only stripped at the very start of the stream.
        """
        # We ignore spaces, line breaks and comments.
        # If we find a line break in the block context, we set the flag
        # `allow_simple_key` on.
        # The byte order mark is stripped if it's the first character in the
        # stream. We do not yet support BOM inside the stream as the
        # specification requires. Any such mark will be considered as a part
        # of the document.
        #
        # TODO: We need to make tab handling rules more sane. A good rule is
        #   Tabs cannot precede tokens
        #   BLOCK-SEQUENCE-START, BLOCK-MAPPING-START, BLOCK-END,
        #   KEY(block), VALUE(block), BLOCK-ENTRY
        # So the checking code is
        #   if <TAB>:
        #       self.allow_simple_keys = False
        # We also need to add the check for `allow_simple_keys == True` to
        # `unwind_indent` before issuing BLOCK-END.
        # Scanners for block, flow, and plain scalars need to be modified.

        if self.index == 0 and self.peek() == u'\uFEFF':
            self.forward()
        found = False
        while not found:
            while self.peek() == u' ':
                self.forward()
            if self.peek() == u'#':
                while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                    self.forward()
            if self.scan_line_break():
                if not self.flow_level:
                    self.allow_simple_key = True
            else:
                found = True
+
    def scan_directive(self):
        """Scan a '%NAME ...' line and return a DirectiveToken.

        %YAML and %TAG payloads are parsed; any other directive's payload
        is skipped and its value recorded as None.
        """
        # See the specification for details.
        start_mark = self.get_mark()
        self.forward()
        name = self.scan_directive_name(start_mark)
        value = None
        if name == u'YAML':
            value = self.scan_yaml_directive_value(start_mark)
            end_mark = self.get_mark()
        elif name == u'TAG':
            value = self.scan_tag_directive_value(start_mark)
            end_mark = self.get_mark()
        else:
            end_mark = self.get_mark()
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        self.scan_directive_ignored_line(start_mark)
        return DirectiveToken(name, value, start_mark, end_mark)
+
    def scan_directive_name(self, start_mark):
        """Scan and return the alphanumeric name of a directive.

        Raises ScannerError if the name is empty or not followed by
        space/break/EOF.
        """
        # See the specification for details.
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        return value
+
    def scan_yaml_directive_value(self, start_mark):
        """Scan the 'MAJOR.MINOR' payload of %YAML; return (major, minor)."""
        # See the specification for details.
        while self.peek() == u' ':
            self.forward()
        major = self.scan_yaml_directive_number(start_mark)
        if self.peek() != '.':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or '.', but found %r"
                    % self.peek().encode('utf-8'),
                    self.get_mark())
        self.forward()
        minor = self.scan_yaml_directive_number(start_mark)
        if self.peek() not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit or ' ', but found %r"
                    % self.peek().encode('utf-8'),
                    self.get_mark())
        return (major, minor)
+
    def scan_yaml_directive_number(self, start_mark):
        """Scan a run of ASCII digits and return it as an int."""
        # See the specification for details.
        ch = self.peek()
        if not (u'0' <= ch <= u'9'):
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a digit, but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        length = 0
        while u'0' <= self.peek(length) <= u'9':
            length += 1
        value = int(self.prefix(length))
        self.forward(length)
        return value
+
+ def scan_tag_directive_value(self, start_mark):
+ # See the specification for details.
+ while self.peek() == u' ':
+ self.forward()
+ handle = self.scan_tag_directive_handle(start_mark)
+ while self.peek() == u' ':
+ self.forward()
+ prefix = self.scan_tag_directive_prefix(start_mark)
+ return (handle, prefix)
+
+ def scan_tag_directive_handle(self, start_mark):
+ # See the specification for details.
+ value = self.scan_tag_handle('directive', start_mark)
+ ch = self.peek()
+ if ch != u' ':
+ raise ScannerError("while scanning a directive", start_mark,
+ "expected ' ', but found %r" % ch.encode('utf-8'),
+ self.get_mark())
+ return value
+
+ def scan_tag_directive_prefix(self, start_mark):
+ # See the specification for details.
+ value = self.scan_tag_uri('directive', start_mark)
+ ch = self.peek()
+ if ch not in u'\0 \r\n\x85\u2028\u2029':
+ raise ScannerError("while scanning a directive", start_mark,
+ "expected ' ', but found %r" % ch.encode('utf-8'),
+ self.get_mark())
+ return value
+
    def scan_directive_ignored_line(self, start_mark):
        """Consume trailing spaces/comment after a directive, then the break."""
        # See the specification for details.
        while self.peek() == u' ':
            self.forward()
        if self.peek() == u'#':
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a directive", start_mark,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        self.scan_line_break()
+
    def scan_anchor(self, TokenClass):
        """Scan a '*alias' or '&anchor' name; return a TokenClass instance."""
        # The specification does not restrict characters for anchors and
        # aliases. This may lead to problems, for instance, the document:
        #   [ *alias, value ]
        # can be interpreted in two ways, as
        #   [ "value" ]
        # and
        #   [ *alias , "value" ]
        # Therefore we restrict aliases to numbers and ASCII letters.
        start_mark = self.get_mark()
        indicator = self.peek()
        # Only used to word the error messages below.
        if indicator == u'*':
            name = 'alias'
        else:
            name = 'anchor'
        self.forward()
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-_':
            length += 1
            ch = self.peek(length)
        if not length:
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        value = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch not in u'\0 \t\r\n\x85\u2028\u2029?:,]}%@`':
            raise ScannerError("while scanning an %s" % name, start_mark,
                    "expected alphabetic or numeric character, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        end_mark = self.get_mark()
        return TokenClass(value, start_mark, end_mark)
+
    def scan_tag(self):
        """Scan a tag property and return a TagToken.

        Handles the three forms: verbatim '!<uri>', the bare '!'
        non-specific tag, and '!handle!suffix' / '!suffix'.  The token
        value is the tuple (handle, suffix).
        """
        # See the specification for details.
        start_mark = self.get_mark()
        ch = self.peek(1)
        if ch == u'<':
            # Verbatim tag: !<...>
            handle = None
            self.forward(2)
            suffix = self.scan_tag_uri('tag', start_mark)
            if self.peek() != u'>':
                raise ScannerError("while parsing a tag", start_mark,
                        "expected '>', but found %r" % self.peek().encode('utf-8'),
                        self.get_mark())
            self.forward()
        elif ch in u'\0 \t\r\n\x85\u2028\u2029':
            # A lone '!' — the non-specific tag.
            handle = None
            suffix = u'!'
            self.forward()
        else:
            # Look ahead for a second '!' to distinguish '!handle!suffix'
            # from the shorthand '!suffix'.
            length = 1
            use_handle = False
            while ch not in u'\0 \r\n\x85\u2028\u2029':
                if ch == u'!':
                    use_handle = True
                    break
                length += 1
                ch = self.peek(length)
            # NOTE(review): this assignment is redundant — both branches
            # below set `handle` again before it is read.
            handle = u'!'
            if use_handle:
                handle = self.scan_tag_handle('tag', start_mark)
            else:
                handle = u'!'
                self.forward()
            suffix = self.scan_tag_uri('tag', start_mark)
        ch = self.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a tag", start_mark,
                    "expected ' ', but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        value = (handle, suffix)
        end_mark = self.get_mark()
        return TagToken(value, start_mark, end_mark)
+
    def scan_block_scalar(self, style):
        """Scan a literal ('|') or folded ('>') block scalar.

        Returns a non-plain ScalarToken.  Handles the optional chomping
        ('+'/'-') and explicit-indentation header indicators, line folding
        for '>' scalars, and trailing-newline chomping.
        """
        # See the specification for details.

        if style == '>':
            folded = True
        else:
            folded = False

        chunks = []
        start_mark = self.get_mark()

        # Scan the header.
        self.forward()
        chomping, increment = self.scan_block_scalar_indicators(start_mark)
        self.scan_block_scalar_ignored_line(start_mark)

        # Determine the indentation level and go to the first non-empty line.
        min_indent = self.indent+1
        if min_indent < 1:
            min_indent = 1
        if increment is None:
            # No explicit indentation indicator: auto-detect from the first
            # non-empty line.
            breaks, max_indent, end_mark = self.scan_block_scalar_indentation()
            indent = max(min_indent, max_indent)
        else:
            indent = min_indent+increment-1
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
        line_break = u''

        # Scan the inner part of the block scalar.
        while self.column == indent and self.peek() != u'\0':
            chunks.extend(breaks)
            leading_non_space = self.peek() not in u' \t'
            length = 0
            while self.peek(length) not in u'\0\r\n\x85\u2028\u2029':
                length += 1
            chunks.append(self.prefix(length))
            self.forward(length)
            line_break = self.scan_line_break()
            breaks, end_mark = self.scan_block_scalar_breaks(indent)
            if self.column == indent and self.peek() != u'\0':

                # Unfortunately, folding rules are ambiguous.
                #
                # This is the folding according to the specification:

                if folded and line_break == u'\n' \
                        and leading_non_space and self.peek() not in u' \t':
                    if not breaks:
                        chunks.append(u' ')
                else:
                    chunks.append(line_break)

                # This is Clark Evans's interpretation (also in the spec
                # examples):
                #
                #if folded and line_break == u'\n':
                #    if not breaks:
                #        if self.peek() not in ' \t':
                #            chunks.append(u' ')
                #        else:
                #            chunks.append(line_break)
                #else:
                #    chunks.append(line_break)
            else:
                break

        # Chomp the tail.
        if chomping is not False:
            chunks.append(line_break)
        if chomping is True:
            chunks.extend(breaks)

        # We are done.
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
+
    def scan_block_scalar_indicators(self, start_mark):
        """Scan the block scalar header; return (chomping, increment).

        chomping is True ('+', keep), False ('-', strip) or None (clip);
        increment is the explicit indentation digit 1-9, or None.  The two
        symmetric branches below accept the indicators in either order.
        """
        # See the specification for details.
        chomping = None
        increment = None
        ch = self.peek()
        if ch in u'+-':
            if ch == '+':
                chomping = True
            else:
                chomping = False
            self.forward()
            ch = self.peek()
            if ch in u'0123456789':
                increment = int(ch)
                if increment == 0:
                    raise ScannerError("while scanning a block scalar", start_mark,
                            "expected indentation indicator in the range 1-9, but found 0",
                            self.get_mark())
                self.forward()
        elif ch in u'0123456789':
            increment = int(ch)
            if increment == 0:
                raise ScannerError("while scanning a block scalar", start_mark,
                        "expected indentation indicator in the range 1-9, but found 0",
                        self.get_mark())
            self.forward()
            ch = self.peek()
            if ch in u'+-':
                if ch == '+':
                    chomping = True
                else:
                    chomping = False
                self.forward()
        ch = self.peek()
        if ch not in u'\0 \r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected chomping or indentation indicators, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        return chomping, increment
+
    def scan_block_scalar_ignored_line(self, start_mark):
        """Consume spaces/comment after a block scalar header, then the break."""
        # See the specification for details.
        while self.peek() == u' ':
            self.forward()
        if self.peek() == u'#':
            while self.peek() not in u'\0\r\n\x85\u2028\u2029':
                self.forward()
        ch = self.peek()
        if ch not in u'\0\r\n\x85\u2028\u2029':
            raise ScannerError("while scanning a block scalar", start_mark,
                    "expected a comment or a line break, but found %r"
                    % ch.encode('utf-8'), self.get_mark())
        self.scan_line_break()
+
+ def scan_block_scalar_indentation(self):
+ # See the specification for details.
+ chunks = []
+ max_indent = 0
+ end_mark = self.get_mark()
+ while self.peek() in u' \r\n\x85\u2028\u2029':
+ if self.peek() != u' ':
+ chunks.append(self.scan_line_break())
+ end_mark = self.get_mark()
+ else:
+ self.forward()
+ if self.column > max_indent:
+ max_indent = self.column
+ return chunks, max_indent, end_mark
+
+ def scan_block_scalar_breaks(self, indent):
+ # See the specification for details.
+ chunks = []
+ end_mark = self.get_mark()
+ while self.column < indent and self.peek() == u' ':
+ self.forward()
+ while self.peek() in u'\r\n\x85\u2028\u2029':
+ chunks.append(self.scan_line_break())
+ end_mark = self.get_mark()
+ while self.column < indent and self.peek() == u' ':
+ self.forward()
+ return chunks, end_mark
+
    def scan_flow_scalar(self, style):
        """Scan a single- or double-quoted scalar; return a ScalarToken."""
        # See the specification for details.
        # Note that we lose indentation rules for quoted scalars. Quoted
        # scalars don't need to adhere indentation because " and ' clearly
        # mark the beginning and the end of them. Therefore we are less
        # restrictive than the specification requires. We only need to check
        # that document separators are not included in scalars.
        if style == '"':
            double = True
        else:
            double = False
        chunks = []
        start_mark = self.get_mark()
        quote = self.peek()
        self.forward()
        chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        while self.peek() != quote:
            chunks.extend(self.scan_flow_scalar_spaces(double, start_mark))
            chunks.extend(self.scan_flow_scalar_non_spaces(double, start_mark))
        self.forward()
        end_mark = self.get_mark()
        return ScalarToken(u''.join(chunks), False, start_mark, end_mark,
                style)
+
    # Single-character escape codes (the character after '\' in a
    # double-quoted scalar) mapped to their replacement characters.
    ESCAPE_REPLACEMENTS = {
        u'0': u'\0',
        u'a': u'\x07',
        u'b': u'\x08',
        u't': u'\x09',
        u'\t': u'\x09',
        u'n': u'\x0A',
        u'v': u'\x0B',
        u'f': u'\x0C',
        u'r': u'\x0D',
        u'e': u'\x1B',
        u' ': u'\x20',
        u'\"': u'\"',
        u'\\': u'\\',
        u'N': u'\x85',
        u'_': u'\xA0',
        u'L': u'\u2028',
        u'P': u'\u2029',
    }
+
    # Numeric escape introducers mapped to the number of hex digits that
    # must follow ('\xXX', '\uXXXX', '\UXXXXXXXX').
    ESCAPE_CODES = {
        u'x': 2,
        u'u': 4,
        u'U': 8,
    }
+
    def scan_flow_scalar_non_spaces(self, double, start_mark):
        """Scan non-whitespace chunks of a quoted scalar.

        Handles '' in single-quoted scalars and backslash escapes (both
        the named ESCAPE_REPLACEMENTS and hex ESCAPE_CODES forms) in
        double-quoted ones.  Stops at whitespace, a quote, or EOF.
        """
        # See the specification for details.
        chunks = []
        while True:
            length = 0
            while self.peek(length) not in u'\'\"\\\0 \t\r\n\x85\u2028\u2029':
                length += 1
            if length:
                chunks.append(self.prefix(length))
                self.forward(length)
            ch = self.peek()
            if not double and ch == u'\'' and self.peek(1) == u'\'':
                chunks.append(u'\'')
                self.forward(2)
            elif (double and ch == u'\'') or (not double and ch in u'\"\\'):
                chunks.append(ch)
                self.forward()
            elif double and ch == u'\\':
                self.forward()
                ch = self.peek()
                if ch in self.ESCAPE_REPLACEMENTS:
                    chunks.append(self.ESCAPE_REPLACEMENTS[ch])
                    self.forward()
                elif ch in self.ESCAPE_CODES:
                    length = self.ESCAPE_CODES[ch]
                    self.forward()
                    for k in range(length):
                        if self.peek(k) not in u'0123456789ABCDEFabcdef':
                            raise ScannerError("while scanning a double-quoted scalar", start_mark,
                                    "expected escape sequence of %d hexdecimal numbers, but found %r" %
                                        (length, self.peek(k).encode('utf-8')), self.get_mark())
                    code = int(self.prefix(length), 16)
                    chunks.append(unichr(code))
                    self.forward(length)
                elif ch in u'\r\n\x85\u2028\u2029':
                    # An escaped line break is removed entirely; following
                    # blank lines are preserved.
                    self.scan_line_break()
                    chunks.extend(self.scan_flow_scalar_breaks(double, start_mark))
                else:
                    raise ScannerError("while scanning a double-quoted scalar", start_mark,
                            "found unknown escape character %r" % ch.encode('utf-8'), self.get_mark())
            else:
                return chunks
+
    def scan_flow_scalar_spaces(self, double, start_mark):
        """Scan whitespace inside a quoted scalar, folding line breaks.

        A single '\\n' folds into a space (unless followed by more
        breaks); other breaks are preserved.
        """
        # See the specification for details.
        chunks = []
        length = 0
        while self.peek(length) in u' \t':
            length += 1
        whitespaces = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch == u'\0':
            raise ScannerError("while scanning a quoted scalar", start_mark,
                    "found unexpected end of stream", self.get_mark())
        elif ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            breaks = self.scan_flow_scalar_breaks(double, start_mark)
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        else:
            chunks.append(whitespaces)
        return chunks
+
    def scan_flow_scalar_breaks(self, double, start_mark):
        """Consume blank lines inside a quoted scalar; return their breaks."""
        # See the specification for details.
        chunks = []
        while True:
            # Instead of checking indentation, we check for document
            # separators.
            prefix = self.prefix(3)
            if (prefix == u'---' or prefix == u'...') \
                    and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                raise ScannerError("while scanning a quoted scalar", start_mark,
                        "found unexpected document separator", self.get_mark())
            while self.peek() in u' \t':
                self.forward()
            if self.peek() in u'\r\n\x85\u2028\u2029':
                chunks.append(self.scan_line_break())
            else:
                return chunks
+
    def scan_plain(self):
        """Scan a plain (unquoted) scalar; return a plain ScalarToken."""
        # See the specification for details.
        # We add an additional restriction for the flow context:
        #   plain scalars in the flow context cannot contain ',', ':' and '?'.
        # We also keep track of the `allow_simple_key` flag here.
        # Indentation rules are relaxed for the flow context.
        chunks = []
        start_mark = self.get_mark()
        end_mark = start_mark
        indent = self.indent+1
        # We allow zero indentation for scalars, but then we need to check for
        # document separators at the beginning of the line.
        #if indent == 0:
        #    indent = 1
        spaces = []
        while True:
            length = 0
            if self.peek() == u'#':
                break
            while True:
                ch = self.peek(length)
                if ch in u'\0 \t\r\n\x85\u2028\u2029' \
                        or (not self.flow_level and ch == u':' and
                                self.peek(length+1) in u'\0 \t\r\n\x85\u2028\u2029') \
                        or (self.flow_level and ch in u',:?[]{}'):
                    break
                length += 1
            # It's not clear what we should do with ':' in the flow context.
            if (self.flow_level and ch == u':'
                    and self.peek(length+1) not in u'\0 \t\r\n\x85\u2028\u2029,[]{}'):
                self.forward(length)
                raise ScannerError("while scanning a plain scalar", start_mark,
                        "found unexpected ':'", self.get_mark(),
                        "Please check http://pyyaml.org/wiki/YAMLColonInFlowContext for details.")
            if length == 0:
                break
            self.allow_simple_key = False
            chunks.extend(spaces)
            chunks.append(self.prefix(length))
            self.forward(length)
            end_mark = self.get_mark()
            spaces = self.scan_plain_spaces(indent, start_mark)
            if not spaces or self.peek() == u'#' \
                    or (not self.flow_level and self.column < indent):
                break
        return ScalarToken(u''.join(chunks), True, start_mark, end_mark)
+
    def scan_plain_spaces(self, indent, start_mark):
        """Scan whitespace between chunks of a plain scalar.

        Returns the folded whitespace chunks, or None if a document
        separator terminates the scalar.  Re-enables `allow_simple_key`
        when a line break is consumed.
        """
        # See the specification for details.
        # The specification is really confusing about tabs in plain scalars.
        # We just forbid them completely. Do not use tabs in YAML!
        chunks = []
        length = 0
        while self.peek(length) in u' ':
            length += 1
        whitespaces = self.prefix(length)
        self.forward(length)
        ch = self.peek()
        if ch in u'\r\n\x85\u2028\u2029':
            line_break = self.scan_line_break()
            self.allow_simple_key = True
            prefix = self.prefix(3)
            if (prefix == u'---' or prefix == u'...') \
                    and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                return
            breaks = []
            while self.peek() in u' \r\n\x85\u2028\u2029':
                if self.peek() == ' ':
                    self.forward()
                else:
                    breaks.append(self.scan_line_break())
                    prefix = self.prefix(3)
                    if (prefix == u'---' or prefix == u'...') \
                            and self.peek(3) in u'\0 \t\r\n\x85\u2028\u2029':
                        return
            if line_break != u'\n':
                chunks.append(line_break)
            elif not breaks:
                chunks.append(u' ')
            chunks.extend(breaks)
        elif whitespaces:
            chunks.append(whitespaces)
        return chunks
+
    def scan_tag_handle(self, name, start_mark):
        """Scan a tag handle of the form '!', '!!' or '!word!'.

        *name* is only used to word error messages.
        """
        # See the specification for details.
        # For some strange reasons, the specification does not allow '_' in
        # tag handles. I have allowed it anyway.
        ch = self.peek()
        if ch != u'!':
            raise ScannerError("while scanning a %s" % name, start_mark,
                    "expected '!', but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        length = 1
        ch = self.peek(length)
        if ch != u' ':
            while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                    or ch in u'-_':
                length += 1
                ch = self.peek(length)
            if ch != u'!':
                self.forward(length)
                raise ScannerError("while scanning a %s" % name, start_mark,
                        "expected '!', but found %r" % ch.encode('utf-8'),
                        self.get_mark())
            length += 1
        value = self.prefix(length)
        self.forward(length)
        return value
+
    def scan_tag_uri(self, name, start_mark):
        """Scan a tag URI, decoding %-escapes; return it as unicode.

        *name* is only used to word error messages.
        """
        # See the specification for details.
        # Note: we do not check if URI is well-formed.
        chunks = []
        length = 0
        ch = self.peek(length)
        while u'0' <= ch <= u'9' or u'A' <= ch <= u'Z' or u'a' <= ch <= u'z' \
                or ch in u'-;/?:@&=+$,_.!~*\'()[]%':
            if ch == u'%':
                # Flush the literal run collected so far, then decode the
                # %XX escape sequence(s).
                chunks.append(self.prefix(length))
                self.forward(length)
                length = 0
                chunks.append(self.scan_uri_escapes(name, start_mark))
            else:
                length += 1
            ch = self.peek(length)
        if length:
            chunks.append(self.prefix(length))
            self.forward(length)
            length = 0
        if not chunks:
            raise ScannerError("while parsing a %s" % name, start_mark,
                    "expected URI, but found %r" % ch.encode('utf-8'),
                    self.get_mark())
        return u''.join(chunks)
+
    def scan_uri_escapes(self, name, start_mark):
        """Decode a run of %XX escapes as UTF-8; return the unicode result.

        NOTE: uses Python 2 `except ..., exc` syntax and `unicode`, like
        the rest of this vendored module.
        """
        # See the specification for details.
        bytes = []
        mark = self.get_mark()
        while self.peek() == u'%':
            self.forward()
            for k in range(2):
                if self.peek(k) not in u'0123456789ABCDEFabcdef':
                    raise ScannerError("while scanning a %s" % name, start_mark,
                            "expected URI escape sequence of 2 hexdecimal numbers, but found %r" %
                                (self.peek(k).encode('utf-8')), self.get_mark())
            bytes.append(chr(int(self.prefix(2), 16)))
            self.forward(2)
        try:
            value = unicode(''.join(bytes), 'utf-8')
        except UnicodeDecodeError, exc:
            raise ScannerError("while scanning a %s" % name, start_mark, str(exc), mark)
        return value
+
+ def scan_line_break(self):
+ # Transforms:
+ # '\r\n' : '\n'
+ # '\r' : '\n'
+ # '\n' : '\n'
+ # '\x85' : '\n'
+ # '\u2028' : '\u2028'
+ # '\u2029 : '\u2029'
+ # default : ''
+ ch = self.peek()
+ if ch in u'\r\n\x85':
+ if self.prefix(2) == u'\r\n':
+ self.forward(2)
+ else:
+ self.forward()
+ return u'\n'
+ elif ch in u'\u2028\u2029':
+ self.forward()
+ return ch
+ return u''
+
+#try:
+# import psyco
+# psyco.bind(Scanner)
+#except ImportError:
+# pass
+
diff --git a/paleomix/yaml/lib2/serializer.py b/paleomix/yaml/lib2/serializer.py
new file mode 100644
index 0000000..0bf1e96
--- /dev/null
+++ b/paleomix/yaml/lib2/serializer.py
@@ -0,0 +1,111 @@
+
+__all__ = ['Serializer', 'SerializerError']
+
+from error import YAMLError
+from events import *
+from nodes import *
+
class SerializerError(YAMLError):
    """Raised when the serializer is used in an invalid state."""
    pass
+
class Serializer(object):
    """Mixin that turns a representation node graph into emitter events.

    Expects the final class to also provide ``emit`` (Emitter) and
    ``descend_resolver``/``ascend_resolver``/``resolve`` (Resolver).
    """

    # printf-style template for generated anchor names: id001, id002, ...
    ANCHOR_TEMPLATE = u'id%03d'

    def __init__(self, encoding=None,
            explicit_start=None, explicit_end=None, version=None, tags=None):
        self.use_encoding = encoding
        self.use_explicit_start = explicit_start
        self.use_explicit_end = explicit_end
        self.use_version = version
        self.use_tags = tags
        self.serialized_nodes = {}
        self.anchors = {}
        self.last_anchor_id = 0
        # None = never opened, False = open, True = closed.
        self.closed = None

    def open(self):
        """Emit StreamStartEvent; may only be called once, before close()."""
        if self.closed is None:
            self.emit(StreamStartEvent(encoding=self.use_encoding))
            self.closed = False
        elif self.closed:
            raise SerializerError("serializer is closed")
        else:
            raise SerializerError("serializer is already opened")

    def close(self):
        """Emit StreamEndEvent; idempotent once the stream is open."""
        if self.closed is None:
            raise SerializerError("serializer is not opened")
        elif not self.closed:
            self.emit(StreamEndEvent())
            self.closed = True

    #def __del__(self):
    #    self.close()

    def serialize(self, node):
        """Serialize *node* as one document on the open stream."""
        if self.closed is None:
            raise SerializerError("serializer is not opened")
        elif self.closed:
            raise SerializerError("serializer is closed")
        self.emit(DocumentStartEvent(explicit=self.use_explicit_start,
            version=self.use_version, tags=self.use_tags))
        # First pass assigns anchors to shared nodes; second pass emits.
        self.anchor_node(node)
        self.serialize_node(node, None, None)
        self.emit(DocumentEndEvent(explicit=self.use_explicit_end))
        self.serialized_nodes = {}
        self.anchors = {}
        self.last_anchor_id = 0

    def anchor_node(self, node):
        """First pass: give an anchor to every node visited more than once."""
        if node in self.anchors:
            # Second visit: the node is shared, so it needs an anchor.
            if self.anchors[node] is None:
                self.anchors[node] = self.generate_anchor(node)
        else:
            self.anchors[node] = None
            if isinstance(node, SequenceNode):
                for item in node.value:
                    self.anchor_node(item)
            elif isinstance(node, MappingNode):
                for key, value in node.value:
                    self.anchor_node(key)
                    self.anchor_node(value)

    def generate_anchor(self, node):
        """Return the next unique anchor name from ANCHOR_TEMPLATE."""
        self.last_anchor_id += 1
        return self.ANCHOR_TEMPLATE % self.last_anchor_id

    def serialize_node(self, node, parent, index):
        """Second pass: emit events for *node*, aliasing repeated nodes."""
        alias = self.anchors[node]
        if node in self.serialized_nodes:
            self.emit(AliasEvent(alias))
        else:
            self.serialized_nodes[node] = True
            self.descend_resolver(parent, index)
            if isinstance(node, ScalarNode):
                # A tag is implicit if the resolver would re-derive it from
                # the value (plain or quoted, respectively).
                detected_tag = self.resolve(ScalarNode, node.value, (True, False))
                default_tag = self.resolve(ScalarNode, node.value, (False, True))
                implicit = (node.tag == detected_tag), (node.tag == default_tag)
                self.emit(ScalarEvent(alias, node.tag, implicit, node.value,
                    style=node.style))
            elif isinstance(node, SequenceNode):
                implicit = (node.tag
                            == self.resolve(SequenceNode, node.value, True))
                self.emit(SequenceStartEvent(alias, node.tag, implicit,
                    flow_style=node.flow_style))
                index = 0
                for item in node.value:
                    self.serialize_node(item, node, index)
                    index += 1
                self.emit(SequenceEndEvent())
            elif isinstance(node, MappingNode):
                implicit = (node.tag
                            == self.resolve(MappingNode, node.value, True))
                self.emit(MappingStartEvent(alias, node.tag, implicit,
                    flow_style=node.flow_style))
                for key, value in node.value:
                    self.serialize_node(key, node, None)
                    self.serialize_node(value, node, key)
                self.emit(MappingEndEvent())
            self.ascend_resolver()
+
diff --git a/paleomix/yaml/lib2/tokens.py b/paleomix/yaml/lib2/tokens.py
new file mode 100644
index 0000000..4d0b48a
--- /dev/null
+++ b/paleomix/yaml/lib2/tokens.py
@@ -0,0 +1,104 @@
+
class Token(object):
    """Base class for scanner tokens; records the start and end marks."""
    def __init__(self, start_mark, end_mark):
        self.start_mark = start_mark
        self.end_mark = end_mark
    def __repr__(self):
        # Show every attribute except the positional *_mark ones, sorted,
        # e.g. "ScalarToken(plain=True, style=None, value='x')".
        names = sorted(key for key in self.__dict__
                       if not key.endswith('_mark'))
        arguments = ', '.join('%s=%r' % (name, getattr(self, name))
                              for name in names)
        return '%s(%s)' % (self.__class__.__name__, arguments)
+
+#class BOMToken(Token):
+# id = '<byte order mark>'
+
class DirectiveToken(Token):
    """A '%NAME value' directive; value is None for unknown directives."""
    id = '<directive>'
    def __init__(self, name, value, start_mark, end_mark):
        self.name = name
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
+
class DocumentStartToken(Token):
    """An explicit '---' document-start marker."""
    id = '<document start>'
+
class DocumentEndToken(Token):
    """An explicit '...' document-end marker."""
    id = '<document end>'
+
class StreamStartToken(Token):
    """Start of the input stream; carries the detected encoding, if any."""
    id = '<stream start>'
    def __init__(self, start_mark=None, end_mark=None,
            encoding=None):
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.encoding = encoding
+
class StreamEndToken(Token):
    """End of the input stream."""
    id = '<stream end>'
+
class BlockSequenceStartToken(Token):
    """Start of an indentation-based sequence."""
    id = '<block sequence start>'
+
class BlockMappingStartToken(Token):
    """Start of an indentation-based mapping."""
    id = '<block mapping start>'
+
class BlockEndToken(Token):
    """End of a block sequence or mapping (dedent)."""
    id = '<block end>'
+
class FlowSequenceStartToken(Token):
    """A '[' opening a flow sequence."""
    id = '['
+
class FlowMappingStartToken(Token):
    """A '{' opening a flow mapping."""
    id = '{'
+
class FlowSequenceEndToken(Token):
    """A ']' closing a flow sequence."""
    id = ']'
+
class FlowMappingEndToken(Token):
    """A '}' closing a flow mapping."""
    id = '}'
+
class KeyToken(Token):
    """A '?' key indicator (explicit or inserted for simple keys)."""
    id = '?'
+
class ValueToken(Token):
    """A ':' value indicator."""
    id = ':'
+
class BlockEntryToken(Token):
    """A '-' block sequence entry indicator."""
    id = '-'
+
class FlowEntryToken(Token):
    """A ',' separator inside a flow collection."""
    id = ','
+
class AliasToken(Token):
    """A '*name' alias reference; value is the alias name."""
    id = '<alias>'
    def __init__(self, value, start_mark, end_mark):
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
+
class AnchorToken(Token):
    """An '&name' anchor definition; value is the anchor name."""
    id = '<anchor>'
    def __init__(self, value, start_mark, end_mark):
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
+
class TagToken(Token):
    """A tag property; value is the (handle, suffix) pair."""
    id = '<tag>'
    def __init__(self, value, start_mark, end_mark):
        self.value = value
        self.start_mark = start_mark
        self.end_mark = end_mark
+
class ScalarToken(Token):
    """A scalar value; plain is True for unquoted scalars, and style is
    the quoting/block style character (None for plain scalars)."""
    id = '<scalar>'
    def __init__(self, value, plain, start_mark, end_mark, style=None):
        self.value = value
        self.plain = plain
        self.start_mark = start_mark
        self.end_mark = end_mark
        self.style = style
+
diff --git a/pylint.conf b/pylint.conf
new file mode 100644
index 0000000..be19af1
--- /dev/null
+++ b/pylint.conf
@@ -0,0 +1,286 @@
+[MASTER]
+
+# Specify a configuration file.
+#rcfile=
+
+# Python code to execute, usually for sys.path manipulation such as
+# pygtk.require().
+#init-hook=
+
+# Profiled execution.
+profile=no
+
+# Add files or directories to the blacklist. They should be base names, not
+# paths.
+ignore=CVS
+
+# Pickle collected data for later comparisons.
+persistent=yes
+
+# List of plugins (as comma separated values of python modules names) to load,
+# usually to register additional checkers.
+#load-plugins=
+
+
+[MESSAGES CONTROL]
+
+# Enable the message, report, category or checker with the given id(s). You can
+# either give multiple identifier separated by comma (,) or put this option
+# multiple time. See also the "--disable" option for examples.
+#enable=
+
+# Disable the message, report, category or checker with the given id(s). You
+# can either give multiple identifiers separated by comma (,) or put this
+# option multiple times (only on the command line, not in the configuration
+# file where it should appear only once).You can also use "--disable=all" to
+# disable everything first and then reenable specific checks. For example, if
+# you want to run only the similarities checker, you can use "--disable=all
+# --enable=similarities". If you want to run only the classes checker, but have
+# no Warning level messages displayed, use "--disable=all --enable=classes
+# --disable=W"
+#disable=
+disable=W0142,R0903,R0913,I0011,similarities,C0111
+# W0142: Used * or ** magic
+# R0903: Too few public methods
+# R0913: Too many arguments (X/5)
+# I0011: Warning about warnings disabled locally
+# C0111: Missing docstring
+
+
+[REPORTS]
+
+# Set the output format. Available formats are text, parseable, colorized, msvs
+# (visual studio) and html. You can also give a reporter class, eg
+# mypackage.mymodule.MyReporterClass.
+output-format=text
+
+# Put messages in a separate file for each module / package specified on the
+# command line instead of printing them on stdout. Reports (if any) will be
+# written in a file name "pylint_global.[txt|html]".
+files-output=no
+
+# Tells whether to display a full report or only the messages
+reports=no
+
+# Python expression which should return a note less than 10 (10 is the highest
+# note). You have access to the variables error, warning, refactor and
+# convention, which contain the number of messages in each category, as well
+# as statement, which contains the total number of statements analyzed. This
+# is used by the global evaluation report (RP0004).
+evaluation=10.0 - ((float(5 * error + warning + refactor + convention) / statement) * 10)
+
+# Add a comment according to your evaluation note. This is used by the global
+# evaluation report (RP0004).
+comment=no
+
+# Template used to display messages. This is a python new-style format string
+# used to format the message information. See doc for all details
+msg-template={path}:{line}:{msg_id}: {msg}
+
+
+[TYPECHECK]
+
+# Tells whether missing members accessed in mixin class should be ignored. A
+# mixin class is detected if its name ends with "mixin" (case insensitive).
+ignore-mixin-members=yes
+
+# List of classes names for which member attributes should not be checked
+# (useful for classes with attributes dynamically set).
+ignored-classes=SQLObject
+
+# When zope mode is activated, add a predefined set of Zope acquired attributes
+# to generated-members.
+zope=no
+
+# List of members which are set dynamically and missed by pylint inference
+# system, and so shouldn't trigger E0201 when accessed. Python regular
+# expressions are accepted.
+generated-members=REQUEST,acl_users,aq_parent
+
+
+[VARIABLES]
+
+# Tells whether we should check for unused import in __init__ files.
+init-import=no
+
+# A regular expression matching the beginning of the name of dummy variables
+# (i.e. not used).
+dummy-variables-rgx=^_|dummy
+
+# List of additional names supposed to be defined in builtins. Remember that
+# you should avoid to define new builtins when possible.
+additional-builtins=
+
+
+[SIMILARITIES]
+
+# Minimum lines number of a similarity.
+min-similarity-lines=4
+
+# Ignore comments when computing similarities.
+ignore-comments=yes
+
+# Ignore docstrings when computing similarities.
+ignore-docstrings=yes
+
+# Ignore imports when computing similarities.
+ignore-imports=no
+
+
+[FORMAT]
+
+# Maximum number of characters on a single line.
+max-line-length=80
+
+# Regexp for a line that is allowed to be longer than the limit.
+ignore-long-lines=^\s*(# )?<?https?://\S+>?$
+
+# Allow the body of an if to be on the same line as the test if there is no
+# else.
+single-line-if-stmt=no
+
+# List of optional constructs for which whitespace checking is disabled
+no-space-check=trailing-comma,dict-separator
+
+# Maximum number of lines in a module
+max-module-lines=1000
+
+# String used as indentation unit. This is usually " " (4 spaces) or "\t" (1
+# tab).
+indent-string=' '
+
+
+[BASIC]
+
+# Required attributes for module, separated by a comma
+required-attributes=
+
+# List of builtins function names that should not be used, separated by a comma
+bad-functions=filter,apply,input
+
+# Regular expression which should only match correct module names
+module-rgx=(([a-z_][a-z0-9_]*)|([A-Z][a-zA-Z0-9]+))$
+
+# Regular expression which should only match correct module level names
+const-rgx=(([A-Z_][A-Z0-9_]*)|(__.*__))$
+
+# Regular expression which should only match correct class names
+class-rgx=[A-Z_][a-zA-Z0-9]+$
+
+# Regular expression which should only match correct function names
+function-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match correct method names
+method-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match correct instance attribute names
+attr-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match correct argument names
+argument-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match correct variable names
+variable-rgx=[a-z_][a-z0-9_]{2,30}$
+
+# Regular expression which should only match correct attribute names in class
+# bodies
+class-attribute-rgx=([A-Za-z_][A-Za-z0-9_]{2,30}|(__.*__))$
+
+# Regular expression which should only match correct list comprehension /
+# generator expression variable names
+inlinevar-rgx=[A-Za-z_][A-Za-z0-9_]*$
+
+# Good variable names which should always be accepted, separated by a comma
+good-names=i,j,k,ex,Run,_
+
+# Bad variable names which should always be refused, separated by a comma
+bad-names=foo,bar,baz,toto,tutu,tata
+
+# Regular expression which should only match function or class names that do
+# not require a docstring.
+no-docstring-rgx=__.*__
+
+# Minimum line length for functions/classes that require docstrings, shorter
+# ones are exempt.
+docstring-min-length=-1
+
+
+[MISCELLANEOUS]
+
+# List of note tags to take in consideration, separated by a comma.
+notes=FIXME,XXX,TODO
+
+
+[CLASSES]
+
+# List of interface methods to ignore, separated by a comma. This is used for
+# instance to not check methods defines in Zope's Interface base class.
+ignore-iface-methods=isImplementedBy,deferred,extends,names,namesAndDescriptions,queryDescriptionFor,getBases,getDescriptionFor,getDoc,getName,getTaggedValue,getTaggedValueTags,isEqualOrExtendedBy,setTaggedValue,isImplementedByInstancesOf,adaptWith,is_implemented_by
+
+# List of method names used to declare (i.e. assign) instance attributes.
+defining-attr-methods=__init__,__new__,setUp
+
+# List of valid names for the first argument in a class method.
+valid-classmethod-first-arg=cls
+
+# List of valid names for the first argument in a metaclass class method.
+valid-metaclass-classmethod-first-arg=mcs
+
+
+[DESIGN]
+
+# Maximum number of arguments for function / method
+max-args=5
+
+# Argument names that match this expression will be ignored. Default to name
+# with leading underscore
+ignored-argument-names=_.*
+
+# Maximum number of locals for function / method body
+max-locals=15
+
+# Maximum number of return / yield for function / method body
+max-returns=6
+
+# Maximum number of branch for function / method body
+max-branches=12
+
+# Maximum number of statements in function / method body
+max-statements=50
+
+# Maximum number of parents for a class (see R0901).
+max-parents=7
+
+# Maximum number of attributes for a class (see R0902).
+max-attributes=7
+
+# Minimum number of public methods for a class (see R0903).
+min-public-methods=2
+
+# Maximum number of public methods for a class (see R0904).
+max-public-methods=20
+
+
+[IMPORTS]
+
+# Deprecated modules which should not be used, separated by a comma
+deprecated-modules=regsub,TERMIOS,Bastion,rexec
+
+# Create a graph of every (i.e. internal and external) dependencies in the
+# given file (report RP0402 must not be disabled)
+import-graph=
+
+# Create a graph of external dependencies in the given file (report RP0402 must
+# not be disabled)
+ext-import-graph=
+
+# Create a graph of internal dependencies in the given file (report RP0402 must
+# not be disabled)
+int-import-graph=
+
+
+[EXCEPTIONS]
+
+# Exceptions that will emit a warning when being caught. Defaults to
+# "Exception"
+overgeneral-exceptions=Exception
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..e4b3084
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,107 @@
+#!/usr/bin/env python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+import codecs
+import os
+import sys
+
+from setuptools import setup, find_packages
+
+
+# PALEOMIX 1.x supports Python 2.7 only: abort unless the interpreter is
+# exactly 2.7.x (both older 2.x releases and any 3.x release fail here).
+if (sys.version_info[0] != 2) or (sys.version_info[1] != 7):
+ sys.stderr.write("ERROR: Python version 2.7.x required!\n")
+ sys.stderr.write(" Current version is v%s\n"
+ % (sys.version.replace("\n", " "),))
+ sys.exit(1)
+
+
+def _get_version():
+ """Retrieve version from current install directory."""
+ env = {}
+ # Execute paleomix/__init__.py in an empty namespace rather than importing
+ # the package, so setup.py works before dependencies are installed.
+ with open(os.path.join("paleomix", "__init__.py")) as handle:
+ exec(handle.read(), env)
+
+ return env["__version__"]
+
+
+def _get_readme():
+ """Retrieves contents of README.rst, forcing UTF-8 encoding."""
+ # codecs.open is used (instead of plain open) so the file is decoded as
+ # UTF-8 regardless of the locale; the result feeds long_description below.
+ with codecs.open("README.rst", encoding="utf-8") as handle:
+ return handle.read()
+
+
+# Package metadata and build configuration for setuptools.
+setup(
+ name='paleomix',
+ version=_get_version(),
+
+ description='Bioinformatics pipelines for HTS data',
+ long_description=_get_readme(),
+
+ url='https://github.com/MikkelSchubert/paleomix',
+
+ author='Mikkel Schubert',
+ author_email='MSchubert at snm.ku.dk',
+
+ license='MIT',
+
+ classifiers=[
+ 'Development Status :: 5 - Production/Stable',
+
+ 'Intended Audience :: Science/Research',
+ 'Topic :: Scientific/Engineering :: Bio-Informatics',
+
+ 'License :: OSI Approved :: MIT License',
+
+ 'Programming Language :: Python :: 2 :: Only',
+ 'Programming Language :: Python :: 2.7',
+ ],
+
+ keywords='pipeline bioinformatics hts phylogeny bam',
+
+ # 'misc' and 'tests' are excluded from the installed distribution.
+ packages=find_packages(exclude=['misc', 'tests']),
+
+ install_requires=['pysam>=0.8.3',
+ 'setproctitle>=1.1.0'],
+
+ # Dependencies set in setup_requires to allow use of 'setup.py nosetests'
+ setup_requires=['nose>=1.3.0',
+ 'flexmock>=0.9.7',
+ 'coverage>=4.0.0'],
+
+ test_suite='nose.collector',
+
+ # Console scripts: 'paleomix' is the main entry point; the remaining
+ # names are kept for backwards compatibility with earlier releases.
+ entry_points={
+ 'console_scripts': [
+ 'paleomix=paleomix:run',
+
+ # Aliases used in previous publications
+ 'bam_pipeline=paleomix:run_bam_pipeline',
+ 'bam_rmdup_collapsed=paleomix:run_rmdup_collapsed',
+ 'conv_gtf_to_bed=paleomix:run_gtf_to_bed',
+ 'phylo_pipeline=paleomix:run_phylo_pipeline',
+ 'trim_pipeline=paleomix:run_trim_pipeline',
+ ],
+ },
+
+ # NOTE(review): zip_safe=False presumably because bundled data files are
+ # accessed by filesystem path -- confirm against package usage.
+ zip_safe=False,
+
+ include_package_data=True,
+)
diff --git a/tests/atomiccmd_test/__init__.py b/tests/atomiccmd_test/__init__.py
new file mode 100644
index 0000000..90e5529
--- /dev/null
+++ b/tests/atomiccmd_test/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/tests/atomiccmd_test/builder_test.py b/tests/atomiccmd_test/builder_test.py
new file mode 100644
index 0000000..f066471
--- /dev/null
+++ b/tests/atomiccmd_test/builder_test.py
@@ -0,0 +1,715 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from flexmock import flexmock
+from nose.tools import assert_equal, assert_raises
+
+from paleomix.common.testing import Monkeypatch
+
+from paleomix.atomiccmd.builder import \
+ AtomicCmdBuilder, \
+ AtomicCmdBuilderError, \
+ AtomicJavaCmdBuilder, \
+ AtomicMPICmdBuilder, \
+ apply_options, \
+ use_customizable_cli_parameters, \
+ create_customizable_cli_parameters, \
+ JAVA_VERSIONS
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: Constructor
+
+
+# The constructor accepts the command as a list, a plain string, or any
+# iterable; the resulting argument list is exposed via the 'call' property.
+def test_builder__simple__call():
+ builder = AtomicCmdBuilder(["ls"])
+ assert_equal(builder.call, ["ls"])
+
+
+def test_builder__simple__str():
+ builder = AtomicCmdBuilder("ls")
+ assert_equal(builder.call, ["ls"])
+
+
+def test_builder__simple__iterable():
+ builder = AtomicCmdBuilder(iter(["ls"]))
+ assert_equal(builder.call, ["ls"])
+
+
+def test_builder__complex():
+ builder = AtomicCmdBuilder(["java", "jar", "/a/jar"])
+ assert_equal(builder.call, ["java", "jar", "/a/jar"])
+
+
+def test_builder__kwargs__empty():
+ builder = AtomicCmdBuilder(["ls"])
+ assert_equal(builder.kwargs, {})
+
+
+# Keyword arguments (IN_*/OUT_* file keys, etc.) are stored verbatim and
+# exposed via the 'kwargs' property.
+def test_builder__kwargs():
+ expected = {"IN_FILE": "/abc/def.txt",
+ "OUT_FILE": "/etc/fstab"}
+ builder = AtomicCmdBuilder(["ls"], **expected)
+ assert_equal(builder.kwargs, expected)
+
+
+def test_builder__kwargs__set_cwd():
+ builder = AtomicCmdBuilder(["ls"], set_cwd=True)
+ assert_equal(builder.kwargs, {"set_cwd": True})
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: set_option
+
+
+def test_builder__set_option():
+ builder = AtomicCmdBuilder("find")
+ builder.set_option("-name", "*.txt")
+ assert_equal(builder.call, ["find", "-name", "*.txt"])
+
+
+# Options set with fixed=False may later be replaced by a second set_option
+# call for the same key.
+def test_builder__set_option__overwrite():
+ builder = AtomicCmdBuilder("find")
+ builder.set_option("-name", "*.txt", fixed=False)
+ builder.set_option("-name", "*.bat")
+ assert_equal(builder.call, ["find", "-name", "*.bat"])
+
+
+# By default options are fixed; overwriting a fixed option is an error.
+def test_builder__set_option__overwrite_fixed():
+ builder = AtomicCmdBuilder("find")
+ builder.set_option("-name", "*.txt")
+ assert_raises(AtomicCmdBuilderError, builder.set_option, "-name", "*.bat")
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: add_option
+
+def test_builder__add_option():
+ builder = AtomicCmdBuilder("find")
+ builder.add_option("-name", "*.txt")
+ assert_equal(builder.call, ["find", "-name", "*.txt"])
+
+
+# Unlike set_option, add_option appends: repeating a key accumulates
+# multiple occurrences in call order.
+def test_builder__add_option__overwrite():
+ builder = AtomicCmdBuilder("find")
+ builder.add_option("-name", "*.txt")
+ builder.add_option("-or")
+ builder.add_option("-name", "*.bat")
+ assert_equal(builder.call, ["find", "-name", "*.txt", "-or", "-name", "*.bat"])
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: add_option / set_option common tests
+
+# The tests below are nose test generators: each outer test_* function
+# yields (check_function, setter) tuples and nose runs the inner check once
+# per yielded setter, so add_option and set_option share these behaviours.
+def test_builder__add_or_set_option__without_value():
+ def _do_test_builder__add_or_set_option__without_value(setter):
+ builder = AtomicCmdBuilder("find")
+ setter(builder, "-delete")
+ assert_equal(builder.call, ["find", "-delete"])
+ yield _do_test_builder__add_or_set_option__without_value, AtomicCmdBuilder.add_option
+ yield _do_test_builder__add_or_set_option__without_value, AtomicCmdBuilder.set_option
+
+
+# sep joins key and value into a single argument (e.g. "-size=0").
+def test_builder__add_or_set_option__with_sep():
+ def _do_test_builder__add_or_set_option__with_sep(setter):
+ builder = AtomicCmdBuilder("find")
+ setter(builder, "-size", "0", sep="=")
+ assert_equal(builder.call, ["find", "-size=0"])
+ yield _do_test_builder__add_or_set_option__with_sep, AtomicCmdBuilder.add_option
+ yield _do_test_builder__add_or_set_option__with_sep, AtomicCmdBuilder.set_option
+
+
+# Non-string values are stored as-is (not coerced to str at this stage).
+def test_builder__add_or_set_option__with_non_str_value():
+ def _do_test_test_builder__add_or_set_option__with_non_str_value(setter):
+ builder = AtomicCmdBuilder("find")
+ setter(builder, "-size", 0)
+ assert_equal(builder.call, ["find", "-size", 0])
+ yield _do_test_test_builder__add_or_set_option__with_non_str_value, AtomicCmdBuilder.add_option
+ yield _do_test_test_builder__add_or_set_option__with_non_str_value, AtomicCmdBuilder.set_option
+
+
+# Mixing add_option and set_option for the same key is an error, in either
+# order.
+def test_builder__add_or_set_option__add_and_set():
+ def _do_test_builder__add_or_set_option__add_and_set(setter_1, setter_2):
+ builder = AtomicCmdBuilder("find")
+ setter_1(builder, "-name", "*.txt")
+ assert_raises(AtomicCmdBuilderError, setter_2, builder, "-name", "*.bat")
+ yield _do_test_builder__add_or_set_option__add_and_set, AtomicCmdBuilder.set_option, AtomicCmdBuilder.add_option
+ yield _do_test_builder__add_or_set_option__add_and_set, AtomicCmdBuilder.add_option, AtomicCmdBuilder.set_option
+
+
+def test_builder__add_or_set_option__with_non_str_key():
+ def _do_test_builder__add_or_set_option__with_non_str_key(setter):
+ builder = AtomicCmdBuilder("find")
+ assert_raises(TypeError, setter, builder, 7913, "True")
+ yield _do_test_builder__add_or_set_option__with_non_str_key, AtomicCmdBuilder.add_option
+ yield _do_test_builder__add_or_set_option__with_non_str_key, AtomicCmdBuilder.set_option
+
+
+# Once finalize() has been called, the builder is frozen and no further
+# options may be added or set.
+def test_builder__add_or_set_option__after_finalize():
+ def _do_test_builder__add_or_set_option__after_finalize(setter):
+ builder = AtomicCmdBuilder("find")
+ builder.finalize()
+ assert_raises(AtomicCmdBuilderError, setter, builder, "-size", "1")
+ yield _do_test_builder__add_or_set_option__after_finalize, AtomicCmdBuilder.add_option
+ yield _do_test_builder__add_or_set_option__after_finalize, AtomicCmdBuilder.set_option
+
+
+def test_builder__add_or_set_option__empty_key():
+ def _do_test_builder__add_or_set_option__empty_key(setter):
+ builder = AtomicCmdBuilder("find")
+ assert_raises(KeyError, setter, builder, "", "1")
+ yield _do_test_builder__add_or_set_option__empty_key, AtomicCmdBuilder.add_option
+ yield _do_test_builder__add_or_set_option__empty_key, AtomicCmdBuilder.set_option
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: pop_option
+
+def test_builder__pop_option():
+ def _do_test_builder__pop_option(setter):
+ builder = AtomicCmdBuilder("find")
+ setter(builder, "-size", "0", fixed=False)
+ builder.pop_option("-size")
+ assert_equal(builder.call, ["find"])
+ yield _do_test_builder__pop_option, AtomicCmdBuilder.set_option
+ yield _do_test_builder__pop_option, AtomicCmdBuilder.add_option
+
+
+# When a key occurs multiple times, pop_option removes only the most
+# recently added occurrence.
+def test_builder__pop_option__last_option():
+ builder = AtomicCmdBuilder("find")
+ builder.add_option("-size", "0", fixed=False)
+ builder.add_option("-size", "1", fixed=False)
+ builder.pop_option("-size")
+ assert_equal(builder.call, ["find", "-size", "0"])
+
+
+# Popping one key leaves unrelated options untouched and in order.
+def test_builder__pop_option__different_options():
+ def _do_test_builder__pop_option(setter):
+ builder = AtomicCmdBuilder("find")
+ setter(builder, "-empty", fixed=False)
+ setter(builder, "-size", "1", fixed=False)
+ setter(builder, "-name", "*.txt", fixed=False)
+ builder.pop_option("-size")
+ assert_equal(builder.call, ["find", "-empty", "-name", "*.txt"])
+ yield _do_test_builder__pop_option, AtomicCmdBuilder.set_option
+ yield _do_test_builder__pop_option, AtomicCmdBuilder.add_option
+
+
+# Fixed options (the default) cannot be popped.
+def test_builder__pop_option__is_fixed():
+ def _do_test_builder__pop_option__is_fixed(setter):
+ builder = AtomicCmdBuilder("find")
+ setter(builder, "-size", "0")
+ assert_raises(AtomicCmdBuilderError, builder.pop_option, "-size")
+ yield _do_test_builder__pop_option__is_fixed, AtomicCmdBuilder.set_option
+ yield _do_test_builder__pop_option__is_fixed, AtomicCmdBuilder.add_option
+
+
+def test_builder__pop_option__empty():
+ builder = AtomicCmdBuilder("find")
+ assert_raises(KeyError, builder.pop_option, "-size")
+
+
+def test_builder__pop_option__missing_key():
+ builder = AtomicCmdBuilder("find")
+ builder.set_option("-size", 0)
+ assert_raises(KeyError, builder.pop_option, "-isize")
+
+
+def test_builder__pop_option__with_non_str_key():
+ builder = AtomicCmdBuilder("find")
+ assert_raises(TypeError, builder.pop_option, 7913)
+
+
+def test_builder__pop_option__with_empty_key():
+ builder = AtomicCmdBuilder("find")
+ assert_raises(KeyError, builder.pop_option, "")
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: add_value
+
+# add_value appends a positional argument (typically an IN_*/OUT_* key
+# template) to the end of the command line.
+def test_builder__add_value():
+ builder = AtomicCmdBuilder("ls")
+ builder.add_value("%(IN_FILE)s")
+ assert_equal(builder.call, ["ls", "%(IN_FILE)s"])
+
+
+def test_builder__add_value__two_values():
+ builder = AtomicCmdBuilder("ls")
+ builder.add_value("%(IN_FILE)s")
+ builder.add_value("%(OUT_FILE)s")
+ assert_equal(builder.call, ["ls", "%(IN_FILE)s", "%(OUT_FILE)s"])
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: set_kwargs
+
+def test_builder__set_kwargs__called_once():
+ expected = {"IN_PATH": "/a/b/", "OUT_PATH": "/dst/file"}
+ builder = AtomicCmdBuilder("echo")
+ builder.set_kwargs(**expected)
+ assert_equal(builder.kwargs, expected)
+
+
+# Repeated set_kwargs calls with distinct keys are merged.
+def test_builder__set_kwargs__called_twice():
+ expected = {"IN_PATH": "/a/b/", "OUT_PATH": "/dst/file"}
+ builder = AtomicCmdBuilder("echo")
+ builder.set_kwargs(OUT_PATH="/dst/file")
+ builder.set_kwargs(IN_PATH="/a/b/")
+ assert_equal(builder.kwargs, expected)
+
+
+# A kwargs value that is itself an AtomicCmdBuilder is finalized when the
+# kwargs are read; flexmock verifies finalize() is called and its return
+# value substituted.
+def test_builder__set_kwargs__atomiccmdbuilder():
+ mock = flexmock(AtomicCmdBuilder("true"))
+ mock.should_receive('finalize').and_return("finalized!")
+ builder = AtomicCmdBuilder("ls", IN_BUILDER=mock)
+ assert_equal(builder.kwargs, {"IN_BUILDER": "finalized!"})
+
+
+# After finalize(), set_kwargs raises and the stored kwargs are unchanged.
+def test_builder__set_kwargs__after_finalize():
+ expected = {"IN_PATH": "/a/b/"}
+ builder = AtomicCmdBuilder("echo")
+ builder.set_kwargs(IN_PATH="/a/b/")
+ builder.finalize()
+ assert_raises(AtomicCmdBuilderError, builder.set_kwargs, OUT_PATH="/dst/file")
+ assert_equal(builder.kwargs, expected)
+
+
+# Overwriting an existing kwargs key raises and leaves the original value.
+def test_builder__set__kwargs__overwriting():
+ expected = {"IN_PATH": "/a/b/"}
+ builder = AtomicCmdBuilder("echo")
+ builder.set_kwargs(IN_PATH="/a/b/")
+ assert_raises(AtomicCmdBuilderError, builder.set_kwargs, IN_PATH="/dst/file")
+ assert_equal(builder.kwargs, expected)
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: finalize
+
+def test_builder__finalized_call__simple_command():
+ builder = AtomicCmdBuilder("echo")
+ assert_equal(builder.finalized_call, ["echo"])
+
+
+# finalized_call substitutes %(KEY)s templates with the kwargs values.
+def test_builder__finalized_call__kwargs_are_instantiated():
+ builder = AtomicCmdBuilder(("echo", "%(ARG1)s", "X=%(ARG2)s"),
+ ARG1="/foo/bar",
+ ARG2="zod")
+ assert_equal(builder.finalized_call, ["echo", "/foo/bar", "X=zod"])
+
+
+# NOTE(review): TEMP_DIR appears to be substituted with the literal
+# placeholder "%(TEMP_DIR)" so that it can be filled in later at runtime --
+# confirm against AtomicCmdBuilder.finalized_call.
+def test_builder__finalized_call__kwargs_are_instantiated__with_temp_dir():
+ builder = AtomicCmdBuilder(("echo", "%(ARG)s", "%(TEMP_DIR)s"),
+ ARG="/foo/bar")
+ assert_equal(builder.finalized_call, ["echo", "/foo/bar", "%(TEMP_DIR)"])
+
+
+# Non-string arguments are stringified in the finalized call.
+def test_builder__finalized_call__kwargs_are_instantiated__with_non_str_arg():
+ builder = AtomicCmdBuilder(("echo", "%(ARG)s", 17),
+ ARG="/foo/bar")
+ assert_equal(builder.finalized_call, ["echo", "/foo/bar", "17"])
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: finalize
+
+# finalize() caches its result: repeated calls return the same object.
+def test_builder__finalize__returns_singleton():
+ builder = AtomicCmdBuilder("echo")
+ assert builder.finalize() is builder.finalize()
+
+
+# finalize() constructs an AtomicCmd from the accumulated call and kwargs.
+# Monkeypatch temporarily replaces the AtomicCmd class with a mock that
+# records and verifies the constructor arguments.
+def test_builder__finalize__calls_atomiccmd():
+ was_called = []
+
+ class _AtomicCmdMock:
+ def __init__(self, *args, **kwargs):
+ assert_equal(args, (["echo", "-out", "%(OUT_FILE)s", "%(IN_FILE)s"],))
+ assert_equal(kwargs, {"IN_FILE": "/in/file",
+ "OUT_FILE": "/out/file",
+ "set_cwd": True})
+ was_called.append(True)
+
+ with Monkeypatch("paleomix.atomiccmd.builder.AtomicCmd", _AtomicCmdMock):
+ builder = AtomicCmdBuilder("echo", set_cwd=True)
+ builder.add_option("-out", "%(OUT_FILE)s")
+ builder.add_value("%(IN_FILE)s")
+ builder.set_kwargs(OUT_FILE="/out/file",
+ IN_FILE="/in/file")
+
+ builder.finalize()
+ assert was_called
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: add_multiple_options
+
+# add_multiple_options registers one option occurrence per value, generating
+# numbered kwargs keys (IN_FILE_01, IN_FILE_02, ...) and returning the new
+# key/value mapping.
+def test_builder__add_multiple_options():
+ values = ("file_a", "file_b")
+ expected = {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_options("-i", values)
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls",
+ "-i", "%(IN_FILE_01)s",
+ "-i", "%(IN_FILE_02)s"])
+
+
+# sep joins the option key and the template into a single argument.
+def test_builder__add_multiple_options_with_sep():
+ values = ("file_a", "file_b")
+ expected = {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_options("-i", values, sep="=")
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls",
+ "-i=%(IN_FILE_01)s",
+ "-i=%(IN_FILE_02)s"])
+
+
+# A custom template ("OUT_BAM_%i") controls the generated kwargs key names.
+def test_builder__add_multiple_options_with_template():
+ values = ("file_a", "file_b")
+ expected = {"OUT_BAM_1": "file_a", "OUT_BAM_2": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_options("-i", values, template="OUT_BAM_%i")
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls",
+ "-i", "%(OUT_BAM_1)s",
+ "-i", "%(OUT_BAM_2)s"])
+
+
+# Key numbering continues across repeated calls (second call yields _02).
+def test_builder__add_multiple_options_multiple_times():
+ expected = {"IN_FILE_01": "file_a",
+ "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_options("-i", ("file_a",))
+ assert_equal(kwargs, {"IN_FILE_01": "file_a"})
+ kwargs = builder.add_multiple_options("-i", ("file_b",))
+ assert_equal(kwargs, {"IN_FILE_02": "file_b"})
+
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls",
+ "-i", "%(IN_FILE_01)s",
+ "-i", "%(IN_FILE_02)s"])
+
+
+###############################################################################
+###############################################################################
+# AtomicCmdBuilder: add_multiple_values
+
+# add_multiple_values behaves like add_multiple_options but appends bare
+# positional values (no option key) for each generated kwargs entry.
+def test_builder__add_multiple_values():
+ values = ("file_a", "file_b")
+ expected = {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_values(values)
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls", "%(IN_FILE_01)s", "%(IN_FILE_02)s"])
+
+
+def test_builder__add_multiple_values_with_template():
+ values = ("file_a", "file_b")
+ expected = {"OUT_BAM_1": "file_a", "OUT_BAM_2": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_values(values, template="OUT_BAM_%i")
+
+ assert_equal(kwargs, expected)
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls", "%(OUT_BAM_1)s", "%(OUT_BAM_2)s"])
+
+
+# Key numbering continues across repeated calls, as for options above.
+def test_builder__add_multiple_values_multiple_times():
+ expected = {"IN_FILE_01": "file_a", "IN_FILE_02": "file_b"}
+
+ builder = AtomicCmdBuilder("ls")
+ kwargs = builder.add_multiple_values(("file_a",))
+ assert_equal(kwargs, {"IN_FILE_01": "file_a"})
+ kwargs = builder.add_multiple_values(("file_b",))
+ assert_equal(kwargs, {"IN_FILE_02": "file_b"})
+
+ assert_equal(builder.kwargs, expected)
+ assert_equal(builder.call, ["ls", "%(IN_FILE_01)s", "%(IN_FILE_02)s"])
+
+
+###############################################################################
+###############################################################################
+# AtomicJavaCmdBuilder
+
+# Without a temp_root, the JVM tmpdir option is left as a %(TEMP_DIR)s
+# template to be filled in later.
+def test_java_builder__default__no_config():
+ builder = AtomicJavaCmdBuilder("/path/Foo.jar")
+ assert_equal(builder.call, ["java",
+ "-server",
+ "-Djava.io.tmpdir=%(TEMP_DIR)s",
+ "-Djava.awt.headless=true",
+ "-XX:+UseSerialGC",
+ "-Xmx4g",
+ "-jar", "%(AUX_JAR)s"])
+
+
+# With temp_root set, the tmpdir path is substituted directly.
+def test_java_builder__defaults__call():
+ builder = AtomicJavaCmdBuilder("/path/Foo.jar", temp_root="/disk/tmp")
+ assert_equal(builder.call, ["java",
+ "-server",
+ "-Djava.io.tmpdir=/disk/tmp",
+ "-Djava.awt.headless=true",
+ "-XX:+UseSerialGC",
+ "-Xmx4g",
+ "-jar", "%(AUX_JAR)s"])
+
+
+# The jar is registered as AUX_JAR and a JRE 1.6 version check is attached.
+def test_java_builder__defaults__kwargs():
+ builder = AtomicJavaCmdBuilder("/path/Foo.jar")
+ assert_equal(builder.kwargs, {"AUX_JAR": "/path/Foo.jar",
+ "CHECK_JRE": JAVA_VERSIONS[(1, 6)]})
+
+
+# gc_threads > 1 switches from the serial GC to the parallel GC.
+def test_java_builder__multithreaded_gc():
+ builder = AtomicJavaCmdBuilder("/path/Foo.jar",
+ temp_root="/disk/tmp",
+ gc_threads=3)
+ assert_equal(builder.call, ["java",
+ "-server",
+ "-Djava.io.tmpdir=/disk/tmp",
+ "-Djava.awt.headless=true",
+ "-XX:ParallelGCThreads=3",
+ "-Xmx4g",
+ "-jar", "%(AUX_JAR)s"])
+
+
+def test_java_builder__multithreaded_gc__zero_or_negative_threads():
+ assert_raises(ValueError, AtomicJavaCmdBuilder, "/path/Foo.jar", gc_threads=0)
+ assert_raises(ValueError, AtomicJavaCmdBuilder, "/path/Foo.jar", gc_threads=-1)
+
+
+# gc_threads must be an int; a numeric string is rejected.
+def test_java_builder__multithreaded_gc__non_int_threads():
+ assert_raises(TypeError, AtomicJavaCmdBuilder, "/path/Foo.jar", gc_threads="3")
+
+
+# Extra kwargs (e.g. set_cwd) pass through alongside the generated keys.
+def test_java_builder__kwargs():
+ builder = AtomicJavaCmdBuilder("/path/Foo.jar", set_cwd=True)
+ assert_equal(builder.kwargs, {"AUX_JAR": "/path/Foo.jar",
+ "set_cwd": True,
+ "CHECK_JRE": JAVA_VERSIONS[(1, 6)]})
+
+
+###############################################################################
+###############################################################################
+# AtomicMPICmdBuilder
+
def test_mpi_builder__defaults__str():
    # A single-threaded command is returned unchanged; mpirun is only
    # recorded as an executable requirement.
    single = AtomicMPICmdBuilder("ls")
    assert_equal(single.call, ["ls"])
    assert_equal(single.kwargs, {"EXEC_MPI": "mpirun"})


def test_mpi_builder__multithreaded__str():
    # With multiple threads, the command is wrapped in an mpirun call and
    # the original executable is tracked via EXEC_MAIN instead.
    threaded = AtomicMPICmdBuilder("ls", threads=3)
    assert_equal(threaded.call, ["mpirun", "-n", 3, "ls"])
    assert_equal(threaded.kwargs, {"EXEC_MAIN": "ls"})


def test_mpi_builder__defaults__complex_cmd():
    single = AtomicMPICmdBuilder(["python", "/foo/run.py"])
    assert_equal(single.call, ["python", "/foo/run.py"])
    assert_equal(single.kwargs, {"EXEC_MPI": "mpirun"})


def test_mpi_builder__multithreaded__complex_cmd():
    threaded = AtomicMPICmdBuilder(["python", "/foo/run.py"], threads=3)
    assert_equal(threaded.call, ["mpirun", "-n", 3, "python", "/foo/run.py"])
    assert_equal(threaded.kwargs, {"EXEC_MAIN": "python"})


def test_mpi_builder__kwargs():
    builder = AtomicMPICmdBuilder("ls", set_cwd=True)
    assert_equal(builder.kwargs, {"set_cwd": True, "EXEC_MPI": "mpirun"})


def test_mpi_builder__threads__zero_or_negative():
    for threads in (0, -1):
        assert_raises(ValueError, AtomicMPICmdBuilder, "ls", threads=threads)


def test_mpi_builder__threads__non_int():
    assert_raises(TypeError, AtomicMPICmdBuilder, "ls", threads="3")
+
+
+###############################################################################
+###############################################################################
+# create_customizable_cli_parameters
+
def test_custom_cli__single_named_arg():
    # Arguments passed to customize() become attributes on the result.
    class _Fixture:
        @create_customizable_cli_parameters
        def customize(cls, argument):
            return {}

    result = _Fixture.customize("A value")
    assert_equal(result.argument, "A value")


def test_custom_cli__adding_new_values():
    # Keys in the dict returned by customize() are exposed as attributes.
    class _Fixture:
        @create_customizable_cli_parameters
        def customize(cls):
            return {"dynamic": 12345}

    result = _Fixture.customize()
    assert_equal(result.dynamic, 12345)


def test_custom_cli__multiple_named_args():
    class _Fixture:
        @create_customizable_cli_parameters
        def customize(cls, first, second):
            return {}

    result = _Fixture.customize(123, 456)
    assert_equal(result.first, 123)
    assert_equal(result.second, 456)


def test_custom_cli__only_customize_is_valid_function_name():
    # Decorating any function not named 'customize' must be rejected at
    # class-creation time.
    try:
        class _Fixture:
            @create_customizable_cli_parameters
            def not_called_customize(cls, first, second):
                return {}  # pragma: no coverage

        assert False, "ValueError not raised"  # pragma: no coverage
    except ValueError:
        pass
+
+
+###############################################################################
+###############################################################################
+# use_customizable_cli_parameters
+
+
+###############################################################################
+###############################################################################
+# apply_options
+
def test_apply_options__single_option__default_pred__set_when_pred_is_true():
    # Option-like keys are applied via set_option by default.
    target = flexmock()
    target.should_receive('set_option').with_args('--foo', 17).once()
    apply_options(target, {"--foo": 17})


def test_apply_options__single_option__default_pred__ignore_when_pred_is_false():
    # Keys rejected by the default predicate are silently skipped.
    target = flexmock()
    apply_options(target, {"Other": None})


def _user_pred(key):
    # Example predicate for the user-predicate tests; accepts FOO* keys only.
    return key.startswith("FOO")


def test_apply_options__single_option__user_pred__set_when_pred_is_true():
    target = flexmock()
    target.should_receive('set_option').with_args('FOO_BAR', 17).once()
    apply_options(target, {"FOO_BAR": 17}, _user_pred)


def test_apply_options__single_option__user_pred__ignore_when_pred_is_false():
    target = flexmock()
    apply_options(target, {"BAR_FOO": 17}, _user_pred)


def test_apply_options__single_option__boolean__set_when_value_is_true():
    # True sets a flag-style option without a value ...
    target = flexmock()
    target.should_receive('set_option').with_args('-v')
    apply_options(target, {"-v": True})


def test_apply_options__single_option__boolean__set_when_value_is_none():
    # ... and so does None.
    target = flexmock()
    target.should_receive('set_option').with_args('-v')
    apply_options(target, {"-v": None})


def test_apply_options__single_option__boolean__pop_when_value_is_false():
    # False removes a previously set option via pop_option.
    target = flexmock()
    target.should_receive('pop_option').with_args('-v')
    apply_options(target, {"-v": False})


def test_apply_options__single_option__boolean__pop_missing_throws():
    # KeyErrors raised by pop_option propagate to the caller.
    target = flexmock()
    target.should_receive('pop_option').with_args('-v').and_raise(KeyError('-v'))
    assert_raises(KeyError, apply_options, target, {"-v": False})


def test_apply_options__multiple_option():
    # A list value produces one add_option call per element.
    target = flexmock()
    target.should_receive('add_option').with_args('--foo', 3).once()
    target.should_receive('add_option').with_args('--foo', 17).once()
    apply_options(target, {"--foo": [3, 17]})


def test_apply_options__boolean_and_none_is_single_value_only():
    # True / False / None only make sense as scalar values, not in lists.
    target = flexmock()
    for value in (True, False, None):
        assert_raises(TypeError, apply_options, target, {"--foo": [value]})


def test_apply_options__unexpected_types_in_values():
    target = flexmock()
    for value in (object(), iter([]), {}, set()):
        assert_raises(TypeError, apply_options, target, {"--foo": value})


def test_apply_options__non_string_types_in_keys():
    target = flexmock()
    for key in (1, ("foo",)):
        assert_raises(TypeError, apply_options, target, {key: 17})


def test_apply_options__not_dict_like():
    target = flexmock()
    for options in (None, [1, 2, 3]):
        assert_raises(TypeError, apply_options, target, options)
diff --git a/tests/atomiccmd_test/command_test.py b/tests/atomiccmd_test/command_test.py
new file mode 100644
index 0000000..f871851
--- /dev/null
+++ b/tests/atomiccmd_test/command_test.py
@@ -0,0 +1,873 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import signal
+import weakref
+
+from flexmock import flexmock
+
+import nose
+from nose.tools import \
+ assert_in, \
+ assert_equal, \
+ assert_raises
+
+from paleomix.common.testing import \
+ with_temp_folder, \
+ Monkeypatch, \
+ get_file_contents, \
+ set_file_contents
+
+import paleomix.atomiccmd.command
+import paleomix.common.fileutils as fileutils
+
+from paleomix.common.versions import RequirementObj
+from paleomix.atomiccmd.command import AtomicCmd, CmdError
+
+
def test_file(*args):
    """Return the path of a data file bundled with the test suite.

    The components in *args are joined onto <tests>/data/, where <tests>
    is the directory two levels above this module.
    """
    tests_dir = os.path.dirname(os.path.dirname(__file__))
    return os.path.join(tests_dir, "data", *args)
+
+
+###############################################################################
+###############################################################################
+# Constructor: Command
+
def test_atomiccmd__command_str():
    """A plain string command is treated as a single executable."""
    cmd = AtomicCmd("ls")
    assert_equal(cmd.executables, frozenset(["ls"]))


@nose.tools.raises(ValueError)
def test_atomiccmd__executables_empty_str():
    """An empty command string must be rejected."""
    AtomicCmd("")


def test_atomiccmd__command_tuple():
    """For tuple commands, the first element is the executable."""
    cmd = AtomicCmd(("cd", "."))
    assert_equal(cmd.executables, frozenset(["cd"]))


@nose.tools.raises(ValueError)
def test_atomiccmd__executables_empty_tuple():
    """A command without any elements must be rejected."""
    AtomicCmd(())


@nose.tools.raises(ValueError)
def test_atomiccmd__executables_empty_str_in_tuple():
    """A tuple containing an empty executable must be rejected.

    NOTE: ("",) is a 1-tuple; the previous version passed (""), which is
    just the empty string and merely duplicated the empty-string test.
    """
    AtomicCmd(("",))
+
+
+###############################################################################
+###############################################################################
+# Constructor: set_cwd
+
def test_atomiccmd__set_cwd():
    @with_temp_folder
    def _check_set_cwd(temp_folder, set_cwd):
        # The child's working directory must be the temp folder when
        # set_cwd is True and the caller's otherwise, while the caller's
        # own working directory never changes.
        caller_cwd = os.getcwd()
        cmd = AtomicCmd(("bash", "-c", "echo -n ${PWD}"),
                        TEMP_OUT_STDOUT="result.txt",
                        set_cwd=set_cwd)
        cmd.run(temp_folder)
        assert_equal(cmd.join(), [0])
        assert_equal(caller_cwd, os.getcwd())

        expected = temp_folder if set_cwd else caller_cwd
        observed = get_file_contents(os.path.join(temp_folder, "result.txt"))
        assert os.path.samefile(expected, observed), \
            "%r != %r" % (expected, observed)

    for set_cwd in (False, True):
        yield _check_set_cwd, set_cwd


# Full path when set_cwd is False, rel. path when True
def test_atomiccmd__set_cwd__temp_in_out():
    @with_temp_folder
    def _check_temp_paths(temp_folder, set_cwd, kwargs):
        cmd = AtomicCmd(("echo", "-n", "%%(%s)s" % tuple(kwargs.keys())),
                        TEMP_OUT_STDOUT="result.txt",
                        set_cwd=set_cwd,
                        **kwargs)
        cmd.run(temp_folder)
        assert_equal(cmd.join(), [0])

        expected = os.path.join("" if set_cwd else temp_folder, "test_file")
        observed = get_file_contents(os.path.join(temp_folder, "result.txt"))
        assert_equal(os.path.abspath(expected), os.path.abspath(observed))

    for key in ("TEMP_IN_FOO", "TEMP_OUT_FOO"):
        for set_cwd in (True, False):
            yield _check_temp_paths, set_cwd, {key: "test_file"}
+
+
+###############################################################################
+###############################################################################
+# Constructor: Paths / pipes
+
+# Check that specified paths/etc. are available via getters
+def test_atomiccmd__paths():
+ cmd = AtomicCmd("ls",
+ IN_AAA="/a/b/c",
+ IN_AAB="/x/y/z",
+ TEMP_IN_ABB="tmp_in",
+ OUT_AAA="/out/foo",
+ OUT_BBC="foo/bar",
+ TEMP_OUT_A="xyb",
+ EXEC_OTHER="true",
+ AUX_WAT="wat/wat",
+ CHECK_FUNC=bool,
+ OUT_STDERR="/var/log/pipe.stderr",
+ TEMP_OUT_STDOUT="pipe.stdout")
+
+ assert_equal(cmd.executables, frozenset(["ls", "true"]))
+ assert_equal(cmd.requirements, frozenset([bool]))
+ assert_equal(cmd.input_files, frozenset(["/a/b/c", "/x/y/z"]))
+ assert_equal(cmd.output_files, frozenset(["/out/foo", "foo/bar", "/var/log/pipe.stderr"]))
+ assert_equal(cmd.auxiliary_files, frozenset(["wat/wat"]))
+ assert_equal(cmd.expected_temp_files, frozenset(["foo", "bar", "pipe.stderr"]))
+ assert_in("xyb", cmd.optional_temp_files)
+ assert_in("pipe.stdout", cmd.optional_temp_files)
+
+
+def test_atomiccmd__paths_optional():
+ cmd = AtomicCmd(["ls"],
+ IN_OPTIONAL=None,
+ OUT_OPTIONAL=None)
+ assert_equal(cmd.input_files, frozenset())
+ assert_equal(cmd.output_files, frozenset())
+
+
+ at with_temp_folder
+def test_atomiccmd__pipes_stdin(temp_folder):
+ fname = test_file("fasta_file.fasta")
+ cmd = AtomicCmd("cat",
+ IN_STDIN=fname,
+ OUT_STDOUT="result.txt")
+ assert_equal(cmd.input_files, frozenset([fname]))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ result = get_file_contents(os.path.join(temp_folder, "result.txt"))
+ assert_equal(result, ">This_is_FASTA!\nACGTN\n>This_is_ALSO_FASTA!\nCGTNA\n")
+
+
+ at with_temp_folder
+def test_atomiccmd__pipes_stdin__temp_file(temp_folder):
+ cmd = AtomicCmd("cat",
+ TEMP_IN_STDIN="infile.fasta",
+ OUT_STDOUT="result.txt")
+ assert_equal(cmd.input_files, frozenset())
+ set_file_contents(os.path.join(temp_folder, "infile.fasta"), "a\nbc\nd")
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ result = get_file_contents(os.path.join(temp_folder, "result.txt"))
+ assert_equal(result, "a\nbc\nd")
+
+
+ at with_temp_folder
+def test_atomiccmd__pipes_stdin__dev_null_implicit_1(temp_folder):
+ # STDIN should be implicitly set to /dev/null; deadlocks if not
+ cmd = AtomicCmd("cat")
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+
+
+ at with_temp_folder
+def test_atomiccmd__pipes_stdin__dev_null_implicit_2(temp_folder):
+ # STDIN should be implicitly set to /dev/null; deadlocks if not
+ cmd = AtomicCmd("cat", IN_STDIN=None)
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+
+
+ at with_temp_folder
+def test_atomiccmd__pipes_stdin__dev_null_explicit(temp_folder):
+ # STDIN should be set to /dev/null; deadlocks if not
+ cmd = AtomicCmd("cat", IN_STDIN=AtomicCmd.DEVNULL)
+ cmd.run(temp_folder, wrap_errors=False)
+ assert_equal(cmd.join(), [0])
+
+
# Test possible combinations of explicit / implicit saving of stdout/err
def test_atomiccmd__pipes_out():
    @with_temp_folder
    def _check_pipes(temp_folder, stdout, stderr, kwargs):
        cmd = AtomicCmd(("bash", "-c", "echo -n 'STDERR!' > /dev/stderr; echo -n 'STDOUT!';"), **kwargs)
        cmd.run(temp_folder)
        assert_equal(cmd.join(), [0])

        # A None template means the stream went to /dev/null, so no file
        # may exist; otherwise the {0} placeholder is filled with id(cmd).
        expected_files = []
        for (template, text) in ((stdout, "STDOUT!"), (stderr, "STDERR!")):
            if template is not None:
                fname = template.format(id(cmd))
                observed = get_file_contents(os.path.join(temp_folder, fname))
                assert_equal(observed, text)
                expected_files.append(fname)

        assert_equal(set(os.listdir(temp_folder)), set(expected_files))

    cases = [
        ("pipe_bash_{0}.stdout", "pipe_bash_{0}.stderr", {}),
        ("pipe_bash_{0}.stdout", "stderr.txt", {"OUT_STDERR": "stderr.txt"}),
        ("stdout.txt", "pipe_bash_{0}.stderr", {"OUT_STDOUT": "stdout.txt"}),
        ("stdout.txt", "stderr.txt", {"OUT_STDOUT": "stdout.txt",
                                      "OUT_STDERR": "stderr.txt"}),
        (None, None, {"OUT_STDOUT": AtomicCmd.DEVNULL,
                      "OUT_STDERR": AtomicCmd.DEVNULL}),
        (None, "pipe_bash_{0}.stderr", {"OUT_STDOUT": AtomicCmd.DEVNULL}),
        ("pipe_bash_{0}.stdout", None, {"OUT_STDERR": AtomicCmd.DEVNULL}),
    ]

    for (stdout, stderr, kwargs) in cases:
        yield _check_pipes, stdout, stderr, kwargs
+
+
@with_temp_folder
def test_atomiccmd__pipes_out_dev_null(temp_folder):
    """Both streams sent to DEVNULL must leave no pipe files behind.

    NOTE: The previous version only defined a nested helper and never
    yielded or called it, so the test was silently a no-op.
    """
    cmd = AtomicCmd(("bash", "-c", "echo -n 'STDERR!' > /dev/stderr; echo -n 'STDOUT!';"),
                    OUT_STDOUT=AtomicCmd.DEVNULL,
                    OUT_STDERR=AtomicCmd.DEVNULL)
    cmd.run(temp_folder)
    assert_equal(cmd.join(), [0])
    assert_equal(os.listdir(temp_folder), [])
+
+
def test_atomiccmd__paths__malformed_keys():
    def _check_rejected(kwargs):
        assert_raises(ValueError, AtomicCmd, "true", **kwargs)

    malformed = [
        {"IN": "/var/foo"},         # Missing key-name #1
        {"IN_": "/var/foo"},        # Missing key-name #2
        {"TEMP_OUT": "/var/foo"},   # Missing key-name #3
        {"TEMP_OUT_": "/var/foo"},  # Missing key-name #4
        {"TEMP_OUX_FOO": "foo"},    # Invalid key-type #1
        {"INS_BAR": "foo"},         # Invalid key-type #2
    ]
    for kwargs in malformed:
        yield _check_rejected, kwargs


def test_atomiccmd__paths__invalid_values():
    def _check_rejected(kwargs):
        assert_raises(TypeError, AtomicCmd, "true", **kwargs)

    # File keys and pipe keys alike must be strings (or None / special
    # sentinels); anything else is rejected with a TypeError.
    invalid = [
        {"IN_FILE": 1},
        {"TEMP_IN_FILE": set()},
        {"OUT_FILE": [1, 2, 3]},
        {"TEMP_OUT_FILE": 1.0},
        {"IN_STDIN": {}},
        {"TEMP_IN_STDIN": frozenset()},
        {"OUT_STDOUT": 1.7},
        {"TEMP_OUT_STDOUT": ()},
        {"OUT_STDERR": xrange(3)},
        {"TEMP_OUT_STDERR": -1},
    ]
    for kwargs in invalid:
        yield _check_rejected, kwargs


# Subpaths are not allowed for temp IN/OUT files, neither relative nor absolute
def test_atomiccmd__paths__invalid_temp_paths():
    def _check_rejected(kwargs):
        assert_raises(ValueError, AtomicCmd, "true", **kwargs)

    cases = [
        # No relative paths
        {"TEMP_IN_FOO": "sub/infile"},
        {"TEMP_IN_STDIN": "sub/stdin"},
        {"TEMP_OUT_FOO": "sub/outfile"},
        {"TEMP_OUT_STDOUT": "sub/stdout"},
        {"TEMP_OUT_STDERR": "sub/stderr"},
        # No absolute paths
        {"TEMP_IN_FOO": "/tmp/sub/infile"},
        {"TEMP_IN_STDIN": "/dev/sub/stdin"},
        {"TEMP_OUT_FOO": "/etc/sub/outfile"},
        {"TEMP_OUT_STDOUT": "/var/sub/stdout"},
        {"TEMP_OUT_STDERR": "/home/sub/stderr"},
    ]
    for kwargs in cases:
        yield _check_rejected, kwargs
+
+
# All OUT_ files must be unique, including all TEMP_OUT_
def test_atomiccmd__paths__overlapping_output():
    def _check_conflict(key_1, file_1, key_2, file_2):
        assert_raises(ValueError, AtomicCmd, ("ls",),
                      **{key_1: file_1, key_2: file_2})

    conflicts = [
        ("OUT_FILE_1", "/foo/bar/outfile", "OUT_FILE_2", "/var/outfile"),
        ("TEMP_OUT_FILE_1", "outfile", "OUT_FILE_1", "/var/outfile"),
        ("OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_FILE_1", "outfile"),
        ("TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_FILE_2", "outfile"),

        ("OUT_FILE_1", "/foo/bar/outfile", "OUT_STDOUT", "/var/outfile"),
        ("TEMP_OUT_FILE_1", "outfile", "OUT_STDOUT", "/var/outfile"),
        ("OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_STDOUT", "outfile"),
        ("TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_STDOUT", "outfile"),

        ("OUT_FILE_1", "/foo/bar/outfile", "OUT_STDERR", "/var/outfile"),
        ("TEMP_OUT_FILE_1", "outfile", "OUT_STDERR", "/var/outfile"),
        ("OUT_FILE_1", "/foo/bar/outfile", "TEMP_OUT_STDERR", "outfile"),
        ("TEMP_OUT_FILE_1", "outfile", "TEMP_OUT_STDERR", "outfile"),
    ]
    for case in conflicts:
        yield (_check_conflict,) + case


# A pipe can be w/wo TEMP_, but not both
def test_atomiccmd__pipes__duplicates():
    def _check_duplicate(key):
        kwargs = {"TEMP_" + key: "temp_file",
                  key: "file"}
        assert_raises(CmdError, AtomicCmd, ["ls"], **kwargs)

    for key in ("IN_STDIN", "OUT_STDOUT", "OUT_STDERR"):
        yield _check_duplicate, key
+
+
+###############################################################################
+###############################################################################
+# CHECK_ / EXEC_
+
# RequirementObjs are the standard way to do tests
def test_atomicmcd__exec__reqobj():
    requirement = RequirementObj(call=("echo", "version"),
                                 search="version",
                                 checks=str)
    cmd = AtomicCmd("true",
                    CHECK_VERSION=requirement)
    assert_equal(cmd.requirements, frozenset([requirement]))


# CHECK_ is expected to be a callable
@nose.tools.raises(TypeError)
def test_atomiccmd__checks__non_callable():
    AtomicCmd("ls", CHECK_FOO="ls")


# EXEC_ is expected to be a string
def test_atomiccmd__exec__invalid():
    @nose.tools.raises(TypeError)
    def _check_rejected(obj):
        AtomicCmd("true", EXEC_FOO=obj)

    for obj in (str, {}, 1):
        yield _check_rejected, obj


###############################################################################
###############################################################################
# AUX

def test_atomiccmd__aux__invalid():
    # AUX_ files must likewise be given as strings.
    @nose.tools.raises(TypeError)
    def _check_rejected(obj):
        AtomicCmd("true", AUX_FOO=obj)

    for obj in (str, {}, 1):
        yield _check_rejected, obj
+
+
+###############################################################################
+###############################################################################
+# Path components
+
@with_temp_folder
def test_atomiccmd__paths_non_str(temp_folder):
    # Non-string path arguments (here the int 1234) are accepted and the
    # resulting output file is produced as expected.
    cmd = AtomicCmd(("touch", 1234),
                    OUT_FOO="1234",
                    set_cwd=True)
    cmd.run(temp_folder)
    assert_equal(cmd.join(), [0])
    assert os.path.exists(os.path.join(temp_folder, "1234"))


@nose.tools.raises(CmdError)
def test_atomiccmd__paths_missing():
    # Referencing an undeclared key must fail at construction time.
    AtomicCmd(("touch", "%(IN_FOO)s"))


@nose.tools.raises(CmdError)
def test_atomiccmd__paths_invalid():
    # Malformed string interpolation must fail at construction time.
    AtomicCmd(("touch", "%(IN_FOO)"),
              IN_FOO="abc")


@with_temp_folder
def test_atomiccmd__paths__key(temp_folder):
    # The %(TEMP_DIR)s key expands to the folder passed to run().
    cmd = AtomicCmd(("echo", "-n", "%(TEMP_DIR)s"),
                    OUT_STDOUT=AtomicCmd.PIPE)
    cmd.run(temp_folder)
    stdout_path = cmd._proc.stdout.read()
    assert os.path.samefile(temp_folder, stdout_path), (temp_folder, stdout_path)
    assert_equal(cmd.join(), [0])
+
+
+###############################################################################
+###############################################################################
+# Constructor: Piping commands
+
@with_temp_folder
def test_atomiccmd__piping(temp_folder):
    producer = AtomicCmd(["echo", "-n", "#@!$^"],
                         OUT_STDOUT=AtomicCmd.PIPE)
    assert_equal(producer.output_files, frozenset())
    consumer = AtomicCmd(["cat"],
                         IN_STDIN=producer,
                         OUT_STDOUT="piped.txt")
    assert_equal(consumer.input_files, frozenset())
    producer.run(temp_folder)
    consumer.run(temp_folder)
    assert_equal(producer.join(), [0])
    assert_equal(consumer.join(), [0])
    observed = get_file_contents(os.path.join(temp_folder, "piped.txt"))
    assert_equal(observed, "#@!$^")


@with_temp_folder
def test_atomiccmd__piping_temp(temp_folder):
    # Same as above, but using the TEMP_ variants of the pipe keys.
    producer = AtomicCmd(["echo", "-n", "#@!$^"],
                         TEMP_OUT_STDOUT=AtomicCmd.PIPE)
    assert_equal(producer.output_files, frozenset())
    consumer = AtomicCmd(["cat"],
                         TEMP_IN_STDIN=producer,
                         OUT_STDOUT="piped.txt")
    assert_equal(consumer.input_files, frozenset())
    producer.run(temp_folder)
    consumer.run(temp_folder)
    assert_equal(producer.join(), [0])
    assert_equal(consumer.join(), [0])
    observed = get_file_contents(os.path.join(temp_folder, "piped.txt"))
    assert_equal(observed, "#@!$^")


# Only STDOUT takes AtomicCmd.PIPE
def test_atomiccmd__piping__wrong_pipe():
    def _check_rejected(key):
        assert_raises(TypeError, AtomicCmd, "ls", **{key: AtomicCmd.PIPE})

    for key in ("IN_STDIN", "TEMP_IN_STDIN", "OUT_STDERR", "TEMP_OUT_STDERR"):
        yield _check_rejected, key


@with_temp_folder
def test_atomiccmd__piping_is_only_allowed_once(temp_folder):
    producer = AtomicCmd(["echo", "-n", "foo\nbar"],
                         OUT_STDOUT=AtomicCmd.PIPE)
    consumer_a = AtomicCmd(["grep", "foo"],
                           IN_STDIN=producer)
    consumer_b = AtomicCmd(["grep", "bar"],
                           IN_STDIN=producer)
    producer.run(temp_folder)
    consumer_a.run(temp_folder)
    # The pipe is consumed by the first reader; a second reader must fail.
    assert_raises(CmdError, consumer_b.run, temp_folder)
    assert_equal(producer.join(), [0])
    assert_equal(consumer_a.join(), [0])
    assert_equal(consumer_b.join(), [None])
+
+
+###############################################################################
+###############################################################################
+# run
+
@with_temp_folder
def test_atomiccmd__run__already_running(temp_folder):
    cmd = AtomicCmd(("sleep", "10"))
    cmd.run(temp_folder)
    # Starting a command that is already running is an error.
    assert_raises(CmdError, cmd.run, temp_folder)
    cmd.terminate()
    cmd.join()


@with_temp_folder
def test_atomiccmd__run__exception_on_missing_command(temp_folder):
    cmd = AtomicCmd(("xyzabcefgh", "10"))
    assert_raises(CmdError, cmd.run, temp_folder)
    cmd.terminate()
    cmd.join()


@with_temp_folder
def test_atomiccmd__run__exception_on_missing_command__no_wrap(temp_folder):
    # With wrap_errors disabled, the raw OSError escapes instead.
    cmd = AtomicCmd(("xyzabcefgh", "10"))
    assert_raises(OSError, cmd.run, temp_folder, wrap_errors=False)
    cmd.terminate()
    cmd.join()


@with_temp_folder
def test_atomiccmd__run__invalid_temp(temp_folder):
    # Running in a non-existing temp folder is an error.
    cmd = AtomicCmd(("sleep", "10"))
    assert_raises(CmdError, cmd.run, os.path.join(temp_folder, "foo"))
    cmd.terminate()
    cmd.join()


###############################################################################
###############################################################################
# Ready

@with_temp_folder
def test_atomiccmd__ready(temp_folder):
    cmd = AtomicCmd("ls")
    # Not ready before the command has been started ...
    assert_equal(cmd.join(), [None])
    assert not cmd.ready()
    # ... but ready once it has run to completion.
    cmd.run(temp_folder)
    assert_equal(cmd.join(), [0])
    assert cmd.ready()


###############################################################################
###############################################################################
# Join / wait

def test_atomiccmd__join_wait():
    @with_temp_folder
    def _check_result(temp_folder, func, call, before_run, after_run):
        cmd = AtomicCmd(call)
        assert_equal(func(cmd), before_run)
        cmd.run(temp_folder)
        assert_equal(func(cmd), after_run)

    for (func, call, before, after) in ((AtomicCmd.join, "true", [None], [0]),
                                        (AtomicCmd.join, "false", [None], [1]),
                                        (AtomicCmd.wait, "true", None, 0),
                                        (AtomicCmd.wait, "false", None, 1)):
        yield _check_result, func, call, before, after
+
+
+###############################################################################
+###############################################################################
+# Terminate
+
def test_atomiccmd__terminate():
    @with_temp_folder
    def _check_terminate(temp_folder, raise_on_terminate):
        cmd = AtomicCmd(("sleep", "10"))
        cmd.run(temp_folder)

        killpg_was_called = []

        def _fake_killpg(pid, sig):
            # terminate() is expected to SIGTERM the child's process group.
            assert_equal(pid, cmd._proc.pid)
            assert_equal(sig, signal.SIGTERM)
            killpg_was_called.append(True)
            if raise_on_terminate:
                raise OSError("KABOOM!")

        with Monkeypatch("os.killpg", _fake_killpg):
            # terminate() must be idempotent, and must survive an OSError
            # raised by the underlying killpg call.
            cmd.terminate()
            cmd.terminate()
            assert_equal(cmd.join(), ["SIGTERM"])
            assert killpg_was_called

    for raise_on_terminate in (False, True):
        yield _check_terminate, raise_on_terminate


# Ensure that no OSException is raised, even if the command
# managed to finish before terminate was called
@with_temp_folder
def test_atomiccmd__terminate_race_condition(temp_folder):
    cmd = AtomicCmd("true")
    cmd.run(temp_folder)
    # Busy-wait until the child has actually exited.
    while cmd._proc.poll() is None:
        pass
    cmd.terminate()
    assert_equal(cmd.join(), [0])


# Calling terminate on an already joined command is acceptable ...
@with_temp_folder
def test_atomiccmd__terminate_after_join(temp_folder):
    cmd = AtomicCmd("true")
    cmd.run(temp_folder)
    assert_equal(cmd.join(), [0])
    cmd.terminate()
    assert_equal(cmd.join(), [0])


# Signals are translated into strings
@with_temp_folder
def test_atomiccmd__terminate_sigterm(temp_folder):
    cmd = AtomicCmd(("sleep", "10"))
    cmd.run(temp_folder)
    cmd.terminate()
    assert_equal(cmd.join(), ["SIGTERM"])


@with_temp_folder
def test_atomiccmd__terminate_sigkill(temp_folder):
    cmd = AtomicCmd(("sleep", "10"))
    cmd.run(temp_folder)
    cmd._proc.kill()
    assert_equal(cmd.join(), ["SIGKILL"])
+
+
+###############################################################################
+###############################################################################
+# commit
+
+def _setup_for_commit(temp_folder, create_cmd=True):
+ destination = os.path.join(temp_folder, "out")
+ temp_folder = os.path.join(temp_folder, "tmp")
+ os.makedirs(destination)
+ os.makedirs(temp_folder)
+
+ if not create_cmd:
+ return destination, temp_folder
+
+ cmd = AtomicCmd(("touch", "%(OUT_FOO)s"),
+ OUT_FOO=os.path.join(destination, "1234"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+
+ return destination, temp_folder, cmd
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_simple(temp_folder):
+ destination, temp_folder, cmd = _setup_for_commit(temp_folder)
+ cmd.commit(temp_folder)
+ assert not os.path.exists(os.path.join(temp_folder, "1234"))
+ assert os.path.exists(os.path.join(destination, "1234"))
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_temp_out(temp_folder):
+ dest, temp = _setup_for_commit(temp_folder, create_cmd=False)
+ cmd = AtomicCmd(("echo", "foo"),
+ OUT_STDOUT=os.path.join(dest, "foo.txt"),
+ TEMP_OUT_FOO="bar.txt")
+ cmd.run(temp)
+ assert_equal(cmd.join(), [0])
+ set_file_contents(os.path.join(temp, "bar.txt"), "1 2 3")
+ cmd.commit(temp)
+ assert_equal(os.listdir(temp), [])
+ assert_equal(os.listdir(dest), ["foo.txt"])
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_temp_only(temp_folder):
+ cmd = AtomicCmd(("echo", "foo"),
+ TEMP_OUT_STDOUT="bar.txt")
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ assert os.path.exists(os.path.join(temp_folder, "bar.txt"))
+ cmd.commit(temp_folder)
+ assert_equal(os.listdir(temp_folder), [])
+
+
+def test_atomiccmd__commit_before_run():
+ cmd = AtomicCmd("true")
+ assert_raises(CmdError, cmd.commit, "/tmp")
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_while_running(temp_folder):
+ cmd = AtomicCmd(("sleep", "10"))
+ cmd.run(temp_folder)
+ assert_raises(CmdError, cmd.commit, temp_folder)
+ cmd.terminate()
+ cmd.join()
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_before_join(temp_folder):
+ cmd = AtomicCmd(("sleep", "0.1"))
+ cmd.run(temp_folder)
+ while cmd._proc.poll() is None:
+ pass
+ assert_raises(CmdError, cmd.commit, temp_folder)
+ cmd.join()
+
+
+# The temp path might differ, as long as the actual path is the same
+ at with_temp_folder
+def test_atomiccmd__commit_temp_folder(temp_folder):
+ destination, temp_folder, cmd = _setup_for_commit(temp_folder)
+ cmd.commit(os.path.realpath(temp_folder))
+ assert not os.path.exists(os.path.join(temp_folder, "1234"))
+ assert os.path.exists(os.path.join(destination, "1234"))
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_wrong_temp_folder(temp_folder):
+ destination, temp_folder, cmd = _setup_for_commit(temp_folder)
+ assert_raises(CmdError, cmd.commit, destination)
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_missing_files(temp_folder):
+ # Commit must fail when a registered output file (OUT_BAR -> "4567") was
+ # never created by the command, and must leave the temp folder untouched.
+ destination, temp_folder = _setup_for_commit(temp_folder, False)
+ cmd = AtomicCmd(("touch", "%(OUT_FOO)s"),
+ OUT_FOO=os.path.join(destination, "1234"),
+ OUT_BAR=os.path.join(destination, "4567"))
+ cmd.run(temp_folder)
+ cmd.join()
+ before = set(os.listdir(temp_folder))
+ assert_raises(CmdError, cmd.commit, temp_folder)
+ # Nothing may have been moved or deleted by the failed commit
+ assert_equal(before, set(os.listdir(temp_folder)))
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_failure_cleanup(temp_folder):
+ # If moving one of several output files fails mid-commit, no files may be
+ # left behind in the destination folder (commit must appear atomic).
+ counter = []
+ move_file = fileutils.move_file
+
+ def _monkey_move_file(source, destination):
+ # Let the first move succeed, then fail every subsequent move
+ if counter:
+ raise OSError("ARRRGHHH!")
+ counter.append(destination)
+
+ return move_file(source, destination)
+
+ destination, temp_folder = _setup_for_commit(temp_folder, False)
+ command = AtomicCmd(("touch", "%(OUT_FILE_1)s", "%(OUT_FILE_2)s",
+ "%(OUT_FILE_3)s"),
+ OUT_FILE_1=os.path.join(destination, "file_1"),
+ OUT_FILE_2=os.path.join(destination, "file_2"),
+ OUT_FILE_3=os.path.join(destination, "file_3"))
+
+ try:
+ fileutils.move_file = _monkey_move_file
+ command.run(temp_folder)
+ assert_equal(command.join(), [0])
+ assert_raises(OSError, command.commit, temp_folder)
+
+ # Even the file that WAS moved must have been cleaned up again
+ assert_equal(tuple(os.listdir(destination)), ())
+ finally:
+ # Always restore the real move_file, or later tests would break
+ fileutils.move_file = move_file
+
+
+ at with_temp_folder
+def test_atomiccmd__commit_with_pipes(temp_folder):
+ # Two commands connected via AtomicCmd.PIPE share an implicit pipe file;
+ # committing both must remove all temporary files, leaving only the
+ # final output (foo.gz) in the destination folder.
+ destination, temp_folder = _setup_for_commit(temp_folder, False)
+ command_1 = AtomicCmd(("echo", "Hello, World!"),
+ OUT_STDOUT=AtomicCmd.PIPE)
+ command_2 = AtomicCmd(("gzip",),
+ IN_STDIN=command_1,
+ OUT_STDOUT=os.path.join(destination, "foo.gz"))
+
+ command_1.run(temp_folder)
+ command_2.run(temp_folder)
+
+ assert_equal(command_1.join(), [0])
+ assert_equal(command_2.join(), [0])
+
+ command_1.commit(temp_folder)
+ command_2.commit(temp_folder)
+
+ assert_equal(set(os.listdir(destination)), set(("foo.gz",)))
+ assert_equal(set(os.listdir(temp_folder)), set())
+
+
+###############################################################################
+###############################################################################
+# __str__
+# Additional tests in atomicpp_test.py
+
+def test_atomiccmd__str__():
+ cmd = AtomicCmd(("echo", "test"))
+ assert_equal(paleomix.atomiccmd.pprint.pformat(cmd), str(cmd))
+
+
+###############################################################################
+###############################################################################
+# Cleanup
+
+# Test that the internal list of processes is kept clean of old objects
+def test_atomiccmd__cleanup_proc():
+ @with_temp_folder
+ def _do_test_atomiccmd__cleanup_proc(temp_folder, func):
+ assert_equal(paleomix.atomiccmd.command._PROCS, set())
+ cmd = AtomicCmd("ls")
+ cmd.run(temp_folder)
+ ref = iter(paleomix.atomiccmd.command._PROCS).next()
+ assert ref
+ assert_equal(ref(), cmd._proc)
+
+ assert_equal(cmd.join(), [0])
+ cmd = func(cmd, temp_folder)
+
+ assert ref not in paleomix.atomiccmd.command._PROCS
+
+ def _do_commit(cmd, temp_folder):
+ # Trigger freeing of proc
+ cmd.commit(temp_folder)
+ return cmd
+
+ # The proc object should be released when commit is called
+ yield _do_test_atomiccmd__cleanup_proc, _do_commit
+ # The proc object should be released when the cmd object is released
+ yield _do_test_atomiccmd__cleanup_proc, lambda _cmd, _temp_folder: None
+
+
+def test_atomiccmd__cleanup_sigterm():
+ # The SIGTERM cleanup handler must forward the signal to every registered
+ # process group exactly once, continue past already-dead processes (which
+ # raise OSError from killpg), and finally exit with -SIGTERM. 'kill_at'
+ # selects which mock (0, 1, or none at index 2) simulates a dead process.
+ def _do_test_atomiccmd__cleanup_sigterm(kill_at):
+ sigs_sent, exit_called = {}, []
+
+ def _wrap_killpg(pid, sig):
+ # Each pid must be signalled at most once
+ assert pid not in sigs_sent
+ do_kill = len(sigs_sent) == kill_at
+ sigs_sent[pid] = (sig, do_kill)
+
+ # Simulate already terminated processes; cleanup should continue
+ if do_kill:
+ raise OSError("KABOOM!")
+
+ def _wrap_exit(rc):
+ exit_called.append(rc)
+
+ _procs = [flexmock(pid=7913),
+ # I've got the same combination on my luggage!
+ flexmock(pid=12345)]
+
+ assert not paleomix.atomiccmd.command._PROCS
+ with Monkeypatch("paleomix.atomiccmd.command._PROCS", _procs):
+ assert_equal(len(paleomix.atomiccmd.command._PROCS), 2)
+ with Monkeypatch("os.killpg", _wrap_killpg):
+ with Monkeypatch("sys.exit", _wrap_exit):
+ paleomix.atomiccmd.command._cleanup_children(signal.SIGTERM, None)
+
+ assert_equal(exit_called, [-signal.SIGTERM])
+ assert_equal(sigs_sent, {7913: (signal.SIGTERM, kill_at == 0),
+ 12345: (signal.SIGTERM, kill_at == 1)})
+
+ yield _do_test_atomiccmd__cleanup_sigterm, 0
+ yield _do_test_atomiccmd__cleanup_sigterm, 1
+ yield _do_test_atomiccmd__cleanup_sigterm, 2
+
+
+# Ensure that the cleanup function handles weakrefs that have been freed
+def test_atomiccmd__cleanup_sigterm__dead_weakrefs():
+ exit_called = []
+ # The Monkeypatch instance is garbage immediately, so this weakref is
+ # already dead by the time the cleanup handler sees it
+ procs_wrapper = [weakref.ref(Monkeypatch("sys.exit", None))]
+
+ assert_equal(procs_wrapper[0](), None)
+
+ def _wrap_killpg(_pid, _sig):
+ # A dead weakref must never be signalled
+ assert False # pragma: no coverage
+
+ def _wrap_exit(rc):
+ exit_called.append(rc)
+
+ with Monkeypatch("paleomix.atomiccmd.command._PROCS", procs_wrapper):
+ with Monkeypatch("os.killpg", _wrap_killpg):
+ with Monkeypatch("sys.exit", _wrap_exit):
+ paleomix.atomiccmd.command._cleanup_children(signal.SIGTERM, None)
+ assert_equal(exit_called, [-signal.SIGTERM])
diff --git a/tests/atomiccmd_test/pprint_test.py b/tests/atomiccmd_test/pprint_test.py
new file mode 100644
index 0000000..698989b
--- /dev/null
+++ b/tests/atomiccmd_test/pprint_test.py
@@ -0,0 +1,396 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import StringIO
+
+import nose
+from nose.tools import assert_equal
+from paleomix.common.testing import with_temp_folder
+
+from paleomix.atomiccmd.command import AtomicCmd
+from paleomix.atomiccmd.sets import ParallelCmds, SequentialCmds
+from paleomix.atomiccmd.pprint import pprint, pformat, _pformat_list
+
+
+###############################################################################
+###############################################################################
+# pprint
+
+def test_pprint__simple():
+ obj = StringIO.StringIO()
+ cmd = AtomicCmd(("touch", "something"))
+ pprint(cmd, out=obj)
+ assert_equal(obj.getvalue(), ("<Command = ['touch', 'something']\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>\n") % (id(cmd), id(cmd)))
+
+
+###############################################################################
+###############################################################################
+# pformat
+
+def test_pformat__simple():
+ cmd = AtomicCmd(("touch", "something"))
+ assert_equal(pformat(cmd), ("<Command = ['touch', 'something']\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+ at with_temp_folder
+def test_pformat__simple__running(temp_folder):
+ cmd = AtomicCmd(("sleep", "10"))
+ cmd.run(temp_folder)
+ assert_equal(pformat(cmd), ("<Command = ['sleep', '10']\n"
+ " Status = Running ...\n"
+ " STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
+ " STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
+ " CWD = '{cwd}'>").format(id=id(cmd),
+ cwd=os.getcwd(),
+ temp_dir=temp_folder))
+ cmd.terminate()
+ cmd.join()
+
+
+ at with_temp_folder
+def test_pformat__simple__running__set_cwd(temp_folder):
+ cmd = AtomicCmd(("sleep", "10"), set_cwd=True)
+ cmd.run(temp_folder)
+ assert_equal(pformat(cmd), ("<Command = ['sleep', '10']\n"
+ " Status = Running ...\n"
+ " STDOUT* = 'pipe_sleep_{id}.stdout'\n"
+ " STDERR* = 'pipe_sleep_{id}.stderr'\n"
+ " CWD = '{temp_dir}'>").format(id=id(cmd),
+ temp_dir=temp_folder))
+ cmd.terminate()
+ cmd.join()
+
+
+ at with_temp_folder
+def test_pformat__simple__done(temp_folder):
+ cmd = AtomicCmd("true")
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ assert_equal(pformat(cmd), ("<Command = ['true']\n"
+ " Status = Exited with return-code 0\n"
+ " STDOUT* = '{temp_dir}/pipe_true_{id}.stdout'\n"
+ " STDERR* = '{temp_dir}/pipe_true_{id}.stderr'\n"
+ " CWD = '{cwd}'>").format(id=id(cmd),
+ cwd=os.getcwd(),
+ temp_dir=temp_folder))
+
+
+ at with_temp_folder
+def test_pformat__simple__done__before_join(temp_folder):
+ cmd = AtomicCmd("true")
+ cmd.run(temp_folder)
+ cmd._proc.wait()
+ assert_equal(pformat(cmd), ("<Command = ['true']\n"
+ " Status = Exited with return-code 0\n"
+ " STDOUT* = '{temp_dir}/pipe_true_{id}.stdout'\n"
+ " STDERR* = '{temp_dir}/pipe_true_{id}.stderr'\n"
+ " CWD = '{cwd}'>").format(id=id(cmd),
+ cwd=os.getcwd(),
+ temp_dir=temp_folder))
+ assert_equal(cmd.join(), [0])
+
+
+ at with_temp_folder
+def test_pformat__simple__done__set_cwd(temp_folder):
+ cmd = AtomicCmd("true", set_cwd=True)
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ assert_equal(pformat(cmd), ("<Command = ['true']\n"
+ " Status = Exited with return-code 0\n"
+ " STDOUT* = 'pipe_true_{id}.stdout'\n"
+ " STDERR* = 'pipe_true_{id}.stderr'\n"
+ " CWD = '{temp_dir}'>").format(id=id(cmd),
+ temp_dir=temp_folder))
+
+
+ at with_temp_folder
+def test_pformat__simple__killed(temp_folder):
+ cmd = AtomicCmd(("sleep", "10"))
+ cmd.run(temp_folder)
+ cmd.terminate()
+ assert_equal(cmd.join(), ["SIGTERM"])
+ assert_equal(pformat(cmd), ("<Command = ['sleep', '10']\n"
+ " Status = Terminated with signal SIGTERM\n"
+ " STDOUT* = '{temp_dir}/pipe_sleep_{id}.stdout'\n"
+ " STDERR* = '{temp_dir}/pipe_sleep_{id}.stderr'\n"
+ " CWD = '{cwd}'>").format(id=id(cmd),
+ temp_dir=temp_folder,
+ cwd=os.getcwd()))
+
+
+###############################################################################
+###############################################################################
+# INFILE
+
+def test_pformat__atomiccmd__simple_with_infile():
+ cmd = AtomicCmd(("cat", "%(IN_SOMETHING)s"), IN_SOMETHING="/etc/fstab")
+ assert_equal(pformat(cmd), ("<Command = ['cat', '/etc/fstab']\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_cat_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_cat_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_infile__set_cwd():
+ cmd = AtomicCmd(("cat", "%(IN_SOMETHING)s"),
+ IN_SOMETHING="/etc/fstab",
+ set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['cat', '/etc/fstab']\n"
+ " STDOUT* = 'pipe_cat_%i.stdout'\n"
+ " STDERR* = 'pipe_cat_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_temp_infile():
+ cmd = AtomicCmd(("cat", "%(TEMP_IN_FILE)s"),
+ TEMP_IN_FILE="infile.txt")
+ assert_equal(pformat(cmd), ("<Command = ['cat', '${TEMP_DIR}/infile.txt']\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_cat_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_cat_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_temp_infile__set_cwd():
+ cmd = AtomicCmd(("zcat", "%(TEMP_IN_FILE)s"),
+ TEMP_IN_FILE="infile.gz",
+ set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['zcat', 'infile.gz']\n"
+ " STDOUT* = 'pipe_zcat_%i.stdout'\n"
+ " STDERR* = 'pipe_zcat_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+
+###############################################################################
+###############################################################################
+# OUTFILE
+
+def test_pformat__atomiccmd__simple_with_outfile():
+ cmd = AtomicCmd(("touch", "%(OUT_RC)s"), OUT_RC="/etc/bashrc")
+ assert_equal(pformat(cmd), ("<Command = ['touch', '${TEMP_DIR}/bashrc']\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_outfile__set_cwd():
+ cmd = AtomicCmd(("touch", "%(OUT_RC)s"), OUT_RC="/etc/bashrc", set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['touch', 'bashrc']\n"
+ " STDOUT* = 'pipe_touch_%i.stdout'\n"
+ " STDERR* = 'pipe_touch_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_temp_outfile():
+ cmd = AtomicCmd(("touch", "%(TEMP_OUT_RC)s"), TEMP_OUT_RC="bashrc")
+ assert_equal(pformat(cmd), ("<Command = ['touch', '${TEMP_DIR}/bashrc']\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_touch_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_touch_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_temp_outfile__set_cwd():
+ cmd = AtomicCmd(("touch", "%(TEMP_OUT_RC)s"), TEMP_OUT_RC="bashrc", set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['touch', 'bashrc']\n"
+ " STDOUT* = 'pipe_touch_%i.stdout'\n"
+ " STDERR* = 'pipe_touch_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+
+###############################################################################
+###############################################################################
+# STDIN
+
+def test_pformat__atomiccmd__simple_with_stdin():
+ cmd = AtomicCmd("gzip", IN_STDIN="/etc/fstab")
+ assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
+ " STDIN = '/etc/fstab'\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_stdin__set_cwd():
+ cmd = AtomicCmd("gzip", IN_STDIN="/etc/fstab", set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
+ " STDIN = '/etc/fstab'\n"
+ " STDOUT* = 'pipe_gzip_%i.stdout'\n"
+ " STDERR* = 'pipe_gzip_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_temp_stdin():
+ cmd = AtomicCmd("gzip", TEMP_IN_STDIN="stabstabstab")
+ assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
+ " STDIN* = '${TEMP_DIR}/stabstabstab'\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_temp_stdin__set_cwd():
+ cmd = AtomicCmd("gzip", TEMP_IN_STDIN="stabstabstab", set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['gzip']\n"
+ " STDIN* = 'stabstabstab'\n"
+ " STDOUT* = 'pipe_gzip_%i.stdout'\n"
+ " STDERR* = 'pipe_gzip_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd), id(cmd)))
+
+
+def test_pformat__atomiccmd__simple_with_stdin__cmd():
+ cmd_1 = AtomicCmd("gzip", OUT_STDOUT=AtomicCmd.PIPE)
+ cmd_2 = AtomicCmd("gzip", IN_STDIN=cmd_1)
+ assert_equal(pformat(cmd_2), ("<Command = ['gzip']\n"
+ " STDIN = <PIPE>\n"
+ " STDOUT* = '${TEMP_DIR}/pipe_gzip_%i.stdout'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_gzip_%i.stderr'>") % (id(cmd_2), id(cmd_2)))
+
+
+###############################################################################
+###############################################################################
+# STDOUT
+
+def test_pformat__atomiccmd__simple_with_stdout():
+ cmd = AtomicCmd(("echo", "Water. Water."), OUT_STDOUT="/dev/ls")
+ assert_equal(pformat(cmd), ("<Command = ['echo', 'Water. Water.']\n"
+ " STDOUT = '${TEMP_DIR}/ls'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'>") % (id(cmd),))
+
+
+def test_pformat__atomiccmd__simple_with_stdout__set_cwd():
+ cmd = AtomicCmd(("echo", "*pant*. *pant*."), OUT_STDOUT="/dev/barf", set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['echo', '*pant*. *pant*.']\n"
+ " STDOUT = 'barf'\n"
+ " STDERR* = 'pipe_echo_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd),))
+
+
+def test_pformat__atomiccmd__simple_with_temp_stdout():
+ cmd = AtomicCmd(("echo", "Oil. Oil."), TEMP_OUT_STDOUT="dm")
+ assert_equal(pformat(cmd), ("<Command = ['echo', 'Oil. Oil.']\n"
+ " STDOUT* = '${TEMP_DIR}/dm'\n"
+ " STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'>") % (id(cmd),))
+
+
+def test_pformat__atomiccmd__simple_with_temp_stdout__set_cwd():
+ cmd = AtomicCmd(("echo", "Room service. Room service."),
+ TEMP_OUT_STDOUT="pv",
+ set_cwd=True)
+ assert_equal(pformat(cmd), ("<Command = ['echo', 'Room service. Room service.']\n"
+ " STDOUT* = 'pv'\n"
+ " STDERR* = 'pipe_echo_%i.stderr'\n"
+ " CWD = '${TEMP_DIR}'>") % (id(cmd),))
+
+
+def test_pformat__atomiccmd__simple_with_stdout_pipe():
+ cmd = AtomicCmd(("echo", "!"), OUT_STDOUT=AtomicCmd.PIPE)
+ assert_equal(pformat(cmd), ("<Command = ['echo', '!']\n"
+ " STDOUT = <PIPE>\n"
+ " STDERR* = '${TEMP_DIR}/pipe_echo_%i.stderr'>") % (id(cmd),))
+
+
+###############################################################################
+###############################################################################
+# ParallelCmds
+
+def test_pformat__sets__simple():
+ def _do_test_pformat__sets__simple(cls, description):
+ cmd_1 = AtomicCmd(("echo", "foo"), OUT_STDOUT=AtomicCmd.PIPE)
+ cmd_2 = AtomicCmd("gzip", IN_STDIN=cmd_1)
+ cmd = cls((cmd_1, cmd_2))
+ assert_equal(pformat(cmd),
+ ("<{description}:\n"
+ " - <00> Command = ['echo', 'foo']\n"
+ " STDOUT = <01>\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1_id}.stderr'\n"
+ " - <01> Command = ['gzip']\n"
+ " STDIN = <00>\n"
+ " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stdout'\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stderr'>")
+ .format(description=description,
+ cmd_1_id=id(cmd_1),
+ cmd_2_id=id(cmd_2)))
+ yield _do_test_pformat__sets__simple, ParallelCmds, "Parallel commands"
+ yield _do_test_pformat__sets__simple, SequentialCmds, "Sequential commands"
+
+
+def test_pformat__sets__nested():
+ cmd_1 = AtomicCmd(("echo", "foo"), OUT_STDOUT=AtomicCmd.PIPE)
+ cmd_2 = AtomicCmd("gzip", IN_STDIN=cmd_1)
+ cmd_3 = AtomicCmd("sha1sum")
+ set_1 = ParallelCmds((cmd_1, cmd_2))
+ set_2 = SequentialCmds((set_1, cmd_3))
+ assert_equal(pformat(set_2),
+ ("<Sequential commands:\n"
+ " - Parallel commands:\n"
+ " - <00> Command = ['echo', 'foo']\n"
+ " STDOUT = <01>\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_echo_{cmd_1_id}.stderr'\n"
+ " - <01> Command = ['gzip']\n"
+ " STDIN = <00>\n"
+ " STDOUT* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stdout'\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_gzip_{cmd_2_id}.stderr'\n"
+ " - <02> Command = ['sha1sum']\n"
+ " STDOUT* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3_id}.stdout'\n"
+ " STDERR* = '${{TEMP_DIR}}/pipe_sha1sum_{cmd_3_id}.stderr'>")
+ .format(cmd_1_id=id(cmd_1),
+ cmd_2_id=id(cmd_2),
+ cmd_3_id=id(cmd_3)))
+
+
+###############################################################################
+###############################################################################
+# Bad input
+
+def test_pformat__bad_input():
+ @nose.tools.raises(TypeError)
+ def _do_test_pformat__bad_input(value):
+ pformat(value)
+ yield _do_test_pformat__bad_input, 1
+ yield _do_test_pformat__bad_input, {}
+ yield _do_test_pformat__bad_input, ""
+
+
+###############################################################################
+###############################################################################
+# _pformat_list
+
+def test_pformat_list__empty():
+ assert_equal(_pformat_list([]), "[]")
+
+
+def test_pformat_list__single():
+ assert_equal(_pformat_list([3]), "[3]")
+
+
+def test_pformat_list__multiple():
+ assert_equal(_pformat_list([3, 2, 1]), "[3, 2, 1]")
+
+
+def test_pformat_list__wrapped():
+ assert_equal(_pformat_list([3, 2, 1], width=1), "[3,\n 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=2), "[3,\n 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=3), "[3,\n 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=4), "[3,\n 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=5), "[3,\n 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=6), "[3, 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=7), "[3, 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=8), "[3, 2,\n 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=9), "[3, 2, 1]")
+ assert_equal(_pformat_list([3, 2, 1], width=10), "[3, 2, 1]")
diff --git a/tests/atomiccmd_test/sets_test.py b/tests/atomiccmd_test/sets_test.py
new file mode 100644
index 0000000..fecf702
--- /dev/null
+++ b/tests/atomiccmd_test/sets_test.py
@@ -0,0 +1,340 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Disable warnings for weird function names
+# pylint: disable=C0103
+# Disable warnings caused by flexmock setups ("X is assigned to nothing")
+# pylint: disable=W0106
+
+import nose
+import nose.tools
+from nose.tools import assert_equal, assert_raises
+from paleomix.common.testing import with_temp_folder
+
+from flexmock import flexmock
+
+import paleomix.atomiccmd.pprint
+from paleomix.atomiccmd.command import AtomicCmd, CmdError
+from paleomix.atomiccmd.sets import ParallelCmds, SequentialCmds
+
+
+###############################################################################
+###############################################################################
+# Properties with same expected behavior for both Parallel/SequentialCmds
+
+def test_atomicsets__properties():
+ # The file/requirement properties of a command set must be the union of
+ # the corresponding properties of its member commands; this holds for
+ # both ParallelCmds and SequentialCmds.
+ def _do_test(cls):
+ cmd_mock_1 = AtomicCmd(("true",),
+ CHECK_A=id,
+ EXEC_1="false",
+ IN_1="/foo/bar/in_1.file",
+ IN_2="/foo/bar/in_2.file",
+ OUT_1="/bar/foo/out",
+ TEMP_OUT_1="out.log",
+ AUX_A="/aux/fA",
+ AUX_B="/aux/fB")
+ cmd_mock_2 = AtomicCmd(("false",),
+ CHECK_A=list,
+ EXEC_1="echo",
+ EXEC_2="java",
+ IN_1="/foo/bar/in.file",
+ OUT_1="out.txt")
+
+ obj = cls([cmd_mock_1, cmd_mock_2])
+ assert_equal(obj.executables, cmd_mock_1.executables | cmd_mock_2.executables)
+ assert_equal(obj.requirements, cmd_mock_1.requirements | cmd_mock_2.requirements)
+ assert_equal(obj.input_files, cmd_mock_1.input_files | cmd_mock_2.input_files)
+ assert_equal(obj.output_files, cmd_mock_1.output_files | cmd_mock_2.output_files)
+ assert_equal(obj.auxiliary_files, cmd_mock_1.auxiliary_files | cmd_mock_2.auxiliary_files)
+ # Expected temp files are the basenames of non-temporary output files
+ assert_equal(obj.expected_temp_files, frozenset(["out", "out.txt"]))
+ assert_equal(obj.optional_temp_files, cmd_mock_1.optional_temp_files | cmd_mock_2.optional_temp_files)
+
+ for cls in (ParallelCmds, SequentialCmds):
+ yield _do_test, cls
+
+
+# Ensure that commands in a set don't clobber each other's OUT files
+def test_atomicsets__no_clobbering():
+ def _do_test_atomicsets__no_clobbering(cls, kwargs_1, kwargs_2):
+ cmd_1 = AtomicCmd("true", **kwargs_1)
+ cmd_2 = AtomicCmd("true", **kwargs_2)
+ assert_raises(CmdError, cls, [cmd_1, cmd_2])
+
+ for cls in (ParallelCmds, SequentialCmds):
+ yield _do_test_atomicsets__no_clobbering, cls, {"OUT_A": "/foo/out.txt"}, {"OUT_B": "/bar/out.txt"}
+ yield _do_test_atomicsets__no_clobbering, cls, {"OUT_A": "/foo/out.txt"}, {"TEMP_OUT_B": "out.txt"}
+ yield _do_test_atomicsets__no_clobbering, cls, {"OUT_A": "/foo/out.txt"}, {"OUT_STDOUT": "/bar/out.txt"}
+ yield _do_test_atomicsets__no_clobbering, cls, {"OUT_A": "/foo/out.txt"}, {"TEMP_OUT_STDOUT": "out.txt"}
+ yield _do_test_atomicsets__no_clobbering, cls, {"OUT_A": "/foo/out.txt"}, {"OUT_STDERR": "/bar/out.txt"}
+ yield _do_test_atomicsets__no_clobbering, cls, {"OUT_A": "/foo/out.txt"}, {"TEMP_OUT_STDERR": "out.txt"}
+
+
+###############################################################################
+###############################################################################
+# Functions with same expected behavior for both Parallel/SequentialCmds
+
+def test_atomicsets__commit():
+ def _do_test_atomicsets__commit(cls):
+ mocks = []
+ for _ in range(3):
+ cmd_mock = flexmock(AtomicCmd(["ls"]))
+ cmd_mock.should_receive('commit').with_args("xTMPx").once.ordered
+ mocks.append(cmd_mock)
+
+ cls(mocks).commit("xTMPx")
+
+ yield _do_test_atomicsets__commit, ParallelCmds
+ yield _do_test_atomicsets__commit, SequentialCmds
+
+
+def test_atomicsets__stdout():
+ @nose.tools.raises(CmdError)
+ def _do_test_atomicsets__stdout(cls):
+ cmds = cls([AtomicCmd("ls")])
+ cmds.stdout
+
+ yield _do_test_atomicsets__stdout, ParallelCmds
+ yield _do_test_atomicsets__stdout, SequentialCmds
+
+
+def test_atomicsets__terminate():
+ def _do_test_atomicsets__terminate(cls):
+ mocks = []
+ for _ in reversed(range(3)):
+ cmd_mock = flexmock(AtomicCmd("true"))
+ cmd_mock.should_receive('terminate').with_args().once
+ mocks.append(cmd_mock)
+ cmds = cls(mocks)
+ cmds.terminate()
+
+ yield _do_test_atomicsets__terminate, ParallelCmds
+ yield _do_test_atomicsets__terminate, SequentialCmds
+
+
+def test_atomicsets__str__():
+ def _do_test_atomicsets__str__(cls):
+ cmds = cls([AtomicCmd("ls")])
+ assert_equal(paleomix.atomiccmd.pprint.pformat(cmds), str(cmds))
+
+ yield _do_test_atomicsets__str__, ParallelCmds
+ yield _do_test_atomicsets__str__, SequentialCmds
+
+
+def test_atomicsets__duplicate_cmds():
+ def _do_test_atomicsets__duplicate_cmds(cls):
+ cmd_1 = AtomicCmd("true")
+ cmd_2 = AtomicCmd("false")
+ assert_raises(ValueError, cls, [cmd_1, cmd_2, cmd_1])
+
+ yield _do_test_atomicsets__duplicate_cmds, ParallelCmds
+ yield _do_test_atomicsets__duplicate_cmds, SequentialCmds
+
+
+###############################################################################
+###############################################################################
+# Parallel commands
+
+def test_parallel_commands__run():
+ mocks = []
+ for _ in range(3):
+ cmd_mock = flexmock(AtomicCmd(["ls"]))
+ cmd_mock.should_receive('run').with_args("xTMPx").once
+ mocks.append(cmd_mock)
+
+ cmds = ParallelCmds(mocks)
+ cmds.run("xTMPx")
+
+
+def test_parallel_commands__ready_single():
+ def _do_test_parallel_commands__ready_single(value):
+ cmd_mock = flexmock(AtomicCmd(["ls"]))
+ cmd_mock.should_receive('ready').and_return(value).at_least.once
+ cmds = ParallelCmds([cmd_mock])
+ assert_equal(cmds.ready(), value)
+
+ yield _do_test_parallel_commands__ready_single, True
+ yield _do_test_parallel_commands__ready_single, False
+
+
+def test_parallel_commands__ready_two():
+ def _do_test_parallel_commands__ready_two(first, second, result):
+ cmd_mock_1 = flexmock(AtomicCmd(["ls"]))
+ cmd_mock_1.should_receive('ready').and_return(first).at_least.once
+ cmd_mock_2 = flexmock(AtomicCmd(["ls"]))
+ cmd_mock_2.should_receive('ready').and_return(second)
+ cmds = ParallelCmds([cmd_mock_1, cmd_mock_2])
+ assert_equal(cmds.ready(), result)
+
+ yield _do_test_parallel_commands__ready_two, True, True, True
+ yield _do_test_parallel_commands__ready_two, False, True, False
+ yield _do_test_parallel_commands__ready_two, True, False, False
+ yield _do_test_parallel_commands__ready_two, False, False, False
+
+
+def test_parallel_commands__join_before_run():
+ mocks = []
+ for value in reversed(range(3)):
+ cmd_mock = flexmock(AtomicCmd("true"))
+ cmd_mock.should_receive('join').and_return([value]).never
+ mocks.append(cmd_mock)
+ cmds = ParallelCmds(mocks)
+ assert_equal(cmds.join(), [None, None, None])
+
+
+ at with_temp_folder
+def test_parallel_commands__join_after_run(temp_folder):
+ cmds = ParallelCmds([AtomicCmd("true") for _ in range(3)])
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), [0, 0, 0])
+
+
+def _setup_mocks_for_failure(*do_mocks):
+ # For each True flag, build a mocked long-sleeping command that expects
+ # to be terminated (joining as 'SIGTERM'); for each False flag, a real
+ # command that exits with return-code 1.
+ results = []
+ for do_mock in do_mocks:
+ if do_mock:
+ mock = flexmock(AtomicCmd(("sleep", 10)))
+ mock.should_receive('terminate')
+ mock.should_receive('join').and_return(['SIGTERM'])
+ else:
+ mock = AtomicCmd("false")
+ results.append(mock)
+ return results
+
+
+ at with_temp_folder
+def test_parallel_commands__join_failure_1(temp_folder):
+ mocks = _setup_mocks_for_failure(False, True, True)
+ cmds = ParallelCmds(mocks)
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), [1, 'SIGTERM', 'SIGTERM'])
+
+
+ at with_temp_folder
+def test_parallel_commands__join_failure_2(temp_folder):
+ mocks = _setup_mocks_for_failure(True, False, True)
+ cmds = ParallelCmds(mocks)
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), ['SIGTERM', 1, 'SIGTERM'])
+
+
+ at with_temp_folder
+def test_parallel_commands__join_failure_3(temp_folder):
+ mocks = _setup_mocks_for_failure(True, True, False)
+ cmds = ParallelCmds(mocks)
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), ['SIGTERM', 'SIGTERM', 1])
+
+
+def test_parallel_commands__reject_sequential():
+ command = AtomicCmd(["ls"])
+ seqcmd = SequentialCmds([command])
+ assert_raises(CmdError, ParallelCmds, [seqcmd])
+
+
+def test_parallel_commands__accept_parallel():
+ command = AtomicCmd(["ls"])
+ parcmd = ParallelCmds([command])
+ ParallelCmds([parcmd])
+
+
+ at nose.tools.raises(CmdError)
+def test_parallel_commands__reject_noncommand():
+ ParallelCmds([object()])
+
+
+ at nose.tools.raises(CmdError)
+def test_parallel_commands__reject_empty_commandset():
+ ParallelCmds([])
+
+
+###############################################################################
+###############################################################################
+# Sequential commands
+
+def test_sequential_commands__atomiccmds():
+ # SequentialCmds must run each member exactly once, in order, and report
+ # ready() only after all members have been run; join() is expected to
+ # poll each member twice (once during run/ready, once when joining).
+ mocks = []
+ for _ in range(3):
+ cmd_mock = flexmock(AtomicCmd(["ls"]))
+ cmd_mock.should_receive('run').with_args("xTMPx").once
+ cmd_mock.should_receive('join').with_args().and_return([0]).twice
+ mocks.append(cmd_mock)
+
+ cmds = SequentialCmds(mocks)
+ assert not cmds.ready()
+ cmds.run("xTMPx")
+ assert cmds.ready()
+ assert_equal(cmds.join(), [0, 0, 0])
+
+
+ at with_temp_folder
+ at nose.tools.timed(1)
+def test_sequential_commands__abort_on_error_1(temp_folder):
+ cmd_1 = AtomicCmd("false")
+ cmd_2 = AtomicCmd(("sleep", 10))
+ cmd_3 = AtomicCmd(("sleep", 10))
+ cmds = SequentialCmds([cmd_1, cmd_2, cmd_3])
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), [1, None, None])
+
+
+ at with_temp_folder
+ at nose.tools.timed(1)
+def test_sequential_commands__abort_on_error_2(temp_folder):
+ cmd_1 = AtomicCmd("true")
+ cmd_2 = AtomicCmd("false")
+ cmd_3 = AtomicCmd(("sleep", 10))
+ cmds = SequentialCmds([cmd_1, cmd_2, cmd_3])
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), [0, 1, None])
+
+
+ at with_temp_folder
+ at nose.tools.timed(1)
+def test_sequential_commands__abort_on_error_3(temp_folder):
+ cmd_1 = AtomicCmd("true")
+ cmd_2 = AtomicCmd("true")
+ cmd_3 = AtomicCmd("false")
+ cmds = SequentialCmds([cmd_1, cmd_2, cmd_3])
+ cmds.run(temp_folder)
+ assert_equal(cmds.join(), [0, 0, 1])
+
+
+def test_sequential_commands__accept_parallel():
+ command = AtomicCmd(["ls"])
+ parcmd = ParallelCmds([command])
+ SequentialCmds([parcmd])
+
+
+def test_sequential_commands__accept_sequential():
+ command = AtomicCmd(["ls"])
+ seqcmd = SequentialCmds([command])
+ SequentialCmds([seqcmd])
+
+
+ at nose.tools.raises(CmdError)
+def test_sequential_commands__reject_noncommand():
+ SequentialCmds([object()])
+
+
+ at nose.tools.raises(CmdError)
+def test_sequential_commands__reject_empty_commandset():
+ SequentialCmds([])
diff --git a/tests/bwa/README b/tests/bwa/README
new file mode 100644
index 0000000..b6bf8c0
--- /dev/null
+++ b/tests/bwa/README
@@ -0,0 +1,4 @@
+Collection of test-cases for bugs in BWA.
+To run, simply execute the "run.sh" script in the tests/bwa/.
+
+Brief explanations are provided for each test-case in the corresponding README located in each folder.
diff --git a/tests/bwa/run.sh b/tests/bwa/run.sh
new file mode 100644
index 0000000..5f1bd8d
--- /dev/null
+++ b/tests/bwa/run.sh
@@ -0,0 +1,153 @@
+#!/bin/bash
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+echo cd $(dirname $0)
+
+if [ ! -e "ValidateSamFile.jar" ];
+then
+ echo "Please place 'ValidateSamFile.jar' in the current folder."
+ exit 1
+fi
+
+# Fetch and build relevant versions of BWA
+function build_bwa ()
+{
+ local version=$1
+ local bwaarch=bwa-${version}.tar.bz2
+ local bwadir=bwa-${version}
+
+ cd ./builds
+ if [ ! -e "${bwadir}/bwa" ];
+ then
+ echo "Building BWA v${version}"
+
+ if [ ! -e "${bwadir}" ];
+ then
+ if [ ! -e "${bwaarch}" ];
+ then
+ wget "http://sourceforge.net/projects/bio-bwa/files/bwa-${version}.tar.bz2/download" \
+ -O ${bwaarch}
+ fi
+
+ tar xvjf bwa-${version}.tar.bz2
+ fi
+
+ nice -19 make -C bwa-${version}
+ fi
+
+ cd ..
+}
+
+mkdir -p builds
+
+build_bwa 0.7.12
+build_bwa 0.7.11
+build_bwa 0.7.10
+build_bwa 0.7.9a
+build_bwa 0.7.8
+build_bwa 0.7.7
+build_bwa 0.7.6a # This version is broken (no aln command!)
+build_bwa 0.7.5a
+build_bwa 0.7.4
+build_bwa 0.7.3a
+build_bwa 0.7.2
+build_bwa 0.7.1
+# build_bwa 0.7.0 # Broken, does not compile
+#build_bwa 0.6.2
+#build_bwa 0.6.1
+#build_bwa 0.5.9
+#build_bwa 0.6.0
+#build_bwa 0.5.10
+#build_bwa 0.5.9rc1 # Oldest acceptable version
+
+
+cd builds
+if [ ! -e "bwa-git" ];
+then
+ git clone "https://github.com/lh3/bwa.git" "bwa-git"
+fi
+
+cd bwa-git
+git pull
+make
+cd ../../
+
+
+# Errors to ignore during validation
+IGNORE="IGNORE=RECORD_MISSING_READ_GROUP IGNORE=MISSING_READ_GROUP"
+
+ls -d testcases/* |
+while read testcase;
+do
+ echo "Running testcase ${testcase}: $(head -n1 ${testcase}/README)"
+ ls builds/*/bwa |
+ while read BWA;
+ do
+ echo -n " $BWA "
+
+ rm -rf temp
+ folder="runs/$testcase/$(dirname $BWA | xargs basename)"
+ rm -rf $folder
+ mkdir -p $folder
+ ln -s $folder temp
+
+ msg=""
+ returncode=-1
+
+ cp ${testcase}/* temp/
+ if [ -e "temp/run.sh" ];
+ then
+ bash "temp/run.sh" "$(pwd)/$BWA" && returncode=$? || returncode=$?
+
+ if [ -e "temp/run.log" ];
+ then
+ msg="$(head -n1 temp/run.log)"
+ fi
+ elif [ -e "temp/reads1.fasta" ];
+ then
+ PREFIX=temp/prefix.fasta
+ READS1=temp/reads1.fasta
+ READS2=temp/reads2.fasta
+ RESULTS=temp/results
+
+ command="index"
+ if $BWA index ${PREFIX} 2> ${PREFIX}.log;
+ then
+ command="aln #1"
+ if $BWA aln ${PREFIX} ${READS1} > ${READS1}.fai 2> ${READS1}.log;
+ then
+ command="aln #2"
+ if $BWA aln ${PREFIX} ${READS2} > ${READS2}.fai 2> ${READS2}.log;
+ then
+ command="sampe"
+ if $BWA sampe ${PREFIX} ${READS1}.fai ${READS2}.fai ${READS1} ${READS2} 2> ${RESULTS}.log | \
+ paleomix cleanup --paired --fasta ${PREFIX} --temp-prefix temp/cleanup 2> ${RESULTS}.cleanup.log > ${RESULTS}.bam;
+ then
+ java -jar "ValidateSamFile.jar" ${IGNORE} I=${RESULTS}.bam &> ${RESULTS}.bam.validated \
+ && returncode=$? || returncode=$?
+
+ if [ -e "${RESULTS}.bam.validated" ];
+ then
+ msg="$((grep ERROR ${RESULTS}.bam.validated || true) | head -n1)"
+ fi
+ fi
+ fi
+ fi
+ fi
+ fi
+
+ if test $returncode -eq 0;
+ then
+ echo -e "\033[32m[OK]\033[0m"
+ elif test $returncode -eq -1;
+ then
+ echo -e "\033[31m[TEST ERROR]: $command\033[0m"
+ else
+ echo -e "\033[33m[FAILED]\033[0m: $msg"
+ fi
+ done
+done
+
+rm -rf temp
diff --git a/tests/bwa/testcases/case_01a/README b/tests/bwa/testcases/case_01a/README
new file mode 100644
index 0000000..06311fa
--- /dev/null
+++ b/tests/bwa/testcases/case_01a/README
@@ -0,0 +1 @@
+Reads mapped to position 0, though positions are 0-based
\ No newline at end of file
diff --git a/tests/bwa/testcases/case_01a/prefix.fasta b/tests/bwa/testcases/case_01a/prefix.fasta
new file mode 100644
index 0000000..e36741e
--- /dev/null
+++ b/tests/bwa/testcases/case_01a/prefix.fasta
@@ -0,0 +1,2 @@
+>prefix
+CACACACAC
diff --git a/tests/bwa/testcases/case_01a/reads1.fasta b/tests/bwa/testcases/case_01a/reads1.fasta
new file mode 100644
index 0000000..ea2edba
--- /dev/null
+++ b/tests/bwa/testcases/case_01a/reads1.fasta
@@ -0,0 +1,2 @@
+>read1
+CGTGTTTGTG
diff --git a/tests/bwa/testcases/case_01a/reads2.fasta b/tests/bwa/testcases/case_01a/reads2.fasta
new file mode 100644
index 0000000..b8c79a6
--- /dev/null
+++ b/tests/bwa/testcases/case_01a/reads2.fasta
@@ -0,0 +1,2 @@
+>read1
+TAATA
diff --git a/tests/bwa/testcases/case_01b/README b/tests/bwa/testcases/case_01b/README
new file mode 100644
index 0000000..06311fa
--- /dev/null
+++ b/tests/bwa/testcases/case_01b/README
@@ -0,0 +1 @@
+Reads mapped to position 0, though positions are 0-based
\ No newline at end of file
diff --git a/tests/bwa/testcases/case_01b/prefix.fasta b/tests/bwa/testcases/case_01b/prefix.fasta
new file mode 100644
index 0000000..647feaf
--- /dev/null
+++ b/tests/bwa/testcases/case_01b/prefix.fasta
@@ -0,0 +1,7 @@
+>Genome
+GTTAATGTAGCTTAATAATATAAAGCAAGGCACTGAAAATGCCTAGATGAGTATTCTTAC
+CAAGTATCCGCACCCCAGTGAGAATGCCCTCTAAATCACGTCTCTACGATTAAAAGGAGC
+AGGTATCAAGCACACTAGAAAGTAGCTCATAACACCTTGCTCAGCCACACCCCCACGGGA
+CCCGCCATTAATACCAACATGCTACTTTAATCAATAAAATTTCCATAGACAGGCATCCCC
+CTAGATCTAATTTTCTAAATCTGTCAACCCTTCTTCCCCCGTTAATGTAGCTTAATAATA
+TAAAGCAAGG
diff --git a/tests/bwa/testcases/case_01b/reads1.fasta b/tests/bwa/testcases/case_01b/reads1.fasta
new file mode 100644
index 0000000..050ff32
--- /dev/null
+++ b/tests/bwa/testcases/case_01b/reads1.fasta
@@ -0,0 +1,2 @@
+>Sequence
+TAAAATTTCCATAGACAGGCATCCCCCTAGATCTGATTTTCTAAATCTGTCAACCCTTCTTCCCCCGTTAATGTAGCTTAATAATATAAAGCAAGGCACT
diff --git a/tests/bwa/testcases/case_01b/reads2.fasta b/tests/bwa/testcases/case_01b/reads2.fasta
new file mode 100644
index 0000000..3d52f99
--- /dev/null
+++ b/tests/bwa/testcases/case_01b/reads2.fasta
@@ -0,0 +1,2 @@
+>Sequence
+CAAGGTGTTATGAGCTACTTTCTAGTGTGCTTGATACCTGCTCCTTTTAATCGTAGAGACGCGATTTAGAGGGCATTCTCACTGGGGTGCGGATACTTGC
diff --git a/tests/bwa/testcases/case_02/README b/tests/bwa/testcases/case_02/README
new file mode 100644
index 0000000..0fe268e
--- /dev/null
+++ b/tests/bwa/testcases/case_02/README
@@ -0,0 +1 @@
+PE reads trigger assertion `re - rb == rlen'
\ No newline at end of file
diff --git a/tests/bwa/testcases/case_02/prefix.fasta b/tests/bwa/testcases/case_02/prefix.fasta
new file mode 100644
index 0000000..9fdc535
--- /dev/null
+++ b/tests/bwa/testcases/case_02/prefix.fasta
@@ -0,0 +1,3 @@
+>prefix
+CACAAAAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAAACAA
+ACAAACAAACAAACAAACAAACAC
diff --git a/tests/bwa/testcases/case_02/reads1.fasta b/tests/bwa/testcases/case_02/reads1.fasta
new file mode 100644
index 0000000..ba4a19f
--- /dev/null
+++ b/tests/bwa/testcases/case_02/reads1.fasta
@@ -0,0 +1,3 @@
+>read1
+CGTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTGTTTG
+TTTGTTTGTTTGTTTGTTTGTTCGGC
diff --git a/tests/bwa/testcases/case_02/reads2.fasta b/tests/bwa/testcases/case_02/reads2.fasta
new file mode 100644
index 0000000..040ef53
--- /dev/null
+++ b/tests/bwa/testcases/case_02/reads2.fasta
@@ -0,0 +1,2 @@
+>read1
+T
diff --git a/tests/bwa/testcases/case_02/run.sh b/tests/bwa/testcases/case_02/run.sh
new file mode 100644
index 0000000..bdafa8a
--- /dev/null
+++ b/tests/bwa/testcases/case_02/run.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+cd $(dirname $0)
+
+BWA=$1
+PREFIX=prefix.fasta
+
+rm -f ${PREFIX}.*
+
+$BWA index ${PREFIX} 2> ${PREFIX}.log
+$BWA aln ${PREFIX} reads1.fasta > reads1.fasta.fai 2> reads1.fasta.log
+$BWA aln ${PREFIX} reads2.fasta > reads2.fasta.fai 2> reads2.fasta.log
+
+set +o pipefail # Fail if a command in a chain of pipes fails
+$BWA sampe ${PREFIX} reads1.fasta.fai reads2.fasta.fai reads1.fasta reads2.fasta 2>&1 | grep "Assertion" > run.log && exit 13 || exit 0
+
+#../../check_sam.py --flags-unset 4 --position -1 results.sam
\ No newline at end of file
diff --git a/tests/bwa/testcases/case_03/README b/tests/bwa/testcases/case_03/README
new file mode 100644
index 0000000..448ab2d
--- /dev/null
+++ b/tests/bwa/testcases/case_03/README
@@ -0,0 +1,15 @@
+Insertion changed to match, causing misalignment (commit: 607e11d43d840ec5abbbbcf685e8b469521b91a6)
+
+Using revision 599e8407799ec603c3d36e9275cea959ede3a0bf (1 revision before 607e...):
+$ bwa aln genome.fa read.fa | bwa samse genome.fa - read.fa > good.sam
+$ cat good.sam
+@SQ SN:Genome LN:300
+Sequence 16 Genome 36 37 1I3M1D37M1D146M * 0 0 GCAGAATTGAAGCTGCTCCTTTGAATTTGCAATTCAATGTGAAATTCACCACGGGACTTGATAAGAAGAGGATTCCAACCCCTGTCTTTAGATTTACAGTCTAATGCTTACTCAGCCATCTTACCTATGTTCATCAACCGCTGACTATTTTCAACTAACCACAAAGACATCGGCACTCTGTACCTCC * XT:A:U NM:i:3 X0:i:1 X1:i:0 XM:i:4 XO:i:1 XG:i:1 MD:Z:3^A37^A146
+
+
+Using revision 607e11d43d840ec5abbbbcf685e8b469521b91a6:
+$ bwa aln genome.fa read.fa | bwa samse genome.fa - read.fa > bad.sam
+$ cat bad.sam
+@SQ SN:Genome LN:300
+Sequence 16 Genome 36 37 1M3M1D37M1D146M * 0 0 GCAGAATTGAAGCTGCTCCTTTGAATTTGCAATTCAATGTGAAATTCACCACGGGACTTGATAAGAAGAGGATTCCAACCCCTGTCTTTAGATTTACAGTCTAATGCTTACTCAGCCATCTTACCTATGTTCATCAACCGCTGACTATTTTCAACTAACCACAAAGACATCGGCACTCTGTACCTCC * XT:A:U NM:i:138 X0:i:1 X1:i:0 XM:i:4 XO:i:1 XG:i:1 MD:Z:0C0A0G0A0^A1T1G0A1G0C0T0G0C0T0C1T2G0A1T2G0C0A1T1C0A1T0G0T0G0A0^A2T1C0A0C1A0C0G2A0C0T1G0A0T0A1G0A1G0A0G1A0T1C1A1C3T0G0T0C0T2A0G0A0T2A0C0A0G0T0C0T0A1T0G0C0T1A0C0T0C0A0G0C1A0T0C0T1A0C1T0A0T0G0T1C0A0T0C0 [...]
+
diff --git a/tests/bwa/testcases/case_03/prefix.fasta b/tests/bwa/testcases/case_03/prefix.fasta
new file mode 100644
index 0000000..a2fcbeb
--- /dev/null
+++ b/tests/bwa/testcases/case_03/prefix.fasta
@@ -0,0 +1,6 @@
+>Genome
+CCGCCGCCTAGAAAAAAAGGCGGGAGAAGCCCCGGCAGAAATTGAAGCTGCTCCTTTGAA
+TTTGCAATTCAATGTGAAAATTCACCACGGGACTTGATAAGAAGAGGATTCCAACCCCTG
+TCTTTAGATTTACAGTCTAATGCTTACTCAGCCATCTTACCTATGTTCATCAACCGCTGA
+CTATTTTCAACTAACCACAAAGACATCGGCACTCTGTACCTCCTATTCGGCGCTTGAGCT
+GGAATAGTAGGAACTGCCCTAAGCCTCCTAATCCGTGCTGAATTAGGCCAACCTGGGACC
diff --git a/tests/bwa/testcases/case_03/reads.fasta b/tests/bwa/testcases/case_03/reads.fasta
new file mode 100644
index 0000000..767b92b
--- /dev/null
+++ b/tests/bwa/testcases/case_03/reads.fasta
@@ -0,0 +1,2 @@
+>Sequence
+GGAGGTACAGAGTGCCGATGTCTTTGTGGTTAGTTGAAAATAGTCAGCGGTTGATGAACATAGGTAAGATGGCTGAGTAAGCATTAGACTGTAAATCTAAAGACAGGGGTTGGAATCCTCTTCTTATCAAGTCCCGTGGTGAATTTCACATTGAATTGCAAATTCAAAGGAGCAGCTTCAATTCTGC
diff --git a/tests/bwa/testcases/case_03/run.sh b/tests/bwa/testcases/case_03/run.sh
new file mode 100644
index 0000000..59dd43a
--- /dev/null
+++ b/tests/bwa/testcases/case_03/run.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+cd $(dirname $0)
+
+BWA=$1
+PREFIX=prefix.fasta
+
+rm -f ${PREFIX}.*
+
+$BWA index ${PREFIX} 2> run.log
+$BWA aln ${PREFIX} reads.fasta > reads.fasta.fai 2> run.log
+
+set +o pipefail # Fail if a command in a chain of pipes fails
+$BWA samse ${PREFIX} reads.fasta.fai reads.fasta 2>&1 | grep NM | sed -e's#.*NM:i:##' | cut -f1 | xargs test 4 -lt && exit 13 || exit 0
diff --git a/tests/bwa/testcases/case_04/README b/tests/bwa/testcases/case_04/README
new file mode 100644
index 0000000..e2c813a
--- /dev/null
+++ b/tests/bwa/testcases/case_04/README
@@ -0,0 +1 @@
+CIGAR strings longer than the sequence / reference, causing samtools view to fail
diff --git a/tests/bwa/testcases/case_04/prefix.fasta b/tests/bwa/testcases/case_04/prefix.fasta
new file mode 100644
index 0000000..12a6fc4
--- /dev/null
+++ b/tests/bwa/testcases/case_04/prefix.fasta
@@ -0,0 +1,4 @@
+>Genome
+CTCAACACACCTAACAATCTTAACAGAACTTTCCCCCCGCCATTAATACCAACATGCTAC
+TTTAATCAATAAAATTTCCATAGACAGGCATCCCCCTAGATCTAATTTTCTAAATCTGTC
+AACCCTTCTTCCCCCGTTAATGTAGCTTAATAATATAAAGCAAGG
diff --git a/tests/bwa/testcases/case_04/reads.fasta b/tests/bwa/testcases/case_04/reads.fasta
new file mode 100644
index 0000000..3ccd132
--- /dev/null
+++ b/tests/bwa/testcases/case_04/reads.fasta
@@ -0,0 +1,2 @@
+>Sequence
+AACACACCTAACAATCTTAACAGAACTCTCCCCCCCGCCATTAATACCAACATGCTACTTTAATCAATAAAATTTCCATAGACAGGCATCCCCCTAGATCTGATTTTCTAAATCTGTCAACCCTTCTTCCCCCGTTAATGTAGCTTAATAATATAAAGCAAGGC
diff --git a/tests/bwa/testcases/case_04/results.sam b/tests/bwa/testcases/case_04/results.sam
new file mode 100644
index 0000000..3d1f166
--- /dev/null
+++ b/tests/bwa/testcases/case_04/results.sam
@@ -0,0 +1,3 @@
+@SQ SN:Genome LN:165
+@PG ID:bwa PN:bwa VN:0.7.8+dev-r462 CL:../builds/bwa-git/bwa samse prefix.fasta reads.fasta.fai reads.fasta
+Sequence 4 * 0 0 * * 0 0 AACACACCTAACAATCTTAACAGAACTCTCCCCCCCGCCATTAATACCAACATGCTACTTTAATCAATAAAATTTCCATAGACAGGCATCCCCCTAGATCTGATTTTCTAAATCTGTCAACCCTTCTTCCCCCGTTAATGTAGCTTAATAATATAAAGCAAGGC *
diff --git a/tests/bwa/testcases/case_04/run.sh b/tests/bwa/testcases/case_04/run.sh
new file mode 100644
index 0000000..db871ff
--- /dev/null
+++ b/tests/bwa/testcases/case_04/run.sh
@@ -0,0 +1,19 @@
+#!/bin/bash
+set -o nounset # Fail on unset variables
+set -o errexit # Fail on uncaught non-zero returncodes
+set -o pipefail # Fail if a command in a chain of pipes fails
+
+cd $(dirname $0)
+
+BWA=$1
+PREFIX=prefix.fasta
+
+rm -f ${PREFIX}.*
+
+$BWA index ${PREFIX} 2> run.log
+$BWA aln ${PREFIX} reads.fasta > reads.fasta.fai 2> run.log
+
+$BWA samse ${PREFIX} reads.fasta.fai reads.fasta 2> run.log > results.sam
+
+set +o pipefail
+samtools view -S results.sam 2>&1|grep error > run.log && exit 1 || exit 0
diff --git a/tests/common_tests/__init__.py b/tests/common_tests/__init__.py
new file mode 100644
index 0000000..90e5529
--- /dev/null
+++ b/tests/common_tests/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/tests/common_tests/bedtools_tests.py b/tests/common_tests/bedtools_tests.py
new file mode 100644
index 0000000..217bd38
--- /dev/null
+++ b/tests/common_tests/bedtools_tests.py
@@ -0,0 +1,242 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Ignore "invalid" function names
+# pylint: disable=C0103
+# No need for docstrings
+# pylint: disable=C0111
+import copy
+
+from nose.tools import \
+ assert_equal, \
+ assert_not_equal, \
+ assert_raises
+
+from paleomix.common.bedtools import \
+ BEDRecord
+
+
+###############################################################################
+###############################################################################
+# BEDRecord constructor
+
+def test_bedrecord__constructor__defaults():
+ record = BEDRecord()
+ assert_equal(len(record), 0)
+ assert_equal(str(record), "")
+ assert_equal(repr(record), "BEDRecord()")
+
+
+def test_bedrecord__constructor__empty_string():
+ record = BEDRecord("")
+ assert_equal(len(record), 0)
+ assert_equal(str(record), "")
+ assert_equal(repr(record), "BEDRecord()")
+
+
+def test_bedrecord__constructor__3_fields():
+ text = "my_contig\t12\t345"
+ record = BEDRecord(text)
+
+ assert_equal(len(record), 3)
+ assert_equal(str(record), text)
+ assert_equal(repr(record),
+ "BEDRecord(contig='my_contig', start=12, end=345)")
+
+
+def test_bedrecord__constructor__6_fields():
+ text = "my_contig\t12\t345\tmy_name\t-3\t-"
+ record = BEDRecord(text)
+
+ assert_equal(len(record), 6)
+ assert_equal(str(record), text)
+ assert_equal(repr(record),
+ "BEDRecord(contig='my_contig', start=12, "
+ "end=345, name='my_name', score=-3, strand='-')")
+
+
+def test_bedrecord__constructor__extra_fields():
+ text = "my_contig\t12\t345\tmy_name\t-3\t-\tfoo\tbar"
+ record = BEDRecord(text)
+
+ assert_equal(len(record), 8)
+ assert_equal(str(record), text)
+ assert_equal(repr(record),
+ "BEDRecord(contig='my_contig', start=12, "
+ "end=345, name='my_name', score=-3, strand='-', "
+ "'foo', 'bar')")
+
+
+###############################################################################
+###############################################################################
+# BEDRecord accessors
+
+def test_bedrecord__accessors__3_fields():
+ record = BEDRecord("my_contig\t12\t345")
+
+ assert_equal(record.contig, "my_contig")
+ assert_equal(record.start, 12)
+ assert_equal(record.end, 345)
+ assert_raises(IndexError, lambda: record.name)
+ assert_raises(IndexError, lambda: record.score)
+ assert_raises(IndexError, lambda: record.strand)
+
+
+def test_bedrecord__accessors__6_fields():
+ record = BEDRecord("my_contig\t12\t345\tmy_name\t-3\t-")
+
+ assert_equal(record.contig, "my_contig")
+ assert_equal(record.start, 12)
+ assert_equal(record.end, 345)
+ assert_equal(record.name, "my_name")
+ assert_equal(record.score, -3)
+ assert_equal(record.strand, "-")
+
+
+def test_bedrecord__accessors__extra_fields():
+ text = "my_contig\t12\t345\tmy_name\t-3\t-\tfoo\tbar"
+ record = BEDRecord(text)
+
+ assert_equal(record[6], "foo")
+ assert_equal(record[7], "bar")
+
+
+def test_bedrecord__accessors__6_fields__getitem():
+ record = BEDRecord("my_contig\t12\t345\tmy_name\t-3\t-")
+
+ assert_equal(record[0], "my_contig")
+ assert_equal(record[1], 12)
+ assert_equal(record[2], 345)
+ assert_equal(record[3], "my_name")
+ assert_equal(record[4], -3)
+ assert_equal(record[5], "-")
+
+
+def test_bedrecord__setters__3_fields():
+ record = BEDRecord("my_contig\t12\t345")
+
+ record.contig = "chrZ"
+ assert_equal(record.contig, "chrZ")
+
+ record.end += 20
+ assert_equal(record.end, 365)
+
+ assert_equal(str(record), "chrZ\t12\t365")
+ assert_equal(repr(record),
+ "BEDRecord(contig='chrZ', start=12, end=365)")
+
+
+def test_bedrecord__setters__type_errors():
+ record = BEDRecord("my_contig\t12\t345\tname\t0\t+")
+
+ assert_raises(ValueError, lambda: setattr(record, "contig", 17))
+ assert_raises(ValueError, lambda: setattr(record, "start", "foo"))
+ assert_raises(ValueError, lambda: setattr(record, "end", "foo"))
+ assert_raises(ValueError, lambda: setattr(record, "name", 17.3))
+ assert_raises(ValueError, lambda: setattr(record, "score", "foo"))
+ assert_raises(ValueError, lambda: setattr(record, "strand", "foo"))
+
+
+def test_bedrecord__setters__unset_fields__at_end():
+ record = BEDRecord("my_contig\t12\t345")
+
+ record.name = "my_region"
+ assert_equal(record.name, "my_region")
+
+ record.score = -13
+ assert_equal(record.score, -13)
+
+ record.strand = '-'
+ assert_equal(record.strand, '-')
+
+ assert_equal(str(record), "my_contig\t12\t345\tmy_region\t-13\t-")
+ assert_equal(repr(record),
+ "BEDRecord(contig='my_contig', start=12, end=345, "
+ "name='my_region', score=-13, strand='-')")
+
+
+def test_bedrecord__setters__unset_fields__after_end():
+ record = BEDRecord("")
+ record.strand = "-"
+ assert_equal(str(record), "\t0\t0\t\t0\t-")
+
+ record = BEDRecord("my_name")
+ record.strand = "-"
+ assert_equal(str(record), "my_name\t0\t0\t\t0\t-")
+
+ record = BEDRecord("my_name\t17")
+ record.strand = "-"
+ assert_equal(str(record), "my_name\t17\t0\t\t0\t-")
+
+ record = BEDRecord("my_name\t17\t258")
+ record.strand = "-"
+ assert_equal(str(record), "my_name\t17\t258\t\t0\t-")
+
+ record = BEDRecord("my_name\t17\t258\tregion")
+ record.strand = "-"
+ assert_equal(str(record), "my_name\t17\t258\tregion\t0\t-")
+
+ record = BEDRecord("my_name\t17\t258\tregion\t33")
+ record.strand = "-"
+ assert_equal(str(record), "my_name\t17\t258\tregion\t33\t-")
+
+ record = BEDRecord("my_name\t17\t258\tregion\t33\t+")
+ record.strand = "-"
+ assert_equal(str(record), "my_name\t17\t258\tregion\t33\t-")
+
+
+def test_bedrecord__cmp():
+ record_1_txt = "my_contig\t12\t345\tmy_name\t-3\t-\tfoo"
+ record_1 = BEDRecord(record_1_txt)
+ record_2 = BEDRecord("chrZ\t132\t4345\tchrZ_region\t0\t+\tbar")
+
+ for idx in xrange(len(record_2)):
+ record_tmp = BEDRecord(record_1_txt)
+ assert_equal(record_1, record_tmp)
+ record_tmp[idx] = record_2[idx]
+ assert_not_equal(record_1, record_tmp)
+ record_tmp[idx] = record_1[idx]
+ assert_equal(record_1, record_tmp)
+
+
+###############################################################################
+###############################################################################
+
+def test_bedrecord__copy():
+ record_1_txt = "my_contig\t12\t345\tmy_name\t-3\t-"
+ record_1 = BEDRecord(record_1_txt)
+ record_2 = copy.copy(record_1)
+ record_2.name = "my_clone"
+
+ assert_equal(str(record_1), record_1_txt)
+ assert_equal(str(record_2), "my_contig\t12\t345\tmy_clone\t-3\t-")
+
+
+def test_bedrecord__deepcopy():
+ record_1_txt = "my_contig\t12\t345\tmy_name\t-3\t-"
+ record_1 = BEDRecord(record_1_txt)
+ record_1[6] = ["foo"]
+ record_2 = copy.deepcopy(record_1)
+ record_2[6][0] = "bar"
+
+ assert_equal(str(record_1), record_1_txt + "\t['foo']")
+ assert_equal(str(record_2), record_1_txt + "\t['bar']")
diff --git a/tests/common_tests/fileutils_test.py b/tests/common_tests/fileutils_test.py
new file mode 100644
index 0000000..02e2bc3
--- /dev/null
+++ b/tests/common_tests/fileutils_test.py
@@ -0,0 +1,952 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import stat
+import errno
+
+import nose
+from nose.tools import \
+ assert_is, \
+ assert_in, \
+ assert_equal, \
+ assert_raises
+
+import paleomix
+from paleomix.common.testing import \
+ with_temp_folder, \
+ Monkeypatch, \
+ SetWorkingDirectory, \
+ set_file_contents, \
+ get_file_contents
+
+from paleomix.common.fileutils import \
+ add_postfix, \
+ swap_ext, \
+ reroot_path, \
+ create_temp_dir, \
+ missing_files, \
+ modified_after, \
+ is_executable, \
+ which_executable, \
+ executable_exists, \
+ missing_executables, \
+ make_dirs, \
+ move_file, \
+ copy_file, \
+ open_ro, \
+ try_rmdir, \
+ try_remove, \
+ try_rmtree, \
+ describe_files, \
+ describe_paired_files
+
+
+###############################################################################
+###############################################################################
+# Setup timestamps for test files
+
+def test_dir():
+ return os.path.dirname(os.path.dirname(__file__))
+
+
+def test_file(*args):
+ return os.path.join(test_dir(), "data", *args)
+
+
+def setup_module():
+ timestamps = {test_file("timestamp_a_older"): 1000190760,
+ test_file("timestamp_b_older"): 1000190760,
+ test_file("timestamp_a_younger"): 1120719000,
+ test_file("timestamp_b_younger"): 1120719000}
+
+ for filename, timestamp in timestamps.iteritems():
+ # Set atime and mtime
+ os.utime(filename, (timestamp, timestamp))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'add_postfix'
+
+def test_add_postfix__no_postfix():
+ assert_equal(add_postfix("name.foo", ""), "name.foo")
+
+
+def test_add_postfix__dot_postfix():
+ assert_equal(add_postfix("name.foo", ".pf"), "name.pf.foo")
+
+
+def test_add_postfix__underscore_postfix():
+ assert_equal(add_postfix("name.foo", "_pf"), "name_pf.foo")
+
+
+def test_add_postfix__no_ext__no_postfix():
+ assert_equal(add_postfix("name", ""), "name")
+
+
+def test_add_postfix__no_ext__dot_postfix():
+ assert_equal(add_postfix("name", ".pf"), "name.pf")
+
+
+def test_add_postfix__no_ext__underscore_postfix():
+ assert_equal(add_postfix("name", "_pf"), "name_pf")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'swap_ext'
+
+def test_swap_ext__has_ext_vs_empty_ext():
+ assert_equal(swap_ext("name.foo", ""), "name")
+
+
+def test_swap_ext__empty_ext_vs_empty_ext():
+ assert_equal(swap_ext("name", ""), "name")
+
+
+def test_swap_ext__has_ext_vs_dot_ext():
+ assert_equal(swap_ext("name.foo", "."), "name")
+
+
+def test_swap_ext__dot_ext_vs_dot_ext():
+ assert_equal(swap_ext("name.", "."), "name")
+
+
+def test_swap_ext__multiple__has_ext_vs_empty_ext():
+ assert_equal(swap_ext("name.foo.bar", ""), "name.foo")
+
+
+def test_swap_ext__multiple__has_ext_vs_dot_ext():
+ assert_equal(swap_ext("name.foo.bar", "."), "name.foo")
+
+
+def test_swap_ext__multiple__dot_ext_vs_dot_ext():
+ assert_equal(swap_ext("name.foo.", "."), "name.foo")
+
+
+def test_swap_ext__has_ext_vs_new_ext():
+ assert_equal(swap_ext("name.foo", "bar"), "name.bar")
+
+
+def test_swap_ext__has_ext_vs_new_dot_ext():
+ assert_equal(swap_ext("name.foo", ".bar"), "name.bar")
+
+
+def test_swap_ext__empty_ext_vs_new_ext():
+ assert_equal(swap_ext("name", "bar"), "name.bar")
+
+
+def test_swap_ext__dot_ext_vs_new_dot_ext():
+ assert_equal(swap_ext("name", ".bar"), "name.bar")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'reroot_path'
+
+def test_reroot_path__empty_root():
+ assert_equal(reroot_path("", "/etc/apt/sources.list"), "sources.list")
+
+
+def test_reroot_path__empty_path():
+ assert_equal(reroot_path("/etc/apt", ""), "/etc/apt/")
+
+
+def test_reroot_path__abs_abs__wo_final_dash():
+ assert_equal(reroot_path("/etc/apt", "/tmp/sources.list"),
+ "/etc/apt/sources.list")
+
+
+def test_reroot_path__abs_abs__w_final_dash():
+ assert_equal(reroot_path("/etc/apt/", "/tmp/sources.list"),
+ "/etc/apt/sources.list")
+
+
+def test_reroot_path__abs_rel__wo_final_dash():
+ assert_equal(reroot_path("/etc/apt", "tmp/sources.list"),
+ "/etc/apt/sources.list")
+
+
+def test_reroot_path__abs_rel__w_final_dash():
+ assert_equal(reroot_path("/etc/apt/", "tmp/sources.list"),
+ "/etc/apt/sources.list")
+
+
+def test_reroot_path__rel_abs__wo_final_dash():
+ assert_equal(reroot_path("etc/apt", "/tmp/sources.list"),
+ "etc/apt/sources.list")
+
+
+def test_reroot_path__rel_abs__w_final_dash():
+ assert_equal(reroot_path("etc/apt/", "/tmp/sources.list"),
+ "etc/apt/sources.list")
+
+
+def test_reroot_path__rel_rel__wo_final_dash():
+ assert_equal(reroot_path("etc/apt", "tmp/sources.list"),
+ "etc/apt/sources.list")
+
+
+def test_reroot_path__rel_rel__w_final_dash():
+ assert_equal(reroot_path("etc/apt/", "tmp/sources.list"),
+ "etc/apt/sources.list")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'create_temp_dir'
+
+@with_temp_folder
+def test_create_temp_dir__create(temp_folder):
+ tmp_dir_1 = create_temp_dir(temp_folder)
+ tmp_dir_2 = create_temp_dir(temp_folder)
+ assert os.path.exists(tmp_dir_1)
+ assert os.path.exists(tmp_dir_2)
+
+
+@with_temp_folder
+def test_create_temp_dir__empty(temp_folder):
+ tmp_dir = create_temp_dir(temp_folder)
+ contents = os.listdir(tmp_dir)
+ assert not contents
+
+
+@with_temp_folder
+def test_create_temp_dir__permissions(temp_folder):
+ tmp_dir = create_temp_dir(temp_folder)
+ stats = os.stat(tmp_dir)
+ assert_equal(stats.st_mode & 0777, 0750)
+
+
+@with_temp_folder
+def test_create_temp_dir__creation_preempted(temp_folder):
+ unwrapped, preempted_once = os.makedirs, []
+
+ def _wrap_os_makedirs(*args, **kwargs):
+ # Simulate somebody else creating the directory first
+ if not preempted_once:
+ unwrapped(*args, **kwargs)
+ preempted_once.append(True)
+ unwrapped(*args, **kwargs)
+
+ with Monkeypatch("os.makedirs", _wrap_os_makedirs):
+ assert not os.listdir(temp_folder)
+ work_dir = create_temp_dir(temp_folder)
+ assert os.path.exists(temp_folder)
+ dirs = os.listdir(temp_folder)
+ assert_equal(len(dirs), 2)
+ assert_in(os.path.basename(work_dir), dirs)
+ assert bool(preempted_once)
+
+
+def test_create_temp_dir__permission_denied():
+ def _wrap_os_makedirs(*_args, **_kwargs):
+ raise OSError((errno.EACCES, "Simulated premission denied"))
+
+ with Monkeypatch("os.makedirs", _wrap_os_makedirs):
+ assert_raises(OSError, create_temp_dir, "/tmp")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'missing_files'
+
+def test_missing_files__file_exists():
+ assert_equal(missing_files([test_file("empty_file_1")]), [])
+
+
+def test_missing_files__file_doesnt_exist():
+ assert_equal(missing_files([test_file("missing_file_1")]),
+ [test_file("missing_file_1")])
+
+
+def test_missing_files__mixed_files():
+ files = [test_file("missing_file_1"),
+ test_file("empty_file_1")]
+ result = [test_file("missing_file_1")]
+
+ assert_equal(missing_files(files), result)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'modified_after'
+
+def test_modified_after__modified_after():
+ assert modified_after(test_file("timestamp_a_younger"),
+ test_file("timestamp_a_older"))
+ assert modified_after(test_file("timestamp_a_younger"),
+ test_file("timestamp_b_older"))
+ assert modified_after(test_file("timestamp_b_younger"),
+ test_file("timestamp_a_older"))
+
+
+def test_modified_after__not_modified_after():
+ assert not modified_after(test_file("timestamp_a_older"),
+ test_file("timestamp_a_younger"))
+ assert not modified_after(test_file("timestamp_a_older"),
+ test_file("timestamp_b_younger"))
+ assert not modified_after(test_file("timestamp_b_older"),
+ test_file("timestamp_a_younger"))
+
+
+def test_modified_after__same_file():
+ assert not modified_after(test_file("timestamp_a_older"),
+ test_file("timestamp_a_older"))
+ assert not modified_after(test_file("timestamp_a_older"),
+ test_file("timestamp_b_older"))
+ assert not modified_after(test_file("timestamp_b_older"),
+ test_file("timestamp_a_older"))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'is_executable'
+
+def test_is_executable__full_path__is_executable():
+ assert is_executable("/bin/ls")
+
+
+def test_is_executable__full_path__is_non_executable():
+ assert not is_executable("/etc/fstab")
+
+
+def test_is_executable__full_path__folder_is_non_executable():
+ assert not is_executable("/etc")
+
+
+def test_is_executable__rel_path__is_executable():
+ assert is_executable(os.path.join(test_dir(), "run"))
+
+
+def test_is_executable__rel_path__is_non_executable():
+ assert not is_executable(test_file("empty_file_1"))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'which_executable'
+
+def test_which_executable__executable():
+ assert_equal("/bin/ls", which_executable("ls"))
+
+
+def test_which_executable__non_executable():
+ assert_is(None, which_executable("lsxxxx"))
+
+
+def test_which_executable__executable__but_no_path():
+ path = os.environ.pop('PATH')
+ try:
+ assert_is(None, which_executable("ls"))
+ finally:
+ os.environ['PATH'] = path
+
+
+def test_which_executable__executable__but_empty_path():
+ path = os.environ.pop('PATH')
+ try:
+ os.environ['PATH'] = ""
+ assert_is(None, which_executable("ls"))
+ finally:
+ os.environ['PATH'] = path
+
+
+def test_which_executable__executable__by_path_order_1():
+ path = os.environ.pop('PATH')
+ try:
+ path_1 = test_dir()
+ path_2 = os.path.join(os.getcwd(), path_1)
+
+ os.environ['PATH'] = ":".join((path_1, path_2))
+ assert_equal(os.path.join(path_1, "run"), which_executable("run"))
+ finally:
+ os.environ['PATH'] = path
+
+
+def test_which_executable__executable__by_path_order_2():
+ path = os.environ.pop('PATH')
+ try:
+ path_1 = test_dir()
+ path_2 = os.path.join(os.getcwd(), path_1)
+
+ os.environ['PATH'] = ":".join((path_2, path_1))
+ assert_equal(os.path.join(path_2, "run"), which_executable("run"))
+ finally:
+ os.environ['PATH'] = path
+
+
+###############################################################################
+###############################################################################
+# Tests for 'executable_exists'
+
+def test_executable_exists__executable():
+ assert executable_exists("ls")
+
+
+def test_executable_exists__non_executable():
+ assert not executable_exists("lsxxxx")
+
+
+def test_executable_exists__full_path__is_executable():
+ assert executable_exists("/bin/ls")
+
+
+def test_executable_exists__full_path__is_non_executable():
+ assert not executable_exists("/etc/fstab")
+
+
+def test_executable_exists__rel_path__is_executable():
+ assert executable_exists(os.path.join(test_dir(), "run"))
+
+
+def test_executable_exists__rel_path__is_non_executable():
+ assert not executable_exists(test_file("empty_file_1"))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'missing_executables'
+
+def test_missing_executables__executable():
+ assert_equal(missing_executables(["ls"]), [])
+
+
+def test_missing_executables__non_executable():
+ assert_equal(missing_executables(["lsxxxx"]), ["lsxxxx"])
+
+
+def test_missing_executables__mixed():
+ assert_equal(missing_executables(["lsxxxx", "ls"]), ["lsxxxx"])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'make_dirs'
+
+@with_temp_folder
+def test_make_dirs__create_dir(temp_folder):
+ assert not os.listdir(temp_folder)
+ assert make_dirs(os.path.join(temp_folder, "test123"))
+ assert_equal(os.listdir(temp_folder), ["test123"])
+
+
+@with_temp_folder
+def test_make_dirs__return_values(temp_folder):
+ assert make_dirs(os.path.join(temp_folder, "test234"))
+ assert not make_dirs(os.path.join(temp_folder, "test234"))
+
+
+@with_temp_folder
+def test_make_dirs__subdirs_return_values(temp_folder):
+ assert make_dirs(os.path.join(temp_folder, "test"))
+ assert make_dirs(os.path.join(temp_folder, "test", "234"))
+ assert not make_dirs(os.path.join(temp_folder, "test", "234"))
+
+
+@with_temp_folder
+def test_make_dirs__sub_directories(temp_folder):
+ assert not os.listdir(temp_folder)
+ assert make_dirs(os.path.join(temp_folder, "test", "123"))
+ assert_equal(os.listdir(temp_folder), ["test"])
+ assert_equal(os.listdir(os.path.join(temp_folder, "test")), ["123"])
+
+
+@with_temp_folder
+def test_make_dirs__permissions(temp_folder):
+ work_dir = os.path.join(temp_folder, "test_1")
+ assert make_dirs(work_dir, mode=0511)
+ stats = os.stat(work_dir)
+ assert_equal(oct(stats.st_mode & 0777), oct(0511))
+
+
+@with_temp_folder
+def test_make_dirs__creation_preemted(temp_folder):
+ unwrapped, preempted = os.makedirs, []
+
+ def _wrap_os_makedirs(*args, **kwargs):
+ # Simulate somebody else creating the directory first
+ preempted.append(True)
+ unwrapped(*args, **kwargs)
+ unwrapped(*args, **kwargs)
+
+ with Monkeypatch("os.makedirs", _wrap_os_makedirs):
+ work_folder = os.path.join(temp_folder, "test")
+ assert not make_dirs(work_folder)
+ assert os.path.exists(work_folder)
+ assert_equal(os.listdir(temp_folder), ["test"])
+ assert_equal(preempted, [True])
+
+
+@with_temp_folder
+def test_make_dirs__permission_denied(temp_folder):
+ # Make temporary folder read-only
+ mode = os.stat(temp_folder).st_mode
+ ro_mode = mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
+ os.chmod(temp_folder, ro_mode)
+    # Non EEXIST errors should be re-raised:
+ assert_raises(OSError, make_dirs, os.path.join(temp_folder, "foo"))
+
+
+@nose.tools.raises(ValueError)
+def test_make_dirs__empty_directory():
+ make_dirs("")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'move_file'
+
+@with_temp_folder
+def test_move_file__simple_move(temp_folder):
+ file_1 = os.path.join(temp_folder, "file_1")
+ file_2 = os.path.join(temp_folder, "file_2")
+ assert_equal(os.listdir(temp_folder), [])
+ set_file_contents(file_1, "1")
+ assert_equal(os.listdir(temp_folder), ["file_1"])
+ move_file(file_1, file_2)
+ assert_equal(os.listdir(temp_folder), ["file_2"])
+ assert_equal(get_file_contents(file_2), "1")
+
+
+@with_temp_folder
+def test_move_file__simple_move_in_cwd(temp_folder):
+ with SetWorkingDirectory(temp_folder):
+ assert_equal(os.listdir("."), [])
+ set_file_contents("file_1", "1")
+ assert_equal(os.listdir("."), ["file_1"])
+ move_file("file_1", "file_2")
+ assert_equal(os.listdir("."), ["file_2"])
+ assert_equal(get_file_contents("file_2"), "1")
+
+
+@with_temp_folder
+def test_move_dirs__permission_denied(temp_folder):
+ dst_folder = os.path.join(temp_folder, "dst")
+ file_1 = os.path.join(temp_folder, "file")
+ file_2 = os.path.join(dst_folder, "file")
+ set_file_contents(file_1, "1")
+
+ # Make destination folder read-only
+ assert make_dirs(os.path.join(temp_folder, "dst"))
+ mode = os.stat(dst_folder).st_mode
+ ro_mode = mode & ~(stat.S_IWUSR | stat.S_IWGRP | stat.S_IWOTH)
+ os.chmod(dst_folder, ro_mode)
+
+ # Non ENOENT errors should be re-raised:
+ assert_raises(IOError, move_file, file_1, file_2)
+
+
+@with_temp_folder
+def test_move_file__move_to_existing_folder(temp_folder):
+ assert make_dirs(os.path.join(temp_folder, "src"))
+ assert make_dirs(os.path.join(temp_folder, "dst"))
+ file_1 = os.path.join(temp_folder, "src", "file_1")
+ file_2 = os.path.join(temp_folder, "dst", "file_2")
+ set_file_contents(file_1, "2")
+ move_file(file_1, file_2)
+ assert_equal(os.listdir(os.path.dirname(file_1)), [])
+ assert_equal(os.listdir(os.path.dirname(file_2)), ["file_2"])
+ assert_equal(get_file_contents(file_2), "2")
+
+
+@with_temp_folder
+def test_move_file__move_to_new_folder(temp_folder):
+ assert make_dirs(os.path.join(temp_folder, "src"))
+ file_1 = os.path.join(temp_folder, "src", "file_1")
+ file_2 = os.path.join(temp_folder, "dst", "file_2")
+ set_file_contents(file_1, "2")
+ move_file(file_1, file_2)
+ assert_equal(os.listdir(os.path.dirname(file_1)), [])
+ assert_equal(os.listdir(os.path.dirname(file_2)), ["file_2"])
+ assert_equal(get_file_contents(file_2), "2")
+
+
+@with_temp_folder
+def test_move_file__move_to_different_folder(temp_folder):
+ with SetWorkingDirectory(temp_folder):
+ set_file_contents("file_1", "3")
+ move_file("file_1", "dst/file_1")
+ assert_equal(os.listdir("."), ["dst"])
+ assert_equal(os.listdir("dst"), ["file_1"])
+ assert_equal(get_file_contents("dst/file_1"), "3")
+
+
+@with_temp_folder
+def test_move_file__overwrite(temp_folder):
+ with SetWorkingDirectory(temp_folder):
+ set_file_contents("file_1", "4")
+ set_file_contents("file_2", "5")
+ move_file("file_1", "file_2")
+ assert_equal(os.listdir("."), ["file_2"])
+ assert_equal(get_file_contents("file_2"), "4")
+
+
+def test_move_file__enoent_reraised_if_not_due_to_missing_folder():
+ assert_raises(IOError, move_file, "", "./dst")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'copy_file'
+
+@with_temp_folder
+def test_copy_file__simple_copy(temp_folder):
+ file_1 = os.path.join(temp_folder, "file_1")
+ file_2 = os.path.join(temp_folder, "file_2")
+ assert_equal(os.listdir(temp_folder), [])
+ set_file_contents(file_1, "1")
+ assert_equal(os.listdir(temp_folder), ["file_1"])
+ copy_file(file_1, file_2)
+ assert_equal(set(os.listdir(temp_folder)), set(["file_1", "file_2"]))
+ assert_equal(get_file_contents(file_1), "1")
+ assert_equal(get_file_contents(file_2), "1")
+
+
+@with_temp_folder
+def test_copy_file__simple_copy_in_cwd(temp_folder):
+ with SetWorkingDirectory(temp_folder):
+ assert_equal(os.listdir("."), [])
+ set_file_contents("file_1", "1")
+ assert_equal(os.listdir("."), ["file_1"])
+ copy_file("file_1", "file_2")
+ assert_equal(set(os.listdir(".")), set(["file_1", "file_2"]))
+ assert_equal(get_file_contents("file_1"), "1")
+ assert_equal(get_file_contents("file_2"), "1")
+
+
+@with_temp_folder
+def test_copy_file__copy_to_existing_folder(temp_folder):
+ assert make_dirs(os.path.join(temp_folder, "src"))
+ assert make_dirs(os.path.join(temp_folder, "dst"))
+ file_1 = os.path.join(temp_folder, "src", "file_1")
+ file_2 = os.path.join(temp_folder, "dst", "file_2")
+ set_file_contents(file_1, "2")
+ copy_file(file_1, file_2)
+ assert_equal(os.listdir(os.path.dirname(file_1)), ["file_1"])
+ assert_equal(os.listdir(os.path.dirname(file_2)), ["file_2"])
+ assert_equal(get_file_contents(file_1), "2")
+ assert_equal(get_file_contents(file_2), "2")
+
+
+@with_temp_folder
+def test_copy_file__copy_to_new_folder(temp_folder):
+ assert make_dirs(os.path.join(temp_folder, "src"))
+ file_1 = os.path.join(temp_folder, "src", "file_1")
+ file_2 = os.path.join(temp_folder, "dst", "file_2")
+ set_file_contents(file_1, "2")
+ copy_file(file_1, file_2)
+ assert_equal(os.listdir(os.path.dirname(file_1)), ["file_1"])
+ assert_equal(os.listdir(os.path.dirname(file_2)), ["file_2"])
+ assert_equal(get_file_contents(file_1), "2")
+ assert_equal(get_file_contents(file_2), "2")
+
+
+@with_temp_folder
+def test_copy_file__copy_to_different_folder(temp_folder):
+ with SetWorkingDirectory(temp_folder):
+ set_file_contents("file_1", "3")
+ copy_file("file_1", "dst/file_1")
+ assert_equal(set(os.listdir(".")), set(["file_1", "dst"]))
+ assert_equal(os.listdir("dst"), ["file_1"])
+ assert_equal(get_file_contents("file_1"), "3")
+ assert_equal(get_file_contents("dst/file_1"), "3")
+
+
+@with_temp_folder
+def test_copy_file__overwrite(temp_folder):
+ with SetWorkingDirectory(temp_folder):
+ set_file_contents("file_1", "4")
+ set_file_contents("file_2", "5")
+ copy_file("file_1", "file_2")
+ assert_equal(set(os.listdir(".")), set(["file_1", "file_2"]))
+ assert_equal(get_file_contents("file_1"), "4")
+ assert_equal(get_file_contents("file_2"), "4")
+
+
+def test_copy_file__enoent_reraised_if_not_due_to_missing_folder():
+ assert_raises(IOError, copy_file, "", "./dst")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'open_ro'
+
+
+def test_open_ro__uncompressed():
+ handle = open_ro(test_file('fasta_file.fasta'))
+ try:
+ assert_equal(handle.read(),
+ b'>This_is_FASTA!\nACGTN\n'
+ b'>This_is_ALSO_FASTA!\nCGTNA\n')
+ finally:
+ handle.close()
+
+
+def test_open_ro__gz():
+ handle = open_ro(test_file('fasta_file.fasta.gz'))
+ try:
+ assert_equal(handle.read(),
+ b'>This_is_GZipped_FASTA!\nACGTN\n'
+ b'>This_is_ALSO_GZipped_FASTA!\nCGTNA\n')
+ finally:
+ handle.close()
+
+
+def test_open_ro__bz2():
+ handle = open_ro(test_file('fasta_file.fasta.bz2'))
+ try:
+ assert_equal(handle.read(),
+ b'>This_is_BZ_FASTA!\nCGTNA\n'
+ b'>This_is_ALSO_BZ_FASTA!\nACGTN\n')
+ finally:
+ handle.close()
+
+
+class OddException(RuntimeError):
+ pass
+
+
+def test_open_ro__close_handle_on_error():
+ class _FileMock(object):
+ def __init__(self, filename):
+ self._close_called = False
+ assert_equal(filename, "/var/abc")
+
+ def read(self, *_args, **_kwargs):
+ # pylint: disable=R0201
+ raise OddException("ARGH!")
+
+ def close(self):
+ self._close_called = True
+
+ try:
+ paleomix.common.fileutils.open = _FileMock
+ assert_raises(OddException, open_ro, "/var/abc")
+ finally:
+ del paleomix.common.fileutils.open
+
+
+###############################################################################
+###############################################################################
+# Tests for 'try_remove'
+
+@with_temp_folder
+def test_try_remove(temp_folder):
+ fpath = os.path.join(temp_folder, "test.txt")
+ set_file_contents(fpath, "1 2 3")
+ assert try_remove(fpath)
+ assert not os.path.exists(fpath)
+
+
+@with_temp_folder
+def test_try_remove__missing(temp_folder):
+ fpath = os.path.join(temp_folder, "test.txt")
+ assert not try_remove(fpath)
+ assert not os.path.exists(fpath)
+
+
+@with_temp_folder
+def test_try_remove__non_file(temp_folder):
+ assert_raises(OSError, try_remove, temp_folder)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'try_rmdir'
+
+@with_temp_folder
+def test_try_rmdir(temp_folder):
+ fpath = os.path.join(temp_folder, "testdir")
+ os.mkdir(fpath)
+ assert try_rmdir(fpath)
+ assert not os.path.exists(fpath)
+
+
+@with_temp_folder
+def test_try_rmdir__missing(temp_folder):
+ fpath = os.path.join(temp_folder, "testdir")
+ assert not try_rmdir(fpath)
+ assert not os.path.exists(fpath)
+
+
+@with_temp_folder
+def test_try_rmdir__non_file(temp_folder):
+ fpath = os.path.join(temp_folder, "test.txt")
+ set_file_contents(fpath, "1 2 3")
+ assert_raises(OSError, try_rmdir, fpath)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'try_rmtree'
+
+@with_temp_folder
+def test_try_rmtree(temp_folder):
+ fpath = os.path.join(temp_folder, "testdir")
+ os.mkdir(fpath)
+ set_file_contents(os.path.join(fpath, "file"), "1 2 3")
+ assert try_rmtree(fpath)
+ assert not os.path.exists(fpath)
+
+
+@with_temp_folder
+def test_try_treedir__missing(temp_folder):
+ fpath = os.path.join(temp_folder, "testdir")
+ assert not try_rmtree(fpath)
+ assert not os.path.exists(fpath)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'describe_files'
+
+def test_describe_files__no_files():
+ assert_equal(describe_files(()), "No files")
+
+
+def test_describe_files__single_file():
+ fpath = "/var/foo/bar"
+ assert_equal(describe_files((fpath,)), repr(fpath))
+
+
+def test_describe_files__same_path_abs__3_differences():
+ fpaths = ("/var/foo/bar", "/var/foo/foo")
+ assert_equal(describe_files(fpaths), "2 files in '/var/foo'")
+
+
+def test_describe_files__same_path_abs__2_differences():
+ fpaths = ("/var/foo/faz", "/var/foo/foo")
+ assert_equal(describe_files(fpaths), "'/var/foo/f??'")
+
+
+def test_describe_files__same_path_abs__1_differences():
+ fpaths = ("/var/foo/faz", "/var/foo/fao")
+ assert_equal(describe_files(fpaths), "'/var/foo/fa?'")
+
+
+def test_describe_files__different_paths_abs():
+ fpaths = ("/var/foo/bar", "/var/bar/foo")
+ assert_equal(describe_files(fpaths), "2 files")
+
+
+def test_describe_files__same_path_rel():
+ fpaths = ("var/foo/bar", "var/foo/foo")
+ assert_equal(describe_files(fpaths), "2 files in 'var/foo'")
+
+
+def test_describe_files__different_paths_rel():
+ fpaths = ("var/foo/bar", "var/bar/foo")
+ assert_equal(describe_files(fpaths), "2 files")
+
+
+def test_describe_files__iterable():
+ fpaths = iter(("/var/foo/bar", "/var/foo/foo"))
+ assert_equal(describe_files(fpaths), "2 files in '/var/foo'")
+
+
+def test_describe_files__none_files():
+ assert_raises(ValueError, describe_files, None)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'describe_paired_files'
+
+def test_describe_paired_files__single_file():
+ fpath = "/var/foo/bar"
+ assert_equal(describe_paired_files((fpath,), ()), repr(fpath))
+
+
+def test_describe_paired_files__identical_files():
+ fpath = "/var/foo/bar"
+ ftuple = (fpath,)
+ assert_equal(describe_paired_files(ftuple, ftuple), repr(fpath))
+
+
+def test_describe_paired_files__same_path__similar_files():
+ files_1 = ("foo/1_abc", "foo/1_def")
+ files_2 = ("foo/1_ghi", "foo/1_jkl")
+ expected = "'foo/1_???'"
+ result = describe_paired_files(files_1, files_2)
+ assert_equal(result, expected)
+
+
+def test_describe_paired_files__same_path__similar_files__different_prefixes():
+ files_1 = ("foo/1_abc", "foo/1_def")
+ files_2 = ("foo/2_ghi", "foo/2_jkl")
+ expected = "'foo/[12]_???'"
+ result = describe_paired_files(files_1, files_2)
+ assert_equal(result, expected)
+
+
+def test_describe_paired_files__same_path__similar_files__too_different():
+ files_1 = ("foo/1a_abc", "foo/1a_def")
+ files_2 = ("foo/2b_ghi", "foo/2b_jkl")
+ expected = "2 pair(s) of files in 'foo'"
+ result = describe_paired_files(files_1, files_2)
+ assert_equal(result, expected)
+
+
+def test_describe_paired_files__same_path__different_files():
+ files_1 = ("foo/1_abc", "foo/2_def")
+ files_2 = ("foo/3_ghi", "foo/4_jkl")
+ expected = "2 pair(s) of files in 'foo'"
+ result = describe_paired_files(files_1, files_2)
+ assert_equal(result, expected)
+
+
+def test_describe_paired_files__same_path__different_file_lens():
+ files_1 = ("foo/1_a", "foo/2_de")
+ files_2 = ("foo/3_g", "foo/4_jk")
+ expected = "2 pair(s) of files in 'foo'"
+ result = describe_paired_files(files_1, files_2)
+ assert_equal(result, expected)
+
+
+def test_describe_paired_files__different_path_and_files():
+ files_1 = ("foo/1_abc", "bar/2_def")
+ files_2 = ("zed/3_ghi", "not/4_jkl")
+ expected = "2 pair(s) of files"
+ result = describe_paired_files(files_1, files_2)
+ assert_equal(result, expected)
+
+
+def test_describe_paired_files__files_1_longer():
+ assert_raises(ValueError, describe_paired_files, ("a", "b"), ("c",))
+
+
+def test_describe_paired_files__files_2_longer():
+ assert_raises(ValueError, describe_paired_files, ("a",), ("b", "c"))
+
+
+def test_describe_paired_files__none_files():
+ assert_raises(ValueError, describe_paired_files, None, None)
+
+
+def test_describe_paired_files__none_files_1():
+ assert_raises(ValueError, describe_paired_files, None, ())
+
+
+def test_describe_paired_files__none_files_2():
+ assert_raises(ValueError, describe_paired_files, (), None)
diff --git a/tests/common_tests/formats_tests/__init__.py b/tests/common_tests/formats_tests/__init__.py
new file mode 100644
index 0000000..90e5529
--- /dev/null
+++ b/tests/common_tests/formats_tests/__init__.py
@@ -0,0 +1,22 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert@snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
diff --git a/tests/common_tests/formats_tests/fasta_test.py b/tests/common_tests/formats_tests/fasta_test.py
new file mode 100644
index 0000000..bac51f6
--- /dev/null
+++ b/tests/common_tests/formats_tests/fasta_test.py
@@ -0,0 +1,296 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert@snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import StringIO
+
+import nose.tools
+from nose.tools import \
+ assert_is, \
+ assert_equal, \
+ assert_raises, \
+ assert_not_equal, \
+ assert_less, \
+ assert_less_equal, \
+ assert_greater, \
+ assert_greater_equal
+
+
+from paleomix.common.testing import assert_list_equal
+from paleomix.common.formats.fasta import \
+ FASTA, \
+ FASTAError
+
+
+###############################################################################
+###############################################################################
+
+_SEQ_FRAG = "AAGTCC" # len() = 6
+
+
+def test_dir():
+ return os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
+
+
+def test_file(*args):
+ return os.path.join(test_dir(), "data", *args)
+
+
+###############################################################################
+###############################################################################
+# Tests for FASTA constructor
+
+def _simple_fasta_record():
+ return FASTA("Dummy", "Meta-inf", "ACGT")
+
+
+def test_fasta__constructor__name():
+ record = _simple_fasta_record()
+ assert_equal(record.name, "Dummy")
+
+
+def test_fasta__constructor__meta():
+ record = _simple_fasta_record()
+ assert_equal(record.meta, "Meta-inf")
+
+
+def test_fasta__constructor__sequence():
+ record = _simple_fasta_record()
+ assert_equal(record.sequence, "ACGT")
+
+
+def test_fasta__constructor__name_must_be_non_empty():
+ assert_raises(FASTAError, FASTA, "", None, "ACGT")
+
+
+def test_fasta__constructor__name_must_be_string_type():
+ assert_raises(FASTAError, FASTA, 1, None, "ACGT")
+
+
+def test_fasta__constructor__name_must_be_string_type_or_none():
+ assert_raises(FASTAError, FASTA, "Seq1", 1, "ACGT")
+
+
+def test_fasta__constructor__sequence_must_be_string_type():
+ assert_raises(FASTAError, FASTA, "Seq1", None, 1)
+
+
+###############################################################################
+###############################################################################
+# Tests for __repr__
+
+def test_fasta__repr__partial_line_test():
+ expected = ">foobar\n%s\n" % (_SEQ_FRAG, )
+ result = repr(FASTA("foobar", None, _SEQ_FRAG))
+ assert_equal(result, expected)
+
+
+def test_fasta__repr__complete_line_test():
+ expected = ">barfoo\n%s\n" % (_SEQ_FRAG * 10, )
+ result = repr(FASTA("barfoo", None, _SEQ_FRAG * 10))
+ assert_equal(result, expected)
+
+
+def test_fasta__repr__multiple_lines():
+ expected = ">foobar\n%s\n%s\n" \
+ % (_SEQ_FRAG * 10, _SEQ_FRAG * 5)
+ result = repr(FASTA("foobar", None, _SEQ_FRAG * 15))
+ assert_equal(result, expected)
+
+
+def test_fasta__repr__partial_line_test_with_meta_information():
+ expected = ">foobar my Meta-Info\n%s\n" % (_SEQ_FRAG, )
+ result = repr(FASTA("foobar", "my Meta-Info", _SEQ_FRAG))
+ assert_equal(result, expected)
+
+
+###############################################################################
+###############################################################################
+# Tests for print_fasta
+
+
+def test_fasta__write__partial_line():
+ expected = ">foobar\n%s\n" % (_SEQ_FRAG, )
+ stringf = StringIO.StringIO()
+ FASTA("foobar", None, _SEQ_FRAG).write(stringf)
+ assert_equal(stringf.getvalue(), expected)
+
+
+def test_fasta__write__complete_line_test():
+ expected = ">barfoo\n%s\n" % (_SEQ_FRAG * 10, )
+ stringf = StringIO.StringIO()
+ FASTA("barfoo", None, _SEQ_FRAG * 10).write(stringf)
+ assert_equal(stringf.getvalue(), expected)
+
+
+def test_fasta__write__multiple_lines():
+ expected = ">foobar\n%s\n%s\n" \
+ % (_SEQ_FRAG * 10, _SEQ_FRAG * 5)
+ stringf = StringIO.StringIO()
+ FASTA("foobar", None, _SEQ_FRAG * 15).write(stringf)
+ assert_equal(stringf.getvalue(), expected)
+
+
+###############################################################################
+###############################################################################
+# Tests for FASTA.from_lines
+
+def test_fasta__from_lines__no_records():
+ assert_list_equal(FASTA.from_lines([]), [])
+
+
+def test_fasta__from_lines_single_record():
+ lines = [">single\n", "TGTTCTCCACCGTGCACAAC\n", "CCTTCATCCA\n"]
+ expected = [FASTA("single", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA")]
+ assert_list_equal(FASTA.from_lines(lines), expected)
+
+
+def test_fasta__from_lines__multiple_records():
+ lines = [">first\n", "TGTTCTCCACCGTGCACAAC\n", "CCTTCATCCA\n",
+ ">Second XT:1:0\n", "GAGAGCTCAGCTAAC\n",
+ ">Third\n", "CGCTGACCAAAAACGGACAG\n", "GGCATTCGGC\n"]
+ expected = [FASTA("first", None, "TGTTCTCCACCGTGCACAACCCTTCATCCA"),
+ FASTA("Second", "XT:1:0", "GAGAGCTCAGCTAAC"),
+ FASTA("Third", None, "CGCTGACCAAAAACGGACAGGGCATTCGGC")]
+ assert_list_equal(FASTA.from_lines(lines), expected)
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__empty_record_name_only__nothing_else():
+ list(FASTA.from_lines([">fasta1\n"]))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__empty_record_name_only__first():
+ list(FASTA.from_lines([">fasta1\n", ">fasta2\n", "AGTC\n"]))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__empty_record__middle():
+ lines = [">fasta0\n", "ACGT\n", ">fasta1\n", ">fasta2\n", "AGTC\n"]
+ list(FASTA.from_lines(lines))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__empty_record_last():
+ lines = [">fasta1\n", "ACGT\n", ">fasta2\n"]
+ list(FASTA.from_lines(lines))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__missing_name__alone():
+ lines = ["ACGT\n"]
+ list(FASTA.from_lines(lines))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__missing_name__with_others():
+ lines = ["ACGT\n", ">Foo\n", "ACGGTA\n"]
+ list(FASTA.from_lines(lines))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__empty_name__alone():
+ lines = [">\n", "ACGT\n"]
+ list(FASTA.from_lines(lines))
+
+
+@nose.tools.raises(FASTAError)
+def test_fasta__from_lines__empty_name__with_others():
+ lines = [">\n", "ACGT\n", ">Foo\n", "ACGGTA\n"]
+ list(FASTA.from_lines(lines))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'FASTA.from_file'
+
+def test_fasta__from_file__uncompressed():
+ expected = [FASTA("This_is_FASTA!", None, "ACGTN"),
+ FASTA("This_is_ALSO_FASTA!", None, "CGTNA")]
+ results = list(FASTA.from_file(test_file("fasta_file.fasta")))
+ assert_equal(results, expected)
+
+
+def test_fasta__from_file__compressed_gz():
+ expected = [FASTA("This_is_GZipped_FASTA!", None, "ACGTN"),
+ FASTA("This_is_ALSO_GZipped_FASTA!", None, "CGTNA")]
+ results = list(FASTA.from_file(test_file("fasta_file.fasta.gz")))
+ assert_equal(results, expected)
+
+
+def test_fasta__from_file__compressed_bz2():
+ expected = [FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
+ FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN")]
+ results = list(FASTA.from_file(test_file("fasta_file.fasta.bz2")))
+ assert_equal(results, expected)
+
+
+###############################################################################
+###############################################################################
+
+def test_fasta__equality():
+ assert_equal(FASTA("A", "B", "C"), FASTA("A", "B", "C"))
+
+
+def test_fasta__inequality():
+ assert_not_equal(FASTA("A", "B", "C"), FASTA("A", "B", "D"))
+ assert_not_equal(FASTA("A", "B", "C"), FASTA("A", None, "C"))
+ assert_not_equal(FASTA("A", "B", "C"), FASTA("D", "B", "C"))
+
+
+def test_fasta__sorting_less_equal():
+ assert not FASTA("A", "B", "C") < FASTA("A", "B", "C")
+ assert_less(FASTA("A", "B", "C"), FASTA("B", "B", "C"))
+ assert_less(FASTA("A", "B", "C"), FASTA("A", "C", "C"))
+ assert_less(FASTA("A", "B", "C"), FASTA("A", "B", "D"))
+ assert_less_equal(FASTA("A", "B", "C"), FASTA("A", "B", "C"))
+ assert_less_equal(FASTA("A", "B", "C"), FASTA("B", "B", "C"))
+ assert_less_equal(FASTA("A", "B", "C"), FASTA("A", "C", "C"))
+ assert_less_equal(FASTA("A", "B", "C"), FASTA("A", "B", "D"))
+
+
+def test_fasta__sorting_greater_equal():
+ assert not FASTA("A", "B", "C") > FASTA("A", "B", "C")
+ assert_greater(FASTA("B", "B", "C"), FASTA("A", "B", "C"))
+ assert_greater(FASTA("A", "C", "C"), FASTA("A", "B", "C"))
+ assert_greater(FASTA("A", "B", "D"), FASTA("A", "B", "C"))
+ assert_greater_equal(FASTA("A", "B", "C"), FASTA("A", "B", "C"))
+ assert_greater_equal(FASTA("B", "B", "C"), FASTA("A", "B", "C"))
+ assert_greater_equal(FASTA("A", "C", "C"), FASTA("A", "B", "C"))
+ assert_greater_equal(FASTA("A", "B", "D"), FASTA("A", "B", "C"))
+
+
+def test_fasta__hash():
+ assert_equal(hash(FASTA("A", "B", "C")), hash(FASTA("A", "B", "C")))
+ assert_not_equal(hash(FASTA("A", "B", "C")), hash(FASTA("B", "B", "C")))
+ assert_not_equal(hash(FASTA("A", "B", "C")), hash(FASTA("A", "C", "C")))
+ assert_not_equal(hash(FASTA("A", "B", "C")), hash(FASTA("A", "B", "D")))
+
+
+def test_fasta__unimplemented_comparison():
+ assert_is(NotImplemented, FASTA("A", None, "C").__eq__(10))
+ assert_is(NotImplemented, FASTA("A", None, "C").__lt__(10))
+ assert_is(NotImplemented, FASTA("A", None, "C").__le__(10))
+ assert_is(NotImplemented, FASTA("A", None, "C").__ge__(10))
+ assert_is(NotImplemented, FASTA("A", None, "C").__gt__(10))
+
diff --git a/tests/common_tests/formats_tests/msa_test.py b/tests/common_tests/formats_tests/msa_test.py
new file mode 100644
index 0000000..f549b1c
--- /dev/null
+++ b/tests/common_tests/formats_tests/msa_test.py
@@ -0,0 +1,436 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import copy
+import StringIO
+
+import nose.tools
+from nose.tools import \
+ assert_equal, \
+ assert_raises
+from flexmock import \
+ flexmock
+
+from paleomix.common.formats.fasta import FASTA
+from paleomix.common.formats.msa import \
+ MSA, \
+ MSAError, \
+ FASTAError
+
+
def test_dir():
    """Return the root of the tests directory (three levels up from here)."""
    path = os.path.dirname(__file__)
    for _ in range(2):
        path = os.path.dirname(path)
    return path


def test_file(*args):
    """Return the path of a file located in the tests 'data' directory."""
    return os.path.join(test_dir(), "data", *args)
+
+
+###############################################################################
+###############################################################################
+# Tests for constructor
+
def test_msa_constructor__calls_validate():
    """The MSA constructor must invoke MSA.validate on its records."""
    # The flexmock expectation is registered (and later verified) by the
    # chained call itself; binding it to an unused local served no purpose.
    flexmock(MSA).should_receive('validate').at_least.once
    MSA([FASTA("NA", None, "ACGT")])


def test_msa_constructor__duplicate_names():
    """Records with identical names are rejected with an MSAError."""
    records = [FASTA("Foo", None, "ACGT"),
               FASTA("Foo", None, "GTCA")]
    assert_raises(MSAError, MSA, records)


def test_msa_constructor__empty_msa():
    """An MSA must contain at least one record."""
    assert_raises(MSAError, MSA, [])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'seqlen' / len
+
def test_msa__len__corresponds_to_sequence_number_of_records():
    """len() counts records, not alignment columns."""
    alignment = MSA((FASTA("seq1", None, "ACGCGTATGCATGCCGA"),
                     FASTA("seq2", None, "TGAACACACAGTAGGAT")))
    assert_equal(len(alignment), 2)


def test_msa__seqlen__corresponds_to_sequence_lengths():
    """seqlen() returns the shared length of the aligned sequences."""
    alignment = MSA((FASTA("seq1", None, "ACGCGTATGCATGCCGA"),
                     FASTA("seq2", None, "TGAACACACAGTAGGAT")))
    assert_equal(alignment.seqlen(), 17)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'exclude'
+
def test_msa_exclude__remove_one():
    """Excluding a name yields an MSA without that record."""
    kept = FASTA("A", None, "ACGT")
    dropped = FASTA("B", None, "GCTA")
    result = MSA([kept, dropped]).exclude(["B"])
    assert_equal(result, MSA([kept]))


def test_msa_exclude__missing_keys():
    """Excluding a name not present in the MSA is an error."""
    alignment = MSA([FASTA("Foo", None, "ACGT")])
    assert_raises(KeyError, alignment.exclude, ["Bar"])


def test_msa_exclude__no_keys():
    """At least one name to exclude must be given."""
    alignment = MSA([FASTA("Foo", None, "ACGT")])
    assert_raises(ValueError, alignment.exclude, [])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'select'
+
def test_msa_select__remove_one():
    """Selecting a name yields an MSA containing only that record."""
    kept = FASTA("A", None, "ACGT")
    dropped = FASTA("B", None, "GCTA")
    result = MSA([kept, dropped]).select(["A"])
    assert_equal(result, MSA([kept]))


def test_msa_select__missing_keys():
    """Selecting a name not present in the MSA is an error."""
    alignment = MSA([FASTA("Foo", None, "ACGT")])
    assert_raises(KeyError, alignment.select, ["Bar"])


def test_msa_select__no_keys():
    """At least one name to select must be given."""
    alignment = MSA([FASTA("Foo", None, "ACGT")])
    assert_raises(ValueError, alignment.select, [])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'reduce'
+
def test_msa_reduce__no_empty_columns__no_columns_are_removed():
    """Columns with at least one called base are kept as-is."""
    records = [FASTA("Name_A", "Meta_A", "ACnT"),
               FASTA("Name_B", "Meta_B", "C-TN")]
    assert_equal(MSA(records).reduce(), MSA(records))


def test_msa_reduce__one_empty_column__column_are_removed():
    """A column containing only gaps / uncalled bases is stripped."""
    initial = MSA([FASTA("Name_A", "Meta_A", "AnT"),
                   FASTA("Name_B", "Meta_B", "C-N")])
    expected = MSA([FASTA("Name_A", "Meta_A", "AT"),
                    FASTA("Name_B", "Meta_B", "CN")])
    assert_equal(initial.reduce(), expected)


def test_msa_reduce__multiple_empty_column__all_empty_column_are_removed():
    """Every empty column is stripped, wherever it occurs."""
    initial = MSA([FASTA("Name_A", "Meta_A", "-AnTN"),
                   FASTA("Name_B", "Meta_B", "NC-NN")])
    expected = MSA([FASTA("Name_A", "Meta_A", "AT"),
                    FASTA("Name_B", "Meta_B", "CN")])
    assert_equal(initial.reduce(), expected)


def test_msa_reduce__only_empty_column__none_is_returned():
    """If no columns survive, reduce() returns None rather than an MSA."""
    initial = MSA([FASTA("Name_A", "Meta_A", "---Nn"),
                   FASTA("Name_B", "Meta_B", "Nn--N")])
    assert_equal(initial.reduce(), None)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'filter_singletons'
+
# Shared fixture; sequences are referenced by name in the tests below.
_FILTER_MSA_1 = MSA((FASTA("Seq1", "Meta1", "ACGNTYCSTG"),
                     FASTA("Seq2", "Meta2", "ACTA-WCCTG"),
                     FASTA("Seq3", "Meta3", "NCGGTYCGTC")))


def test_msa_filter_singletons__filter_by_second():
    """Positions in Seq1 not supported by Seq2 appear masked in the result."""
    expected = MSA((FASTA("Seq1", "Meta1", "ACnNntCcTG"),
                    FASTA("Seq2", "Meta2", "ACTA-WCCTG"),
                    FASTA("Seq3", "Meta3", "NCGGTYCGTC")))
    assert_equal(_FILTER_MSA_1.filter_singletons("Seq1", ["Seq2"]), expected)


def test_msa_filter_singletons__filter_by_third():
    """Positions in Seq1 not supported by Seq3 appear masked in the result."""
    expected = MSA((FASTA("Seq1", "Meta1", "nCGNTYCgTn"),
                    FASTA("Seq2", "Meta2", "ACTA-WCCTG"),
                    FASTA("Seq3", "Meta3", "NCGGTYCGTC")))
    assert_equal(_FILTER_MSA_1.filter_singletons("Seq1", ["Seq3"]), expected)


def test_msa_filter_singletons__filter_by_both():
    """With both other sequences as support, nothing is masked."""
    result = _FILTER_MSA_1.filter_singletons("Seq1", ["Seq2", "Seq3"])
    assert_equal(result, _FILTER_MSA_1)


def test_msa_filter_singletons__filter_by_itself():
    """A sequence cannot be listed among its own supporting sequences."""
    assert_raises(MSAError, _FILTER_MSA_1.filter_singletons,
                  "Seq1", ["Seq1", "Seq2"])


def test_msa_filter_singletons__filter_by_nothing():
    """At least one supporting sequence is required."""
    assert_raises(ValueError, _FILTER_MSA_1.filter_singletons, "Seq1", [])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.join'
+
# Shared fixtures; also reused by the validate/names/repr tests below.
_JOIN_MSA_1 = MSA((FASTA("nc", None, "ACG"),
                   FASTA("nm", None, "TGA"),
                   FASTA("miRNA", None, "UCA")))
_JOIN_MSA_2 = MSA((FASTA("nc", None, "TGA"),
                   FASTA("nm", None, "CTT"),
                   FASTA("miRNA", None, "GAC")))
_JOIN_MSA_3 = MSA((FASTA("nc", None, "AAG"),
                   FASTA("nm", None, "GAG"),
                   FASTA("miRNA", None, "CAU")))


def test_msa_join__single_msa():
    """Joining a single MSA is the identity operation."""
    assert_equal(MSA.join(_JOIN_MSA_1), _JOIN_MSA_1)


def test_msa_join__two_msa():
    """Sequences are concatenated record-wise, matched by name."""
    expected = MSA((FASTA("nc", None, "ACGTGA"),
                    FASTA("nm", None, "TGACTT"),
                    FASTA("miRNA", None, "UCAGAC")))
    assert_equal(MSA.join(_JOIN_MSA_1, _JOIN_MSA_2), expected)


def test_msa_join__three_msa():
    """Concatenation extends to any number of MSAs, in argument order."""
    expected = MSA((FASTA("nc", None, "ACGTGAAAG"),
                    FASTA("nm", None, "TGACTTGAG"),
                    FASTA("miRNA", None, "UCAGACCAU")))
    assert_equal(MSA.join(_JOIN_MSA_1, _JOIN_MSA_2, _JOIN_MSA_3), expected)


@nose.tools.raises(TypeError)
def test_msa_join__missing_arguments():
    """At least one MSA must be supplied."""
    MSA.join()
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.from_lines'
+
def test_msa_from_lines__single_entry():
    """A single FASTA record parses into a single-record MSA."""
    expected = MSA([FASTA("seq1", None, "ACG")])
    assert_equal(MSA.from_lines([">seq1", "ACG"]), expected)


def test_msa_from_lines__single_entry_with_meta():
    """Text after the first space of a header is stored as meta."""
    result = MSA.from_lines([">seq1 Meta info", "ACG"])
    assert_equal(result, MSA([FASTA("seq1", "Meta info", "ACG")]))


def test_msa_from_lines__two_entries():
    result = MSA.from_lines([">seq1", "ACG", ">seq2", "TGA"])
    assert_equal(result, MSA([FASTA("seq1", None, "ACG"),
                              FASTA("seq2", None, "TGA")]))


def test_msa_from_lines__two_entries_with_meta():
    result = MSA.from_lines([">seq1", "ACG", ">seq2 Second meta", "TGA"])
    assert_equal(result, MSA([FASTA("seq1", None, "ACG"),
                              FASTA("seq2", "Second meta", "TGA")]))


@nose.tools.raises(MSAError)
def test_msa_from_lines__duplicate_names():
    """Duplicate record names are rejected."""
    MSA.from_lines([">seq1", "ACG", ">seq1", "TGA"])


@nose.tools.raises(MSAError)
def test_msa_from_lines__mismatched_lengths():
    """All sequences in an alignment must have the same length."""
    MSA.from_lines([">seq1", "ACG", ">seq2", "TGAN"])


@nose.tools.raises(FASTAError)
def test_msa_from_lines__empty_name():
    """Headers must contain a non-empty name."""
    MSA.from_lines([">", "ACG", ">seq1", "TGAN"])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.from_file'
+
def test_msa_from_file__uncompressed():
    """Plain FASTA files are read directly."""
    expected = MSA([FASTA("This_is_FASTA!", None, "ACGTN"),
                    FASTA("This_is_ALSO_FASTA!", None, "CGTNA")])
    assert_equal(MSA.from_file(test_file("fasta_file.fasta")), expected)


def test_msa_from_file__compressed_gz():
    """gzip-compressed FASTA files are decompressed transparently."""
    expected = MSA([FASTA("This_is_GZipped_FASTA!", None, "ACGTN"),
                    FASTA("This_is_ALSO_GZipped_FASTA!", None, "CGTNA")])
    assert_equal(MSA.from_file(test_file("fasta_file.fasta.gz")), expected)


def test_msa_from_file__compressed_bz2():
    """bzip2-compressed FASTA files are decompressed transparently."""
    expected = MSA([FASTA("This_is_BZ_FASTA!", None, "CGTNA"),
                    FASTA("This_is_ALSO_BZ_FASTA!", None, "ACGTN")])
    assert_equal(MSA.from_file(test_file("fasta_file.fasta.bz2")), expected)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.split'
+
def test_msa_split_msa__single_group():
    """A single partition key reproduces the entire MSA."""
    alignment = MSA([FASTA("seq1", None, "ACGCAT"),
                     FASTA("seq2", None, "GAGTGA")])
    assert_equal(alignment.split("111"), {'1': copy.copy(alignment)})


def test_msa_split_msa__two_groups():
    """Columns are assigned to groups by cycling over 'split_by'."""
    alignment = MSA([FASTA("seq1", None, "ACGCAT"),
                     FASTA("seq2", None, "GAGTGA")])
    expected = {"1": MSA([FASTA("seq1", None, "ACCA"),
                          FASTA("seq2", None, "GATG")]),
                "2": MSA([FASTA("seq1", None, "GT"),
                          FASTA("seq2", None, "GA")])}
    assert_equal(alignment.split("112"), expected)


def test_msa_split__three_groups():
    """Three distinct keys yield three interleaved sub-alignments."""
    alignment = MSA([FASTA("seq1", None, "ACGCAT"),
                     FASTA("seq2", None, "GAGTGA")])
    expected = {"1": MSA([FASTA("seq1", None, "AC"),
                          FASTA("seq2", None, "GT")]),
                "2": MSA([FASTA("seq1", None, "CA"),
                          FASTA("seq2", None, "AG")]),
                "3": MSA([FASTA("seq1", None, "GT"),
                          FASTA("seq2", None, "GA")])}
    assert_equal(alignment.split("123"), expected)


def test_msa_split__empty_group():
    """Groups that receive no columns still appear, as empty sequences."""
    alignment = MSA([FASTA("seq1", None, "AC"),
                     FASTA("seq2", None, "GA")])
    expected = {"1": MSA([FASTA("seq1", None, "A"),
                          FASTA("seq2", None, "G")]),
                "2": MSA([FASTA("seq1", None, "C"),
                          FASTA("seq2", None, "A")]),
                "3": MSA([FASTA("seq1", None, ""),
                          FASTA("seq2", None, "")])}
    assert_equal(alignment.split("123"), expected)


def test_msa_split__partial_group():
    """The final cycle may cover only some of the groups."""
    alignment = MSA([FASTA("seq1", None, "ACGCA"),
                     FASTA("seq2", None, "GAGTG")])
    expected = {"1": MSA([FASTA("seq1", None, "AC"),
                          FASTA("seq2", None, "GT")]),
                "2": MSA([FASTA("seq1", None, "CA"),
                          FASTA("seq2", None, "AG")]),
                "3": MSA([FASTA("seq1", None, "G"),
                          FASTA("seq2", None, "G")])}
    assert_equal(alignment.split("123"), expected)


@nose.tools.raises(TypeError)
def test_msa_split_msa__no_split_by():
    """An empty 'split_by' key is rejected."""
    alignment = MSA([FASTA("seq1", None, "ACG"),
                     FASTA("seq2", None, "GAT")])
    alignment.split(split_by="")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.to_file'
+
def test_msa_to_file__complete_line_test():
    """Output wraps sequences at 60 characters per line."""
    full_line, partial_line = "ACGATA" * 10, "CGATAG" * 5
    full_line_2, partial_line_2 = "CGAATG" * 10, "TGTCAT" * 5
    alignment = MSA([FASTA("barfoo", None, full_line + partial_line),
                     FASTA("foobar", None, full_line_2 + partial_line_2)])
    expected = ">barfoo\n%s\n%s\n" % (full_line, partial_line) \
        + ">foobar\n%s\n%s\n" % (full_line_2, partial_line_2)
    handle = StringIO.StringIO()
    MSA.to_file(alignment, handle)
    assert_equal(handle.getvalue(), expected)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.validate'
+
def test_msa_validate__missing_names_first():
    """Validation fails when the first MSA lacks a name the second has."""
    truncated = MSA(list(_JOIN_MSA_1)[:-1])
    assert_raises(MSAError, MSA.validate, truncated, copy.copy(_JOIN_MSA_2))


def test_msa_validate__missing_names_second():
    """Validation fails when the second MSA lacks a name the first has."""
    truncated = MSA(list(_JOIN_MSA_2)[:-1])
    assert_raises(MSAError, MSA.validate, copy.copy(_JOIN_MSA_1), truncated)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'MSA.names'
+
def test_msa_names():
    """names() returns the set of record names in the MSA."""
    expected = set(["nc", "nm", "miRNA"])
    assert_equal(_JOIN_MSA_1.names(), expected)
+
+
+###############################################################################
+###############################################################################
+# Tests for str/repr
+
def test_msa_repr():
    """str() lists the records as FASTA reprs, sorted by name."""
    alignment = MSA((FASTA("nc", None, "ACGTA"),
                     FASTA("nm", "META", "TGAGT"),
                     FASTA("miRNA", None, "UCAGA")))
    expected = ("MSA(FASTA('miRNA', None, 'UCAGA'), "
                "FASTA('nc', None, 'ACGTA'), "
                "FASTA('nm', 'META', 'TGAGT'))")
    assert_equal(str(alignment), expected)


def test_msa_repr__same_as_str():
    """repr() and str() produce identical output."""
    assert_equal(str(_JOIN_MSA_1), repr(_JOIN_MSA_1))
diff --git a/tests/common_tests/formats_tests/newick_tests.py b/tests/common_tests/formats_tests/newick_tests.py
new file mode 100644
index 0000000..ad643e4
--- /dev/null
+++ b/tests/common_tests/formats_tests/newick_tests.py
@@ -0,0 +1,635 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from nose.tools import \
+ assert_equal, \
+ assert_not_equal, \
+ assert_raises
+
+from paleomix.common.formats.newick import \
+ Newick, \
+ GraphError, \
+ NewickError, \
+ NewickParseError
+
+from paleomix.common.testing import \
+ assert_list_equal
+
+
+###############################################################################
+###############################################################################
+# Constructor
+
def test_newick__constructor__name():
    """The 'name' argument is stored as-is."""
    assert_equal(Newick(name="AbC").name, "AbC")


def test_newick__constructor__children_set_in_internal_nodes():
    """Children passed to the constructor are exposed as a tuple."""
    leaf = Newick(name="Leaf")
    assert_equal(Newick(children=[leaf]).children, (leaf,))


def test_newick__constructor__children_not_set_in_leaf_nodes():
    """Leaf nodes expose an empty tuple of children."""
    assert_equal(Newick(name="Leaf").children, ())


def test_newick__constructor__is_leaf_true_for_leaf_nodes():
    assert Newick(name="Another Leaf").is_leaf


def test_newick__constructor__is_leaf_false_for_internal_nodes():
    assert not Newick(children=[Newick(name="Leaf")]).is_leaf


def test_newick__constuctor__leaf_nodes_must_have_name_or_length():
    # NOTE(review): 'constuctor' typo kept to preserve the public test name.
    assert_raises(NewickError, Newick, children=None)


def test_newick__constructor__internal_nodes_must_have_children():
    """An (unnamed) internal node must have at least one child."""
    assert_raises(NewickError, Newick, children=[])


def test_newick__constructor__children_must_be_newick():
    """Children must themselves be Newick nodes."""
    assert_raises(TypeError, Newick, children=["A", "B"])
+
+
+###############################################################################
+###############################################################################
+# get_leaf_nodes
+
def test_newick__get_leaf_nodes__leaf_returns_self():
    """A leaf is its own only leaf node."""
    leaf = Newick(name="Leaf")
    assert_list_equal(leaf.get_leaf_nodes(), [leaf])


def test_newick__get_leaf_nodes__internal_node_returns_leaf_nodes():
    leaf_a = Newick(name="Leaf A")
    leaf_b = Newick(name="Leaf B")
    root = Newick(children=[leaf_a, leaf_b])
    assert_list_equal(root.get_leaf_nodes(), [leaf_a, leaf_b])


def test_newick__get_leaf_nodes__complex_case():
    """Leaves of nested clades are collected depth-first."""
    leaf_a = Newick(name="Leaf A")
    leaf_b = Newick(name="Leaf B")
    leaf_c = Newick(name="Leaf C")
    root = Newick(children=[leaf_a, Newick(children=[leaf_b, leaf_c])])
    assert_list_equal(root.get_leaf_nodes(), [leaf_a, leaf_b, leaf_c])
+
+
+###############################################################################
+###############################################################################
+# get_leaf_names
+
def test_newick__get_leaf_names__leaf_returns_self():
    """A leaf's own name is its only leaf name."""
    assert_list_equal(Newick(name="Leaf").get_leaf_names(), ["Leaf"])


def test_newick__get_leaf_names__internal_node_returns_leaf_nodes():
    root = Newick(children=[Newick(name="Leaf A"),
                            Newick(name="Leaf B")])
    assert_list_equal(root.get_leaf_names(), ["Leaf A", "Leaf B"])


def test_newick__get_leaf_names__complex_case():
    """Names of leaves in nested clades are collected depth-first."""
    inner = Newick(children=[Newick(name="Leaf B"),
                             Newick(name="Leaf C")])
    root = Newick(children=[Newick(name="Leaf A"), inner])
    assert_list_equal(root.get_leaf_names(), ["Leaf A", "Leaf B", "Leaf C"])
+
+
+###############################################################################
+###############################################################################
+# reroot_on_taxa
+
def test_newick__reroot_on_taxa__single_taxa():
    """Rerooting on one taxon places it next to the new root."""
    rerooted = Newick.from_string("((A,B),C);").reroot_on_taxa("A")
    assert_equal(Newick.from_string("((B,C),A);"), rerooted)


def test_newick__reroot_on_taxa__single_taxa_with_branch_lengths():
    """Branch lengths are redistributed across the new root."""
    rerooted = Newick.from_string("((A:4,B:3):2,C:1);").reroot_on_taxa("A")
    assert_equal(Newick.from_string("((B:3,C:3.0):2.0,A:2.0);"), rerooted)


def test_newick__reroot_on_taxa__multiple_taxa__clade():
    """A set of taxa forming a clade becomes one side of the root."""
    source = Newick.from_string("((A,(B,C)),(D,E));")
    rerooted = source.reroot_on_taxa(("B", "C"))
    assert_equal(Newick.from_string("(((D,E),A),(B,C));"), rerooted)


def test_newick__reroot_on_taxa__multiple_taxa__paraphylogeny():
    source = Newick.from_string("((B,C),((D,E),A));")
    rerooted = source.reroot_on_taxa(("A", "C"))
    assert_equal(Newick.from_string("(((B,C),A),(D,E));"), rerooted)


def test_newick__reroot_on_taxa__no_taxa():
    """At least one taxon must be given."""
    source = Newick.from_string("((B,C),((D,E),A));")
    assert_raises(ValueError, source.reroot_on_taxa, ())


def test_newick__reroot_on_taxa__unknown_taxa():
    """All given taxa must exist in the tree."""
    source = Newick.from_string("((B,C),((D,E),A));")
    assert_raises(ValueError, source.reroot_on_taxa, ("A", "Z"))


def test_newick__reroot_on_taxa__no_non_root_taxa():
    """Rerooting on the complete set of leaves is meaningless."""
    source = Newick.from_string("((B,C),((D,E),A));")
    assert_raises(ValueError, source.reroot_on_taxa, ("A", "B", "C", "D", "E"))
+
+
+###############################################################################
+###############################################################################
+# reroot_on_midpoint
+
def test_newick__reroot_on_midpoint__single_node():
    """A single-leaf tree is returned unchanged."""
    source = Newick.from_string("(A:3.0);")
    assert_equal(Newick.from_string("(A:3.0);"), source.reroot_on_midpoint())


def test_newick__reroot_on_midpoint__two_nodes():
    """The root is placed halfway along the total path (11 / 2 = 5.5)."""
    rerooted = Newick.from_string("(A:3.0,B:8.0);").reroot_on_midpoint()
    assert_equal(Newick.from_string("(A:5.5,B:5.5);"), rerooted)


def test_newick__reroot_on_midpoint__two_clades():
    source = Newick.from_string("((A:7,B:2):1,(C:1,D:0.5):2);")
    expected = Newick.from_string("(((C:1,D:0.5):3.0,B:2):1.5,A:5.5);")
    assert_equal(expected, source.reroot_on_midpoint())


def test_newick__reroot_on_midpoint__nested_clades():
    source = Newick.from_string("((A:2,(B:2,C:3):4):1,(D:1,E:0.5):2);")
    expected = Newick.from_string("(((D:1,E:0.5):3.0,A:2):1.5,(B:2,C:3):2.5);")
    assert_equal(expected, source.reroot_on_midpoint())


def test_newick__reroot_on_midpoint__reroot_on_internal_node():
    """The midpoint may coincide with an existing (named) internal node."""
    source = Newick.from_string("((A:5.0,B:1.0)C:2.0,D:3.0);")
    expected = Newick.from_string("(A:5.0,B:1.0,D:5.0)C;")
    assert_equal(expected, source.reroot_on_midpoint())


def test_newick__reroot_on_midpoint__invalid_branch_lengths():
    def _check_raises(newick):
        source = Newick.from_string(newick)
        assert_raises(GraphError, source.reroot_on_midpoint)

    # Midpoint rooting needs a complete set of non-negative branch lengths.
    for tree in ("(A,B);",            # No branch lengths
                 "(A:7,B);",          # Length missing for leaf node
                 "(A:7,(B:3));",      # Length missing for internal node
                 "(A:7,(B:3):-1);",   # Negative branch length
                 "(A:7,B:-1);"):      # Negative leaf length
        yield _check_raises, tree
+
+
+###############################################################################
+###############################################################################
+# add_support
+
def test_newick__add_support__no_trees():
    """With no bootstrap trees, every inner node gets support 0."""
    result = Newick.from_string("(((A,B),C),D);").add_support([])
    assert_equal(Newick.from_string("(((A,B)0,C)0,D);"), result)


def test_newick__add_support__single_identical_tree():
    main_tree = Newick.from_string("(((A,B),C),D);")
    bootstraps = [Newick.from_string("(((A,B),C),D);")]
    assert_equal(Newick.from_string("(((A,B)1,C)1,D);"),
                 main_tree.add_support(bootstraps))


def test_newick__add_support__single_identical_tree__different_rooting():
    """Support counting is independent of how replicates are rooted."""
    main_tree = Newick.from_string("(((A,B),C),D);")
    bootstraps = [Newick.from_string("(((C,D),B),A);")]
    assert_equal(Newick.from_string("(((A,B)1,C)1,D);"),
                 main_tree.add_support(bootstraps))


def test_newick__add_support__multiple_trees__different_topologies():
    main_tree = Newick.from_string("(((A,B),C),D);")
    bootstraps = [Newick.from_string("(((C,B),D),A);"),
                  Newick.from_string("(((A,D),B),C);")]
    assert_equal(Newick.from_string("(((A,B)0,C)2,D);"),
                 main_tree.add_support(bootstraps))


def test_newick__add_support__multiple_trees__partially_different_topologies():
    main_tree = Newick.from_string("(((A,B),C),D);")
    bootstraps = [Newick.from_string("(((C,D),A),B);"),
                  Newick.from_string("(((A,D),B),C);")]
    assert_equal(Newick.from_string("(((A,B)1,C)2,D);"),
                 main_tree.add_support(bootstraps))


def test_newick__add_support__multiple_trees__two_cladees():
    # NOTE(review): 'cladees' typo kept to preserve the public test name.
    main_tree = Newick.from_string("((A,B),(C,(D,E)));")
    bootstraps = [Newick.from_string("((((C,E),D),A),B);"),
                  Newick.from_string("(((A,(C,D)),B),E);")]
    assert_equal(Newick.from_string("((A,B)1,(C,(D,E)0)1);"),
                 main_tree.add_support(bootstraps))


def test_newick__add_support__differing_leaf_names():
    """Replicates must contain exactly the same leaves as the main tree."""
    main_tree = Newick.from_string("(((A,B),C),D);")
    bootstraps = [Newick.from_string("(((C,E),B),A);")]
    assert_raises(NewickError, main_tree.add_support, bootstraps)


def test_newick__add_support__formatting():
    def _check_formatting(fmt, expected):
        main_tree = Newick.from_string("(((A,B),C),D);")
        bootstraps = [Newick.from_string("(((C,D),A),B);"),
                      Newick.from_string("(((C,B),A),D);"),
                      Newick.from_string("(((A,D),B),C);")]
        result = main_tree.add_support(bootstraps, fmt)
        assert_equal(Newick.from_string(expected), result)

    # Support may be rendered as a count, a percentage, or a fraction.
    yield _check_formatting, "{Support}", "(((A,B)1,C)3,D);"
    yield _check_formatting, "{Percentage:.0f}", "(((A,B)33,C)100,D);"
    yield _check_formatting, "{Fraction:.2f}", "(((A,B)0.33,C)1.00,D);"


def test_newick__add_support__unique_names_required():
    """Duplicate leaf names make support counting ill-defined."""
    main_tree = Newick.from_string("(((A,B),C),A);")
    bootstraps = [Newick.from_string("(((A,B),C),A);")]
    assert_raises(NewickError, main_tree.add_support, bootstraps)
+
+
+###############################################################################
+###############################################################################
+# from_string
+
def test_newick__parse__minimal_newick__name_only():
    """A bare name followed by ';' is a valid single-node tree."""
    assert_equal(Newick.from_string("A;"), Newick(name="A"))


def test_newick__parse__single_taxa():
    expected = Newick(children=[Newick(name="Ab")])
    assert_equal(Newick.from_string("(Ab);"), expected)


def test_newick__parse__two_taxa():
    expected = Newick(children=[Newick(name="A"), Newick(name="Bc")])
    assert_equal(Newick.from_string("(A,Bc);"), expected)


def test_newick__parse__three_taxa():
    expected = Newick(children=[Newick(name="A"),
                                Newick(name="Bc"),
                                Newick(name="DeF")])
    assert_equal(Newick.from_string("(A,Bc,DeF);"), expected)


def test_newick__parse__ignore_whitespace():
    """Whitespace between tokens has no significance."""
    assert_equal(Newick.from_string("(A,B);"), Newick.from_string("(A, B);"))


def test_newick__parse__missing_semicolon():
    """The trailing ';' is mandatory."""
    assert_raises(NewickParseError, Newick.from_string, "()")


def test_newick__parse__subnode__single_taxa():
    inner = Newick(children=[Newick(name="B")])
    expected = Newick(children=[Newick(name="A"), inner])
    assert_equal(Newick.from_string("(A,(B));"), expected)


def test_newick__parse__subnode__two_taxa():
    inner = Newick(children=[Newick(name="B"), Newick(name="C")])
    expected = Newick(children=[Newick(name="A"), inner])
    assert_equal(Newick.from_string("(A,(B,C));"), expected)
+
+
+###########################################################################
+###########################################################################
+# cmp - white-box, just make sure all properties are compared
+
def test_newick__cmp__identical():
    """Structurally identical nodes compare equal."""
    node_a = Newick(name="A", length=13, children=[Newick(name="B")])
    node_b = Newick(name="A", length=13, children=[Newick(name="B")])
    assert_equal(node_a, node_b)


def test_newick__cmp__identical_for_empty_string_length():
    """An empty-string length compares equal to no length at all."""
    node_a = Newick(name="A", length="", children=[Newick(name="B")])
    node_b = Newick(name="A", length=None, children=[Newick(name="B")])
    assert_equal(node_a, node_b)


def test_newick__cmp__identical_for_empty_string_name():
    """An empty-string name compares equal to no name at all."""
    node_a = Newick(name="", length=13, children=[Newick(name="B")])
    node_b = Newick(name=None, length=13, children=[Newick(name="B")])
    assert_equal(node_a, node_b)


def test_newick__cmp__not_identical():
    def _assert_differs(node_b):
        node_a = Newick(name="A", length=13, children=[Newick(name="B")])
        assert_not_equal(node_a, node_b)

    # Changing any single property must break equality.
    yield _assert_differs, Newick(name="B", length=13, children=[Newick(name="B")])
    yield _assert_differs, Newick(name="A", length=14, children=[Newick(name="B")])
    yield _assert_differs, Newick(name="A", length=13, children=[])
    yield _assert_differs, Newick(name="A", length=13, children=[Newick(name="C")])
    yield _assert_differs, Newick(name="B", length=14, children=[Newick(name="C")])
+
+
+###############################################################################
+###############################################################################
+# hash - white-box, just make sure all properties are used
+
def test_newick__hash__identical():
    """Equal nodes hash equally."""
    node_a = Newick(name="A", length=13, children=[Newick(name="B")])
    node_b = Newick(name="A", length=13, children=[Newick(name="B")])
    assert_equal(hash(node_a), hash(node_b))


def test_newick__hash__not_identical():
    def _assert_hash_differs(node_b):
        node_a = Newick(name="A", length=13, children=[Newick(name="B")])
        assert_not_equal(hash(node_a), hash(node_b))

    # Every property must contribute to the hash.
    yield _assert_hash_differs, Newick(name="B", length=13, children=[Newick(name="B")])
    yield _assert_hash_differs, Newick(name="A", length=14, children=[Newick(name="B")])
    yield _assert_hash_differs, Newick(name="A", length=13, children=[])
    yield _assert_hash_differs, Newick(name="A", length=13, children=[Newick(name="C")])
    yield _assert_hash_differs, Newick(name="B", length=14, children=[Newick(name="C")])


def test_newick__hash__hashable():
    """Equal nodes are interchangeable as dictionary keys."""
    key_a = Newick(name="A", length=13.7, children=[Newick(name="F")])
    key_b = Newick(name="A", length=13.7, children=[Newick(name="F")])
    assert key_b in {key_a: True}
+
+
+###############################################################################
+###############################################################################
+# Malformed newick strings
+
def test_newick__malformed__unbalanced_parantheses():
    parse = Newick.from_string
    assert_raises(NewickParseError, parse, "(A,(B,C);")


def test_newick__malformed__mismatched_parantheses():
    parse = Newick.from_string
    assert_raises(NewickParseError, parse, "(A,(B,C();")


def test_newick__malformed__missing_parantheses():
    parse = Newick.from_string
    assert_raises(NewickParseError, parse, "(A,(B,C))")


def test_newick__malformed__missing_length():
    """A ':' must always be followed by a branch length."""
    for tree in ("(A:,(B,C));",
                 "(A,(B:,C));",
                 "(A,(B,C:));",
                 "(A,(B,C):);",
                 "(A,(B,C)):;"):
        assert_raises(NewickParseError, Newick.from_string, tree)


def test_newick__malformed__multiple_lengths():
    """A node may carry at most one branch length."""
    assert_raises(NewickParseError, Newick.from_string, "(A:1:2);")
+
+
+###############################################################################
+###############################################################################
+# Implicit leaves are not supported (due to problems with ambiguity)
+
def test_newick__parse__first_taxa_unnamed():
    """A leading ',' would imply an anonymous leaf."""
    parse = Newick.from_string
    assert_raises(NewickError, parse, "(,A);")


def test_newick__parse__second_taxa_unnamed():
    """A trailing ',' would imply an anonymous leaf."""
    parse = Newick.from_string
    assert_raises(NewickError, parse, "(A,);")


def test_newick__parse__two_taxa_unnamed():
    parse = Newick.from_string
    assert_raises(NewickError, parse, "(,);")


def test_newick__parse__three_taxa_unnamed():
    parse = Newick.from_string
    assert_raises(NewickError, parse, "(,,);")
+
+
+###############################################################################
+###############################################################################
+# Empty non-leaf nodes are not allowed, as their interpretation is unclear
+
def test_newick__parse__minimal_newick__implicit_nodes():
    # "();" would describe a parent with a single, empty child.
    assert_raises(NewickParseError, Newick.from_string, "();")


def test_newick__parse__subnode__empty():
    # Empty internal (sub-)nodes are rejected as well.
    assert_raises(NewickParseError, Newick.from_string, "(A,());")
+
+
+###############################################################################
+###############################################################################
+# The following tests are derived from the wikipedia description of the
+# newick format: http://en.wikipedia.org/wiki/Newick_format#Examples
+
def test_newick__wikipedia_example_1():
    # no nodes are named, this format is not supported here!
    # (implicit/unnamed leaves are rejected; see tests above)
    assert_raises(NewickError, Newick.from_string, "(,,(,));")
+
+
def test_newick__wikipedia_example_2():
    # leaf nodes are named; internal nodes carry neither name nor length
    leaves = dict((name, Newick(name=name)) for name in "ABCD")
    inner = Newick(children=[leaves["C"], leaves["D"]])
    expected = Newick(children=[leaves["A"], leaves["B"], inner])
    assert_equal(Newick.from_string("(A,B,(C,D));"), expected)
+
+
def test_newick__wikipedia_example_3():
    # all nodes are named
    taxa_d = Newick(name="D")
    taxa_c = Newick(name="C")
    taxa_sub = Newick(children=[taxa_c, taxa_d], name="E")
    taxa_b = Newick(name="B")
    taxa_a = Newick(name="A")
    top_node = Newick(children=[taxa_a, taxa_b, taxa_sub], name="F")
    assert_equal(Newick.from_string("(A,B,(C,D)E)F;"), top_node)


def test_newick__wikipedia_example_4():
    # all but root node have a distance to parent
    # (lengths are compared as strings, matching the parser's output)
    taxa_d = Newick(length="0.4")
    taxa_c = Newick(length="0.3")
    taxa_sub = Newick(children=[taxa_c, taxa_d], length="0.5")
    taxa_b = Newick(length="0.2")
    taxa_a = Newick(length="0.1")
    top_node = Newick(children=[taxa_a, taxa_b, taxa_sub])
    assert_equal(Newick.from_string("(:0.1,:0.2,(:0.3,:0.4):0.5);"), top_node)


def test_newick__wikipedia_example_5():
    # all have a distance to parent, including the root
    taxa_d = Newick(length="0.4")
    taxa_c = Newick(length="0.3")
    taxa_sub = Newick(children=[taxa_c, taxa_d], length="0.5")
    taxa_b = Newick(length="0.2")
    taxa_a = Newick(length="0.1")
    top_node = Newick(children=[taxa_a, taxa_b, taxa_sub], length="0.0")
    assert_equal(Newick.from_string("(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;"), top_node)


def test_newick__wikipedia_example_6():
    # distances and leaf names (popular)
    taxa_d = Newick(length="0.4", name="D")
    taxa_c = Newick(length="0.3", name="C")
    taxa_sub = Newick(children=[taxa_c, taxa_d], length="0.5")
    taxa_b = Newick(length="0.2", name="B")
    taxa_a = Newick(length="0.1", name="A")
    top_node = Newick(children=[taxa_a, taxa_b, taxa_sub])
    assert_equal(Newick.from_string("(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"), top_node)


def test_newick__wikipedia_example_7():
    # distances and all names
    taxa_d = Newick(length="0.4", name="D")
    taxa_c = Newick(length="0.3", name="C")
    taxa_sub = Newick(children=[taxa_c, taxa_d], length="0.5", name="E")
    taxa_b = Newick(length="0.2", name="B")
    taxa_a = Newick(length="0.1", name="A")
    top_node = Newick(children=[taxa_a, taxa_b, taxa_sub], name="F")
    assert_equal(Newick.from_string("(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;"), top_node)
+
+
def test_newick__wikipedia_example_8():
    # a tree rooted on a leaf node (rare)
    subtree_e = Newick(length="0.5", name="E",
                       children=[Newick(length="0.3", name="C"),
                                 Newick(length="0.4", name="D")])
    subtree_f = Newick(length="0.1", name="F",
                       children=[Newick(length="0.2", name="B"), subtree_e])
    expected = Newick(name="A", children=[subtree_f])
    assert_equal(Newick.from_string("((B:0.2,(C:0.3,D:0.4)E:0.5)F:0.1)A;"),
                 expected)
+
+
+###############################################################################
+###############################################################################
+# str / repr
+
def test_newick__str__non_string_name():
    # Non-string names are stringified during serialization.
    node = Newick(children=[Newick(name=17, length="1.3")])
    assert_equal(str(node), "(17:1.3);")


def test_newick__str__non_string_length():
    # Likewise for non-string branch lengths.
    node = Newick(children=[Newick(name="Foo", length=1.3)])
    assert_equal(str(node), "(Foo:1.3);")
+
+
def test_newick__str__repr_equal_to_str():
    """Serialization via repr() must match serialization via str().

    NOTE(review): the original test only exercised str(); the repr()
    comparison implied by the test name was missing and has been added.
    """
    node_a = Newick(name="A", length="123")
    node_b = Newick(name="B", length="4567")
    top_node = Newick(children=[node_a, node_b])
    assert_equal(str(top_node), "(A:123,B:4567);")
    assert_equal(repr(top_node), str(top_node))
+
+
def test_newick__str__single_leaf_should_not_be_followed_by_comma():
    # A parent with a single child serializes without a trailing comma.
    node = Newick(name="A")
    top_node = Newick(children=[node])
    assert_equal(str(top_node), "(A);")
+
+
def test_newick__wikipedia_examples__str_equality():
    """Round-trip check: parsing and re-serializing the wikipedia examples
    must reproduce the input string exactly (nose generator test)."""
    def _newick_str_input_equals_output(nwk_str):
        nodes = Newick.from_string(nwk_str)
        result = str(nodes)
        assert_equal(result, nwk_str)

    # 2. leaf nodes are named
    yield _newick_str_input_equals_output, "(A,B,(C,D));"
    # 3. all nodes are named
    yield _newick_str_input_equals_output, "(A,B,(C,D)E)F;"
    # 4. all but root node have a distance to parent
    yield _newick_str_input_equals_output, "(:0.1,:0.2,(:0.3,:0.4):0.5);"
    # 5. all have a distance to parent
    yield _newick_str_input_equals_output, "(:0.1,:0.2,(:0.3,:0.4):0.5):0.0;"
    # 6. distances and leaf names (popular)
    yield _newick_str_input_equals_output, "(A:0.1,B:0.2,(C:0.3,D:0.4):0.5);"
    # 7. distances and all names
    yield _newick_str_input_equals_output, "(A:0.1,B:0.2,(C:0.3,D:0.4)E:0.5)F;"
+
+
+###############################################################################
+###############################################################################
+# Immutability
+
def test_newick__properties_are_immutable():
    """Assigning to any attribute -- existing or new -- must raise
    NotImplementedError (nose generator test)."""
    def _test_newick__properties_are_immutable(name, value):
        node = Newick(name="A", length=3, children=[Newick(name="B")])
        assert_raises(NotImplementedError, setattr, node, name, value)
    yield _test_newick__properties_are_immutable, "name", "foo"
    yield _test_newick__properties_are_immutable, "length", "13"
    yield _test_newick__properties_are_immutable, "children", []
    yield _test_newick__properties_are_immutable, "foobar", True


def test_newick__properties_cannot_be_deleted():
    """Deleting any attribute must likewise raise NotImplementedError."""
    def _test_newick__properties_cannot_be_deleted(name):
        node = Newick(name="A", length=3, children=[Newick(name="B")])
        assert_raises(NotImplementedError, delattr, node, name)
    yield _test_newick__properties_cannot_be_deleted, "name"
    yield _test_newick__properties_cannot_be_deleted, "length"
    yield _test_newick__properties_cannot_be_deleted, "children"
    yield _test_newick__properties_cannot_be_deleted, "foobar"
diff --git a/tests/common_tests/formats_tests/phylip_test.py b/tests/common_tests/formats_tests/phylip_test.py
new file mode 100644
index 0000000..62f4adb
--- /dev/null
+++ b/tests/common_tests/formats_tests/phylip_test.py
@@ -0,0 +1,186 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from flexmock import flexmock
+from nose.tools import assert_equal
+
+from paleomix.common.formats.phylip import \
+ sequential_phy, \
+ interleaved_phy
+
+from paleomix.common.formats.msa import \
+ MSA
+
+from paleomix.common.formats.fasta import \
+ FASTA
+
+
# Fixture alignments: 15 bp fits within a single 10-column block,
# 44 bp fits on one output line, and 140 bp forces line wrapping.
_MSA_SHORT_SEQUENCES = \
    MSA([FASTA("seq1", None, "ACGTTGATAACCAGG"),
         FASTA("seq2", None, "TGCAGAGTACGACGT")])
_MSA_MEDIUM_SEQUENCES = \
    MSA([FASTA("seq1", None, "ACGTTGATAACCAGGAGGGATTCGCGATTGGTGGTAACGTAGCC"),
         FASTA("seq2", None, "TGCAGAGTACGACGTCTCCTAGATCCTGGACAATTTAAACCGAA")])
_MSA_LONG_SEQUENCES = \
    MSA([FASTA("seq1", None, "CGGATCTGCTCCTCCACTGGCCACGTTTACTGTCCCCCAACCGTT"
               "CGTCCCGACCTAGTTATACTTCTTAGCAAGGTGTAAAACCAGAGATTGAGGTTATAACG"
               "TTCCTAATCAGTTATTAAATTACCGCGCCCCGACAG"),
         FASTA("seq2", None, "AGTTGAAGAGGCGGAACGTTTGTAAACCGCGCTAACGTAGTTCTA"
               "CAACCAGCCACCCGGTTCGAAGGAACAACTGGTCGCCATAATTAGGCGAAACGATAGTG"
               "CACTAAGGTCAGGTGCGCCCCTGTAAATAATTAGAT")])

# Name fixtures: 22-character names fit; names beyond 30 characters are
# truncated in the PHYLIP output (see the *_long_names tests below).
_MSA_MEDIUM_NAMES = \
    MSA([FASTA("A_really_long_sequence", None, "ACGTTGATAACCAGG"),
         FASTA("Another_real_long_one!", None, "TGCAGAGTACGACGT")])
_MSA_LONG_NAMES = \
    MSA([FASTA("A_really_long_sequence_name_that_is_in_fact_too_long", None, "ACGTTGATAACCAGG"),
         FASTA("Another_really_long_sequence_name_that_is_too_long", None, "TGCAGAGTACGACGT")])
+
+
+###############################################################################
+###############################################################################
+# Tests of 'sequential_phy'
+
def test_sequential_phy__short_sequences():
    # NOTE(review): despite the name, this uses the 44 bp "medium"
    # alignment; _MSA_SHORT_SEQUENCES is only used by the flag test.
    expected = """2 44

seq1
ACGTTGATAA CCAGGAGGGA TTCGCGATTG GTGGTAACGT AGCC
seq2
TGCAGAGTAC GACGTCTCCT AGATCCTGGA CAATTTAAAC CGAA"""
    assert_equal(sequential_phy(_MSA_MEDIUM_SEQUENCES), expected)


def test_sequential_phy__multi_line_sequences():
    # Sequences longer than one output line wrap onto additional lines.
    expected = """2 140

seq1
CGGATCTGCT CCTCCACTGG CCACGTTTAC TGTCCCCCAA CCGTTCGTCC CGACCTAGTT
ATACTTCTTA GCAAGGTGTA AAACCAGAGA TTGAGGTTAT AACGTTCCTA ATCAGTTATT
AAATTACCGC GCCCCGACAG
seq2
AGTTGAAGAG GCGGAACGTT TGTAAACCGC GCTAACGTAG TTCTACAACC AGCCACCCGG
TTCGAAGGAA CAACTGGTCG CCATAATTAG GCGAAACGAT AGTGCACTAA GGTCAGGTGC
GCCCCTGTAA ATAATTAGAT"""
    assert_equal(sequential_phy(_MSA_LONG_SEQUENCES), expected)


def test_sequential_phy__with_flag():
    # add_flag=True appends 'S' (sequential) to the header line.
    expected = """2 15 S

seq1
ACGTTGATAA CCAGG
seq2
TGCAGAGTAC GACGT"""
    assert_equal(sequential_phy(_MSA_SHORT_SEQUENCES, add_flag=True), expected)


def test_sequentual_phy__long_names():
    # NOTE(review): "sequentual" (sic) kept to preserve the test ID.
    # Names are truncated to 30 characters in the output.
    expected = """2 15

A_really_long_sequence_name_th
ACGTTGATAA CCAGG
Another_really_long_sequence_n
TGCAGAGTAC GACGT"""
    assert_equal(sequential_phy(_MSA_LONG_NAMES), expected)


def test_sequential_phy__different_lengths():
    # flexmock verifies at teardown that MSA.validate() was invoked.
    _mock = flexmock(MSA).should_receive('validate').at_least.once
    sequential_phy(_MSA_MEDIUM_NAMES)
+
+
+###############################################################################
+###############################################################################
+# Tests of 'interleaved_phy'
+
def test_interleaved_phy__short_sequences():
    # NOTE(review): despite the name, this uses the 44 bp "medium"
    # alignment; name and sequence share one line in interleaved format.
    expected = """2 44

seq1 ACGTTGATAA CCAGGAGGGA TTCGCGATTG GTGGTAACGT AGCC
seq2 TGCAGAGTAC GACGTCTCCT AGATCCTGGA CAATTTAAAC CGAA"""
    assert_equal(interleaved_phy(_MSA_MEDIUM_SEQUENCES), expected)


def test_interleaved_phy__multi_line_sequences():
    # Long alignments are emitted in blocks separated by blank lines;
    # only the first block carries the sequence names.
    expected = """2 140

seq1 CGGATCTGCT CCTCCACTGG CCACGTTTAC TGTCCCCCAA CCGTTCGTCC
seq2 AGTTGAAGAG GCGGAACGTT TGTAAACCGC GCTAACGTAG TTCTACAACC

CGACCTAGTT ATACTTCTTA GCAAGGTGTA AAACCAGAGA TTGAGGTTAT AACGTTCCTA
AGCCACCCGG TTCGAAGGAA CAACTGGTCG CCATAATTAG GCGAAACGAT AGTGCACTAA

ATCAGTTATT AAATTACCGC GCCCCGACAG
GGTCAGGTGC GCCCCTGTAA ATAATTAGAT"""
    assert_equal(interleaved_phy(_MSA_LONG_SEQUENCES), expected)


def test_interleaved_phy__with_flag():
    # add_flag=True appends 'I' (interleaved) to the header line.
    expected = """2 15 I

seq1 ACGTTGATAA CCAGG
seq2 TGCAGAGTAC GACGT"""
    assert_equal(interleaved_phy(_MSA_SHORT_SEQUENCES, add_flag=True), expected)


def test_interleaved_phy__medium_names():
    expected = """2 15

A_really_long_sequence ACGTTGATAA CCAGG
Another_real_long_one! TGCAGAGTAC GACGT"""
    assert_equal(interleaved_phy(_MSA_MEDIUM_NAMES), expected)


def test_interleaved_phy__long_names():
    # Names longer than 30 characters are truncated.
    expected = """2 15

A_really_long_sequence_name_th ACGTTGATAA CCAGG
Another_really_long_sequence_n TGCAGAGTAC GACGT"""
    assert_equal(interleaved_phy(_MSA_LONG_NAMES), expected)
+
+
def test_sequentual_phy__different_length_names_1():
    # NOTE(review): despite the "sequentual_phy" (sic) prefix, this test
    # exercises interleaved_phy; the name is kept to preserve test IDs.
    # Fixed: removed a stray Python 2 debug 'print' statement that only
    # polluted the test output and added nothing to the assertion.
    # NOTE(review): the whitespace in 'expected' is reproduced from the
    # source; the name-column padding may have been collapsed in
    # transmission -- verify against the sibling *_names tests.
    msa = MSA([FASTA("A_short_name", None, "ACGTTGATAACCAGG"),
               FASTA("Another_really_long_sequence_name_that_is_too_long", None, "TGCAGAGTACGACGT")])
    expected = """2 15

A_short_name ACGTTGATAA CCAGG
Another_really_long_sequence_n TGCAGAGTAC GACGT"""
    assert_equal(interleaved_phy(msa), expected)
+
+
def test_sequentual_phy__different_length_names_2():
    # NOTE(review): despite the "sequentual_phy" (sic) prefix, this test
    # exercises interleaved_phy; the name is kept to preserve test IDs.
    # Fixed: removed a stray Python 2 debug 'print' statement that only
    # polluted the test output and added nothing to the assertion.
    msa = MSA([FASTA("Burchelli_4", None, "ACGTTGATAACCAGG"),
               FASTA("Donkey", None, "TGCAGAGTACGACGT")])
    expected = """2 15

Burchelli_4 ACGTTGATAA CCAGG
Donkey TGCAGAGTAC GACGT"""
    assert_equal(interleaved_phy(msa), expected)
+
+
def test_interleaved_phy__different_lengths():
    # flexmock verifies at teardown that MSA.validate() was invoked.
    _mock = flexmock(MSA).should_receive('validate').at_least.once
    interleaved_phy(_MSA_MEDIUM_NAMES)
diff --git a/tests/common_tests/makefile_test.py b/tests/common_tests/makefile_test.py
new file mode 100644
index 0000000..58a099a
--- /dev/null
+++ b/tests/common_tests/makefile_test.py
@@ -0,0 +1,1709 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2013 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import types
+
+from nose.tools import \
+ assert_is, \
+ assert_equal, \
+ assert_raises, \
+ assert_raises_regexp
+
+from paleomix.common.makefile import \
+ DEFAULT_NOT_SET, \
+ REQUIRED_VALUE, \
+ MakefileError, \
+ MakefileSpec, \
+ read_makefile, \
+ process_makefile, \
+ WithoutDefaults, \
+ IsInt, \
+ IsUnsignedInt, \
+ IsFloat, \
+ IsBoolean, \
+ IsStr, \
+ IsNone, \
+ ValueLT, \
+ ValueLE, \
+ ValueGE, \
+ ValueGT, \
+ ValueIn, \
+ ValuesIntersect, \
+ ValuesSubsetOf, \
+ ValueMissing, \
+ And, \
+ Or, \
+ Xor, \
+ Not, \
+ StringIn, \
+ StringsIntersect, \
+ StringsSubsetOf, \
+ StringIsUppercase, \
+ StringStartsWith, \
+ StringEndsWith, \
+ IsListOf, \
+ IsDictOf, \
+ PreProcessMakefile
+
+
# Dummy value for the path parameters
# (specs are called as spec(path, value); presumably the path is only
# used for error reporting -- TODO confirm against makefile.py)
_DUMMY_PATH = ("a", "random", "path")
_DUMMY_PATH_STR = ":".join(_DUMMY_PATH)
+
+
+###############################################################################
+###############################################################################
+# Setup timestamps for test files
+
def test_dir():
    """Return the root of the tests directory (the parent of this module's
    directory)."""
    this_dir = os.path.dirname(__file__)
    return os.path.dirname(this_dir)


def test_file(*args):
    """Return the path of a file located in the tests' 'data' directory."""
    return os.path.join(test_dir(), "data", *args)
+
+
def setup_module():
    """Nose module-level setup: pin the atime/mtime of test data files so
    that timestamp-dependent checks are reproducible across checkouts."""
    timestamps = {test_file("simple.yaml"): 1120719000}

    # Python 2 dict iteration; os.utime takes (atime, mtime).
    for filename, timestamp in timestamps.iteritems():
        # Set atime and mtime
        os.utime(filename, (timestamp, timestamp))
+
+
+###############################################################################
+###############################################################################
+# MakefileSpec
+
def test_makefilespec__description_is_set():
    # The description passed to the constructor is exposed unchanged.
    desc = "a random description"
    spec = MakefileSpec(description=desc)
    assert_equal(spec.description, desc)


def test_makefilespec__meets_spec_must_be_implemented():
    # Calling the abstract base class directly must raise.
    spec = MakefileSpec(description="some description")
    assert_raises(NotImplementedError, spec, _DUMMY_PATH, 1)
+
+
+###############################################################################
+###############################################################################
+# IsInt
+
def test_is_int__accepts_integers():
    spec = IsInt()
    spec(_DUMMY_PATH, 1234)
    spec(_DUMMY_PATH, 0)
    spec(_DUMMY_PATH, -1234)


def test_is_int__accepts_longs():
    # Python 2 'long' values are accepted alongside plain ints.
    spec = IsInt()
    spec(_DUMMY_PATH, 1234L)
    spec(_DUMMY_PATH, 0L)
    spec(_DUMMY_PATH, -1234L)


def test_is_int__rejects_not_int():
    # Note that False is rejected even though bool subclasses int.
    def _reject_not_str(value):
        spec = IsInt()
        assert_raises(MakefileError, spec, _DUMMY_PATH, value)

    yield _reject_not_str, None
    yield _reject_not_str, False
    yield _reject_not_str, ()
    yield _reject_not_str, {}


def test_is_int__default_description():
    spec = IsInt()
    assert_equal(spec.description, "an integer")


def test_is_int__custom_description():
    custom_desc = "any old integer"
    spec = IsInt(description=custom_desc)
    assert_equal(spec.description, custom_desc)


def test_is_int__default_not_set():
    # Without an explicit default, the DEFAULT_NOT_SET sentinel is used.
    spec = IsInt()
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_int__default_set__valid_value():
    spec = IsInt(default=7913)
    assert_equal(spec.default, 7913)


def test_is_int__default_set__must_meet_spec():
    # Defaults are validated against the spec at construction time.
    assert_raises(ValueError, IsInt, default="abc")
+
+
+###############################################################################
+###############################################################################
+# IsUnsignedInt
+
def test_is_unsigned_int__accepts_non_negative_integers():
    spec = IsUnsignedInt()
    spec(_DUMMY_PATH, 1234)
    spec(_DUMMY_PATH, 0)


def test_is_unsigned_int__accepts_longs():
    # Python 2 'long' values are accepted alongside plain ints.
    spec = IsUnsignedInt()
    spec(_DUMMY_PATH, 1234L)
    spec(_DUMMY_PATH, 0L)


def test_is_unsigned_int__rejects_not_unsigned_int():
    # Negative ints/longs, non-ints, and bools are all rejected.
    def _reject_not_str(value):
        spec = IsUnsignedInt()
        assert_raises(MakefileError, spec, _DUMMY_PATH, value)

    yield _reject_not_str, -1
    yield _reject_not_str, -1L
    yield _reject_not_str, None
    yield _reject_not_str, False
    yield _reject_not_str, ()
    yield _reject_not_str, {}


def test_is_unsigned_int__default_description():
    spec = IsUnsignedInt()
    assert_equal(spec.description, "an unsigned integer")


def test_is_unsigned_int__custom_description():
    custom_desc = "any old unsigned integer"
    spec = IsUnsignedInt(description=custom_desc)
    assert_equal(spec.description, custom_desc)


def test_is_unsigned_int__default_not_set():
    spec = IsUnsignedInt()
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_unsigned_int__default_set__valid_value():
    spec = IsUnsignedInt(default=7913)
    assert_equal(spec.default, 7913)


def test_is_unsigned_int__default_set__must_meet_spec():
    assert_raises(ValueError, IsUnsignedInt, default=-3)
+
+
+###############################################################################
+###############################################################################
+# IsFloat
+
def test_is_float__accepts_float():
    spec = IsFloat()
    spec(_DUMMY_PATH, 1.0)


def test_is_float__rejects_not_float():
    # Integers are rejected, not coerced (note the 0 case).
    def _reject_not_str(value):
        spec = IsFloat()
        assert_raises(MakefileError, spec, _DUMMY_PATH, value)

    yield _reject_not_str, 0
    yield _reject_not_str, None
    yield _reject_not_str, False
    yield _reject_not_str, ()
    yield _reject_not_str, {}


def test_is_float__default_description():
    spec = IsFloat()
    assert_equal(spec.description, "a float")


def test_is_float__custom_description():
    custom_desc = "a floaty, float"
    spec = IsFloat(description=custom_desc)
    assert_equal(spec.description, custom_desc)


def test_is_float__default_not_set():
    spec = IsFloat()
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_float__default_set__valid_value():
    spec = IsFloat(default=3.14)
    assert_equal(spec.default, 3.14)


def test_is_float__default_set__must_meet_spec():
    assert_raises(ValueError, IsFloat, default="abc")
+
+
+###############################################################################
+###############################################################################
+# IsBoolean
+
def test_is_boolean__accepts_boolean():
    spec = IsBoolean()
    spec(_DUMMY_PATH, False)


def test_is_boolean__rejects_not_boolean():
    # 0 is rejected: truthiness is not enough, an actual bool is required.
    def _reject_not_str(value):
        spec = IsBoolean()
        assert_raises(MakefileError, spec, _DUMMY_PATH, value)

    yield _reject_not_str, None
    yield _reject_not_str, 0
    yield _reject_not_str, ()
    yield _reject_not_str, {}


def test_is_boolean__default_description():
    spec = IsBoolean()
    assert_equal(spec.description, "a boolean")


def test_is_boolean__custom_description():
    custom_desc = "True or False"
    spec = IsBoolean(description=custom_desc)
    assert_equal(spec.description, custom_desc)


def test_is_boolean__default_not_set():
    spec = IsBoolean()
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_boolean__default_set__valid_value():
    spec = IsBoolean(default=True)
    assert_equal(spec.default, True)


def test_is_boolean__default_set__must_meet_spec():
    assert_raises(ValueError, IsBoolean, default="abc")
+
+
+###############################################################################
+###############################################################################
+# IsStr
+
def test_is_str__accepts_standard_str():
    spec = IsStr()
    spec(_DUMMY_PATH, "abc")


def test_is_str__accepts_unicode_str():
    # Both Python 2 'str' and 'unicode' strings are accepted.
    spec = IsStr()
    spec(_DUMMY_PATH, u"def")


def test_is_str__rejects_empty_str():
    # The empty string does not satisfy IsStr (see default description).
    spec = IsStr()
    assert_raises(MakefileError, spec, _DUMMY_PATH, "")


def test_is_str__rejects_not_str():
    def _reject_not_str(value):
        spec = IsStr()
        assert_raises(MakefileError, spec, _DUMMY_PATH, value)

    yield _reject_not_str, None
    yield _reject_not_str, 1
    yield _reject_not_str, ()
    yield _reject_not_str, {}


def test_is_str__default_description():
    spec = IsStr()
    assert_equal(spec.description, "a non-empty string")


def test_is_str__custom_description():
    custom_desc = "a ball of string"
    spec = IsStr(description=custom_desc)
    assert_equal(spec.description, custom_desc)


def test_is_str__default_not_set():
    spec = IsStr()
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_str__default_set__valid_value():
    spec = IsStr(default="abc")
    assert_equal(spec.default, "abc")


def test_is_str__default_set__must_meet_spec():
    assert_raises(ValueError, IsStr, default=17)
+
+
+###############################################################################
+###############################################################################
+# IsNone
+
def test_is_none__accepts_none():
    spec = IsNone()
    spec(_DUMMY_PATH, None)


def test_is_none__rejects_not_none():
    # Falsy values other than None are rejected.
    def _reject_not_none(value):
        spec = IsNone()
        assert_raises(MakefileError, spec, _DUMMY_PATH, value)

    yield _reject_not_none, ""
    yield _reject_not_none, 0
    yield _reject_not_none, ()
    yield _reject_not_none, {}


def test_is_none__default_description():
    spec = IsNone()
    assert_equal(spec.description, "None or not set")


def test_is_none__custom_description():
    custom_desc = "NOTHING!"
    spec = IsNone(description=custom_desc)
    assert_equal(spec.description, custom_desc)


def test_is_none__default_not_set():
    spec = IsNone()
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_none__default_not_implemented_for_is_none():
    # Presumably a default of None would be indistinguishable from "no
    # default set", hence NotImplementedError -- see makefile.py.
    assert_raises(NotImplementedError, IsNone, default=None)
+
+
+###############################################################################
+###############################################################################
+# ValueLT
+
def test_value_lt__accepts_value_lt():
    spec = ValueLT(7)
    spec(_DUMMY_PATH, 6)


def test_value_lt__rejects_value_eq():
    # The comparison is strictly less-than; equality fails.
    spec = ValueLT(7)
    assert_raises(MakefileError, spec, _DUMMY_PATH, 7)


def test_value_lt__rejects_value_gt():
    spec = ValueLT(7)
    assert_raises(MakefileError, spec, _DUMMY_PATH, 8)


def test_value_lt__accepts_value_lt__with_key():
    # 'key' is applied to the value before comparison (here: len).
    spec = ValueLT(7, key=len)
    spec(_DUMMY_PATH, "abcdef")


def test_value_lt__rejects_value_eq__with_key():
    spec = ValueLT(7, key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcdefg")


def test_value_lt__rejects_value_gt__with_key():
    spec = ValueLT(7, key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcdefgh")


def test_value_lt__default_description():
    spec = ValueLT('Foo')
    assert_equal(spec.description, "value < 'Foo'")


def test_value_lt__custom_description():
    # Custom descriptions may embed the bound via the {rvalue} placeholder.
    spec = ValueLT('Bar', description='anything less than {rvalue}')
    assert_equal(spec.description, "anything less than 'Bar'")


def test_is_value_lt__default_not_set():
    spec = ValueLT(10)
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_value_lt__default_set__valid_value():
    spec = ValueLT(10, default=9)
    assert_equal(spec.default, 9)


def test_is_value_lt__default_set__must_meet_spec():
    assert_raises(ValueError, ValueLT, 10, default=17)
+
+
+###############################################################################
+###############################################################################
+# ValueLE
+
def test_value_le__accepts_value_lt():
    spec = ValueLE(7)
    spec(_DUMMY_PATH, 6)


def test_value_le__accepts_value_eq():
    # Unlike ValueLT, equality is accepted.
    spec = ValueLE(7)
    spec(_DUMMY_PATH, 7)


def test_value_le__rejects_value_gt():
    spec = ValueLE(7)
    assert_raises(MakefileError, spec, _DUMMY_PATH, 8)


def test_value_le__accepts_value_lt__with_key():
    # 'key' is applied to the value before comparison (here: len).
    spec = ValueLE(7, key=len)
    spec(_DUMMY_PATH, "abcdef")


def test_value_le__accepts_value_eq__with_key():
    spec = ValueLE(7, key=len)
    spec(_DUMMY_PATH, "abcdefg")


def test_value_le__rejects_value_gt__with_key():
    spec = ValueLE(7, key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcdefgh")


def test_value_le__default_description():
    spec = ValueLE('Foo')
    assert_equal(spec.description, "value <= 'Foo'")


def test_value_le__custom_description():
    spec = ValueLE('Bar', description='no more than {rvalue}')
    assert_equal(spec.description, "no more than 'Bar'")


def test_is_value_le__default_not_set():
    spec = ValueLE(10)
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_value_le__default_set__valid_value():
    spec = ValueLE(10, default=10)
    assert_equal(spec.default, 10)


def test_is_value_le__default_set__must_meet_spec():
    assert_raises(ValueError, ValueLE, 10, default=17)
+
+
+###############################################################################
+###############################################################################
+# ValueGE
+
def test_value_ge__rejects_value_lt():
    spec = ValueGE(7)
    assert_raises(MakefileError, spec, _DUMMY_PATH, 6)


def test_value_ge__accepts_value_eq():
    spec = ValueGE(7)
    spec(_DUMMY_PATH, 7)


def test_value_ge__accepts_value_gt():
    spec = ValueGE(7)
    spec(_DUMMY_PATH, 8)


def test_value_ge__accepts_value_lt__with_key():
    # NOTE(review): the name says "accepts", but the assertion correctly
    # checks rejection of a value below the bound; the name is misleading.
    spec = ValueGE(7, key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcdef")


def test_value_ge__accepts_value_eq__with_key():
    spec = ValueGE(7, key=len)
    spec(_DUMMY_PATH, "abcdefg")


def test_value_ge__accepts_value_gt__with_key():
    spec = ValueGE(7, key=len)
    spec(_DUMMY_PATH, "abcdefgh")


def test_value_ge__default_description():
    spec = ValueGE('Foo')
    assert_equal(spec.description, "value >= 'Foo'")


def test_value_ge__custom_description():
    spec = ValueGE('Bar', description='no less than {rvalue}')
    assert_equal(spec.description, "no less than 'Bar'")


def test_is_value_ge__default_not_set():
    spec = ValueGE(10)
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_value_ge__default_set__valid_value():
    spec = ValueGE(10, default=10)
    assert_equal(spec.default, 10)


def test_is_value_ge__default_set__must_meet_spec():
    assert_raises(ValueError, ValueGE, 10, default=7)
+
+
+###############################################################################
+###############################################################################
+# ValueGT
+
def test_value_gt__rejects_value_lt():
    spec = ValueGT(7)
    assert_raises(MakefileError, spec, _DUMMY_PATH, 6)


def test_value_gt__rejects_value_eq():
    # The comparison is strictly greater-than; equality fails.
    spec = ValueGT(7)
    assert_raises(MakefileError, spec, _DUMMY_PATH, 7)


def test_value_gt__accepts_value_gt():
    spec = ValueGT(7)
    spec(_DUMMY_PATH, 8)


def test_value_gt__accepts_value_lt__with_key():
    # NOTE(review): the names of this test and the next say "accepts",
    # but both correctly assert rejection; the names are misleading.
    spec = ValueGT(7, key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcdef")


def test_value_gt__accepts_value_eq__with_key():
    spec = ValueGT(7, key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcdefg")


def test_value_gt__accepts_value_gt__with_key():
    spec = ValueGT(7, key=len)
    spec(_DUMMY_PATH, "abcdefgh")


def test_value_gt__default_description():
    spec = ValueGT('Foo')
    assert_equal(spec.description, "value > 'Foo'")


def test_value_gt__custom_description():
    spec = ValueGT('Bar', description='more than {rvalue}')
    assert_equal(spec.description, "more than 'Bar'")


def test_is_value_gt__default_not_set():
    spec = ValueGT(10)
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_value_gt__default_set__valid_value():
    spec = ValueGT(10, default=11)
    assert_equal(spec.default, 11)


def test_is_value_gt__default_set__must_meet_spec():
    assert_raises(ValueError, ValueGT, 10, default=10)
+
+
+###############################################################################
+###############################################################################
+# ValueIn
+
def test_value_in__single_value_in_set():
    # range() returns a list in Python 2; ValueIn accepts any container.
    spec = ValueIn(range(5))
    spec(_DUMMY_PATH, 1)


def test_value_in__single_value_not_in_set():
    spec = ValueIn(range(5))
    assert_raises(MakefileError, spec, _DUMMY_PATH, 5)


def test_value_in__single_value_in_set__with_key():
    # 'key' is applied to the value before the membership test.
    spec = ValueIn(range(5), key=len)
    spec(_DUMMY_PATH, "a")


def test_value_in__single_value_not_in_set__with_key():
    spec = ValueIn(range(5), key=len)
    assert_raises(MakefileError, spec, _DUMMY_PATH, "abcde")


def test_value_in__case_sensitive__value_in_set():
    spec = ValueIn(("Abc", "bCe", "cdE"))
    spec(_DUMMY_PATH, "bCe")


def test_value_in__case_sensitive__value_in_not_set():
    # Membership is case sensitive for strings.
    spec = ValueIn(("Abc", "bCe", "cdE"))
    assert_raises(MakefileError, spec, _DUMMY_PATH, "Bce")


def test_value_in__default_description():
    # Allowed values are rendered as an "or"-joined list.
    spec = ValueIn(("Abc", "bCe", "cdE"))
    assert_equal(spec.description, "value in 'Abc', 'bCe', or 'cdE'")


def test_value_in__custom_description():
    spec = ValueIn(("Abc", "bCe", "cdE"), description="One of {rvalue}")
    assert_equal(spec.description, "One of 'Abc', 'bCe', or 'cdE'")


def test_is_value_in__default_not_set():
    spec = ValueIn(range(5))
    assert_is(spec.default, DEFAULT_NOT_SET)


def test_is_value_in__default_set__valid_value():
    spec = ValueIn(range(5), default=4)
    assert_equal(spec.default, 4)
+
+
def test_is_value_in__default_set__must_meet_spec():
    """A default not contained in the allowed set must raise ValueError.

    Fixed a copy/paste error: the original constructed ValueGT instead of
    ValueIn, so the ValueIn default-validation path was never exercised.
    """
    assert_raises(ValueError, ValueIn, range(5), default=5)
+
+
+###############################################################################
+###############################################################################
+# ValuesIntersects
+
+def test_intersects__single_value_in_set():
+ spec = ValuesIntersect(range(5))
+ spec(_DUMMY_PATH, [1])
+
+
+def test_intersects__multiple_values_in_set():
+ spec = ValuesIntersect(range(5))
+ spec(_DUMMY_PATH, [1, 4])
+
+
+def test_intersects__single_value_not_in_set():
+ spec = ValuesIntersect(range(5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, [5])
+
+
+def test_intersects__some_values_in_set():
+ spec = ValuesIntersect(range(5))
+ spec(_DUMMY_PATH, [4, 5])
+
+
+def test_intersects__empty_set():
+ spec = ValuesIntersect(range(5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, [])
+
+
+def test_intersects__case_sensitive__value_in_set():
+ spec = ValuesIntersect(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, ["bCe"])
+
+
+def test_intersects__case_sensitive__value_in_not_set():
+ spec = ValuesIntersect(("Abc", "bCe", "cdE"))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, ["Bce"])
+
+
+def test_intersects__chars__case_sensitive():
+ spec = ValuesIntersect("abcdefghijkl")
+ spec(_DUMMY_PATH, "a big deal")
+
+
+def test_intersects__chars__case_sensitive__rejects_differences_in_case():
+ spec = ValuesIntersect("abcdefghijkl")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, "A BIG DEAL")
+
+
+def test_intersects__rejects_dictionary():
+ spec = ValuesIntersect("abc")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {"a": 1, "d": 2})
+
+
+def test_intersects__default_not_set():
+ spec = ValuesIntersect(range(5))
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_intersects__default_set__valid_value():
+ spec = ValuesIntersect(range(5), default=[3, 4])
+ assert_equal(spec.default, [3, 4])
+
+
+def test_intersects__default_set__must_meet_spec():
+ assert_raises(ValueError, ValuesIntersect, range(5), default=[5])
+
+
+###############################################################################
+###############################################################################
+# ValueSubsetOf
+
+def test_subset_of__single_value_in_set():
+ spec = ValuesSubsetOf(range(5))
+ spec(_DUMMY_PATH, [1])
+
+
+def test_subset_of__multiple_values_in_set():
+ spec = ValuesSubsetOf(range(5))
+ spec(_DUMMY_PATH, [1, 4])
+
+
+def test_subset_of__single_value_not_in_set():
+ spec = ValuesSubsetOf(range(5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, [5])
+
+
+def test_subset_of__multiple_values_not_in_set():
+ spec = ValuesSubsetOf(range(5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, [4, 5])
+
+
+def test_subset_of__empty_set():
+ spec = ValuesSubsetOf(range(5))
+ spec(_DUMMY_PATH, [])
+
+
+def test_subset_of__case_sensitive__value_in_set():
+ spec = ValuesSubsetOf(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, ["bCe"])
+
+
+def test_subset_of__case_sensitive__value_in_not_set():
+ spec = ValuesSubsetOf(("Abc", "bCe", "cdE"))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, ["Bce"])
+
+
+def test_subset_of__chars__case_sensitive():
+ spec = ValuesSubsetOf("abcdefghijkl ")
+ spec(_DUMMY_PATH, "a big deal")
+
+
+def test_subset_of__chars__case_sensitive__rejects_differences_in_case():
+ spec = ValuesSubsetOf("abcdefghijkl ")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, "A big DEAL")
+
+
+def test_subset_of__rejects_dictionary():
+ spec = ValuesIntersect("abc")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {"a": 1, "b": 2})
+
+
+def test_subset_of__default_not_set():
+ spec = ValuesSubsetOf(range(5))
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_subset_of__default_set__valid_value():
+ spec = ValuesSubsetOf(range(5), default=[3, 4])
+ assert_equal(spec.default, [3, 4])
+
+
+def test_subset_of__default_set__must_meet_spec():
+ assert_raises(ValueError, ValuesSubsetOf, range(5), default=[4, 5])
+
+
+###############################################################################
+###############################################################################
+# And
+
+def test_and__accepts_when_all_true():
+ spec = And(IsFloat, ValueLT(1.5))
+ spec(_DUMMY_PATH, 0.0)
+
+
+def test_and__rejects_when_first_is_false():
+ spec = And(IsFloat, ValueLT(1.5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 0)
+
+
+def test_and__rejects_when_second_is_false():
+ spec = And(IsFloat, ValueLT(1.5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 2.0)
+
+
+def test_and__rejects_when_both_is_false():
+ spec = And(IsFloat, ValueLT(1.5))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 2)
+
+
+def test_and__rejects_no_tests():
+ assert_raises(ValueError, And)
+
+
+def test_and__rejects_non_spec_tests():
+ assert_raises(TypeError, And, id)
+
+
+def test_and__default_not_set():
+ spec = And(IsInt, ValueGT(10))
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_and__default_set__valid_value():
+ spec = And(IsInt, ValueGT(10), default=20)
+ assert_equal(spec.default, 20)
+
+
+def test_and__default_set__must_meet_spec():
+ assert_raises(ValueError, And, IsInt, ValueGT(10), default=5)
+
+
+def test_and__defaults_not_set_in_specs():
+ assert_raises(ValueError, And, IsInt(default=10), ValueGT(10))
+
+
+###############################################################################
+###############################################################################
+# Or
+
+def test_or__accepts_first_test():
+ spec = Or(IsStr, IsBoolean)
+ spec(_DUMMY_PATH, "Foo")
+
+
+def test_or__accepts_second_test():
+ spec = Or(IsStr, IsBoolean)
+ spec(_DUMMY_PATH, False)
+
+
+def test_or__rejects_if_both_specs_fail():
+ spec = Or(IsStr, IsBoolean)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 1)
+
+
+def test_or__rejects_no_tests():
+ assert_raises(ValueError, Or)
+
+
+def test_or__rejects_non_spec_tests():
+ assert_raises(TypeError, Or, id)
+
+
+def test_or__default_not_set():
+ spec = Or(IsInt, ValueGT(10))
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_or__default_set__valid_value():
+ spec = Or(IsInt, ValueGT(10), default=17)
+ assert_equal(spec.default, 17)
+
+
+def test_or__default_set__must_meet_spec():
+ assert_raises(ValueError, Or, IsInt, ValueGT(10), default=5.5)
+
+
+def test_or__defaults_not_set_in_specs():
+ assert_raises(ValueError, Or, IsInt(default=10), ValueGT(10))
+
+
+###############################################################################
+###############################################################################
+# Xor
+
+def test_xor__rejects_when_all_true():
+ spec = Xor(IsFloat, ValueLT(1))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 0.0)
+
+
+def test_xor__accepts_when_first_is_false():
+ spec = Xor(IsFloat, ValueLT(1))
+ spec(_DUMMY_PATH, 0)
+
+
+def test_xor__accepts_when_second_is_false():
+ spec = Xor(IsFloat, ValueLT(1.0))
+ spec(_DUMMY_PATH, 2.0)
+
+
+def test_xor__rejects_when_both_is_false():
+ spec = Xor(IsFloat, ValueLT(1.0))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 2)
+
+
+def test_xor__rejects_no_tests():
+ assert_raises(ValueError, Xor)
+
+
+def test_xor__rejects_one_test():
+ assert_raises(ValueError, Xor, IsInt)
+
+
+def test_xor__rejects_three_tests():
+ assert_raises(ValueError, Xor, IsInt, IsFloat, IsStr)
+
+
+def test_xor__rejects_non_spec_tests():
+ assert_raises(TypeError, Xor, id, id)
+
+
+def test_xor__default_not_set():
+ spec = Xor(IsInt, ValueGT(10))
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_xor__default_set__valid_value():
+ spec = Xor(IsInt, ValueGT(10), default=5)
+ assert_equal(spec.default, 5)
+
+
+def test_xor__default_set__must_meet_spec():
+ assert_raises(ValueError, Xor, IsInt, ValueGT(10), default=17)
+
+
+def test_xor__defaults_not_set_in_specs():
+ assert_raises(ValueError, Xor, IsInt(default=10), ValueGT(10))
+
+
+###############################################################################
+###############################################################################
+# Not
+
+def test_not__accepts_when_test_is_false():
+ spec = Not(IsInt)
+ spec(_DUMMY_PATH, True)
+
+
+def test_not__rejects_when_test_is_true():
+ spec = Not(IsInt)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, 1)
+
+
+def test_not__defaults_not_set_in_specs():
+ assert_raises(ValueError, Not, IsInt(default=10))
+
+
+###############################################################################
+###############################################################################
+# StringIn
+
+def test_string_in__case_sensitive__value_in_set():
+ spec = StringIn(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, "bCe")
+
+
+def test_string_in__case_insensitive__value_in_set():
+ spec = StringIn(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, "Bce")
+
+
+def test_string_in__case_insensitive__value_not_set():
+ spec = StringIn(("Abc", "bCe", "cdE"))
+ assert_raises(MakefileError, spec, _DUMMY_PATH, "ABce")
+
+
+def test_string_in__case_insensitive__mixed_string__non_string_found():
+ spec = StringIn(("A", "c", "B", 1, 2, 3))
+ spec(_DUMMY_PATH, 1)
+
+
+def test_string_in__case_insensitive__mixed_string__string_found():
+ spec = StringIn(("A", "c", "B", 1, 2, 3))
+ spec(_DUMMY_PATH, "a")
+
+
+def test_string_in__default_not_set():
+ spec = StringIn("ABCDEFGH")
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_string_in__default_set__valid_value():
+ spec = StringIn("ABCDEFGH", default="e")
+ assert_equal(spec.default, "e")
+
+
+def test_string_in__default_set__must_meet_spec():
+ assert_raises(ValueError, StringIn, "ABCDEFGH", default="i")
+
+
+###############################################################################
+###############################################################################
+# StringsIntersect
+
+def test_strings_intersect__case_insensitive__value_in_set():
+ spec = StringsIntersect(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, ["Bce"])
+
+
+def test_strings_intersect__chars__case_insensitive__accepts_differt_in_case():
+ spec = StringsIntersect("abcdefghijkl")
+ spec(_DUMMY_PATH, "A BIG DEAL")
+
+
+def test_strings_intersect__case_insensitive__mixed_string__non_string_found():
+ spec = StringsIntersect(("A", "c", "B", 1, 2, 3))
+ spec(_DUMMY_PATH, [1])
+
+
+def test_strings_intersect__case_insensitive__mixed_string_string_found():
+ spec = StringsIntersect(("A", "c", "B", 1, 2, 3))
+ spec(_DUMMY_PATH, "a")
+
+
+def test_strings_intersect__rejects_dictionary():
+ spec = StringsIntersect("abc")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {"a": 1, "b": 2})
+
+
+def test_strings_intersect__default_not_set():
+ spec = StringsIntersect("ABCDEFGH")
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_strings_intersect__default_set__valid_value():
+ spec = StringsIntersect("ABCDEFGH", default="eabi")
+ assert_equal(spec.default, "eabi")
+
+
+def test_strings_intersect__default_set__must_meet_spec():
+ assert_raises(ValueError, StringsIntersect, "ABCDEFGH", default=[1, 2, 3])
+
+
+###############################################################################
+###############################################################################
+# StringsSubsetOf
+
+def test_subset_of__case_insensitive__value_in_set():
+ spec = StringsSubsetOf(("Abc", "bCe", "cdE"))
+ spec(_DUMMY_PATH, ["Bce"])
+
+
+def test_subset_of__chars__case_insensitive__accepts_differences_in_case():
+ spec = StringsSubsetOf("abcdefghijkl ")
+ spec(_DUMMY_PATH, "A big DEAL")
+
+
+def test_subset_of__case_insensitive__mixed_string__non_string_found():
+ spec = StringsSubsetOf(("A", "c", "B", 1, 2, 3))
+ spec(_DUMMY_PATH, [1])
+
+
+def test_subset_of__case_insensitive__mixed_string_string_found():
+ spec = StringsSubsetOf(("A", "c", "B", 1, 2, 3))
+ spec(_DUMMY_PATH, "a")
+
+
+def test_strings_subset_of__rejects_dictionary():
+ spec = StringsSubsetOf("abc")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {"a": 1, "b": 2})
+
+
+def test_strings_subset_of__default_not_set():
+ spec = StringsSubsetOf("ABCDEFGH")
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_strings_subset_of__default_set__valid_value():
+ spec = StringsSubsetOf("ABCDEFGH", default="adFg")
+ assert_equal(spec.default, "adFg")
+
+
+def test_string_subset_of__default_set__must_meet_spec():
+ assert_raises(ValueError, StringsSubsetOf, "ABCDEFGH", default=[1, 2, 3])
+
+
+###############################################################################
+###############################################################################
+# StringIsUppercase
+
+def test_string_is_uppercase__accepts_standard_str():
+ spec = StringIsUppercase()
+ spec(_DUMMY_PATH, "ABC")
+
+
+def test_string_is_uppercase__accepts_unicode_str():
+ spec = StringIsUppercase()
+ spec(_DUMMY_PATH, u"DEF")
+
+
+def test_string_is_uppercase__rejects_empty_string():
+ spec = StringIsUppercase()
+ assert_raises(MakefileError, spec, _DUMMY_PATH, "")
+
+
+def test_string_is_uppercase__rejects_not_uppercase_str():
+ def _reject_not_uppercase_str(value):
+ spec = StringIsUppercase()
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _reject_not_uppercase_str, "AcEf"
+ yield _reject_not_uppercase_str, None
+ yield _reject_not_uppercase_str, 1
+ yield _reject_not_uppercase_str, ()
+ yield _reject_not_uppercase_str, {}
+
+
+def test_string_is_uppercase__default_not_set():
+ spec = StringIsUppercase()
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_string_is_uppercase__default_set__valid_value():
+ spec = StringIsUppercase(default="FOO")
+ assert_equal(spec.default, "FOO")
+
+
+def test_string_is_uppercase__default_set__must_meet_spec():
+ assert_raises(ValueError, StringIsUppercase, default="foo")
+
+
+###############################################################################
+###############################################################################
+# StringStartsWith
+
+def test_string_starts_with__accepts_standard_str():
+ spec = StringStartsWith("A_")
+ spec(_DUMMY_PATH, "A_BC")
+
+
+def test_string_starts_with__accepts_unicode_str():
+ spec = StringStartsWith("A_")
+ spec(_DUMMY_PATH, u"A_DEF")
+
+
+def test_string_starts_with__rejects_string_without_prefix():
+ spec = StringStartsWith("A_")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, "B_GHI")
+
+
+def test_string_starts_with__rejects_not_uppercase_str():
+ spec = StringStartsWith("Foo")
+
+ def _reject_not_str_with_prefix(value):
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _reject_not_str_with_prefix, None
+ yield _reject_not_str_with_prefix, 1
+ yield _reject_not_str_with_prefix, ()
+ yield _reject_not_str_with_prefix, {}
+
+
+def test_string_starts_with__default_not_set():
+ spec = StringStartsWith("Foo")
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_string_starts_with__default_set__valid_value():
+ spec = StringStartsWith("Foo", default="FooBar")
+ assert_equal(spec.default, "FooBar")
+
+
+def test_string_starts_with__default_set__must_meet_spec():
+ assert_raises(ValueError, StringStartsWith, "FooBar", default="BarFoo")
+
+
+###############################################################################
+###############################################################################
+# StringEndsWith
+
+def test_string_ends_with__accepts_standard_str():
+ spec = StringEndsWith("_A")
+ spec(_DUMMY_PATH, "BC_A")
+
+
+def test_string_ends_with__accepts_unicode_str():
+ spec = StringEndsWith("_A")
+ spec(_DUMMY_PATH, u"DEF_A")
+
+
+def test_string_ends_with__rejects_string_without_prefix():
+ spec = StringEndsWith("_A")
+ assert_raises(MakefileError, spec, _DUMMY_PATH, "GHI_B")
+
+
+def test_string_ends_with__rejects_not_uppercase_str():
+ spec = StringEndsWith("Foo")
+
+ def _reject_not_str_with_postfix(value):
+ assert_raises(MakefileError, spec, _DUMMY_PATH, value)
+
+ yield _reject_not_str_with_postfix, None
+ yield _reject_not_str_with_postfix, 1
+ yield _reject_not_str_with_postfix, ()
+ yield _reject_not_str_with_postfix, {}
+
+
+def test_string_ends_with__default_not_set():
+ spec = StringEndsWith("Foo")
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_string_ends_with__default_set__valid_value():
+ spec = StringEndsWith("Bar", default="FooBar")
+ assert_equal(spec.default, "FooBar")
+
+
+def test_string_ends_with__default_set__must_meet_spec():
+ assert_raises(ValueError, StringEndsWith, "FooBar", default="BarFoo")
+
+
+###############################################################################
+###############################################################################
+# IsListOf
+
+def test_is_list_of__empty_list_always_ok():
+ spec = IsListOf(IsInt)
+ spec(_DUMMY_PATH, [])
+
+
+def test_is_list_of__list_of_ints_accepted():
+ spec = IsListOf(IsInt)
+ spec(_DUMMY_PATH, [1, 2, 3])
+
+
+def test_is_list_of__list_of_non_ints_rejected():
+ spec = IsListOf(IsInt)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, ['a', 'b', 'c'])
+
+
+def test_is_list_of__mixed_list_rejected():
+ spec = IsListOf(IsInt)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, [1, 'b', 3])
+
+
+def test_is_list_of__default_description():
+ spec = IsListOf(IsInt, IsFloat)
+ assert_equal(spec.description, "[(an integer) or (a float), ...]")
+
+
+def test_is_list_of__non_list_rejected():
+ spec = IsListOf(IsInt)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {1: 2})
+
+
+def test_is_list_of__default_not_set():
+ spec = IsListOf(IsInt)
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_is_list_of__default_set__valid_value():
+ spec = IsListOf(IsInt, default=range(5))
+ assert_equal(spec.default, range(5))
+
+
+def test_is_list_of__default_set__must_meet_spec():
+ assert_raises(ValueError, IsListOf, IsInt, default=17)
+
+
+def test_is_list_of__defaults_not_set_in_specs():
+ assert_raises(ValueError, IsListOf, IsInt(default=10))
+
+
+###############################################################################
+###############################################################################
+# IsDictOf
+
+def test_is_dict_of__empty_dict_always_ok():
+ spec = IsDictOf(IsInt, IsStr)
+ spec(_DUMMY_PATH, {})
+
+
+def test_is_dict_of__correct_key_and_value_accepted():
+ spec = IsDictOf(IsInt, IsStr)
+ spec(_DUMMY_PATH, {1: "foo"})
+
+
+def test_is_dict_of__wrong_key_and_correct_value_rejected():
+ spec = IsDictOf(IsInt, IsStr)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {1.5: "foo"})
+
+
+def test_is_dict_of__correct_key_and_wrong_value_rejected():
+ spec = IsDictOf(IsInt, IsStr)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {1: 1})
+
+
+def test_is_dict_of__mixed_rejected():
+ spec = IsDictOf(IsInt, IsStr)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, {1: 1, 2: "foo"})
+
+
+def test_is_dict_of__default_description():
+ spec = IsDictOf(IsInt, IsStr)
+ assert_equal(spec.description, "{(an integer) : (a non-empty string)}")
+
+
+def test_is_dict_of__rejects_non_dict():
+ spec = IsDictOf(IsInt, IsStr)
+ assert_raises(MakefileError, spec, _DUMMY_PATH, [])
+
+
+def test_is_dict_of__default_not_set():
+ spec = IsDictOf(IsInt, IsInt)
+ assert_is(spec.default, DEFAULT_NOT_SET)
+
+
+def test_is_dict_of__default_set__valid_value():
+ spec = IsDictOf(IsInt, IsInt, default={1: 2})
+ assert_equal(spec.default, {1: 2})
+
+
+def test_is_dict_of__default_set__must_meet_spec():
+ assert_raises(ValueError, IsDictOf, IsInt, IsInt, default={1: "b"})
+
+
+def test_is_dict_of__defaults_not_set_in_key_specs():
+ assert_raises(ValueError, IsDictOf, IsInt(default=10), IsInt)
+
+
+def test_is_dict_of__defaults_not_set_in_value_specs():
+ assert_raises(ValueError, IsDictOf, IsInt, IsInt(default=10))
+
+
+###############################################################################
+###############################################################################
+# Path is displayed in exception
+
+def test_specs__path_is_displayed_in_exception():
+ def _path_is_displayed_in_exception(spec, value):
+ assert_raises_regexp(MakefileError, _DUMMY_PATH_STR,
+ spec, _DUMMY_PATH, value)
+
+ yield _path_is_displayed_in_exception, IsInt(), "foo"
+ yield _path_is_displayed_in_exception, IsUnsignedInt(), -1
+ yield _path_is_displayed_in_exception, IsFloat(), "abc"
+ yield _path_is_displayed_in_exception, IsBoolean(), 1
+ yield _path_is_displayed_in_exception, IsStr(), 1
+ yield _path_is_displayed_in_exception, IsNone(), 1
+ yield _path_is_displayed_in_exception, ValueLT(0), 1
+ yield _path_is_displayed_in_exception, ValueLE(0), 1
+ yield _path_is_displayed_in_exception, ValueGE(0), -1
+ yield _path_is_displayed_in_exception, ValueGT(0), -1
+ yield _path_is_displayed_in_exception, ValueIn([1]), 2
+ yield _path_is_displayed_in_exception, ValuesIntersect([1]), [2]
+ yield _path_is_displayed_in_exception, ValuesSubsetOf([1]), [2]
+ yield _path_is_displayed_in_exception, ValueMissing(), True
+ yield _path_is_displayed_in_exception, And(IsStr), 1
+ yield _path_is_displayed_in_exception, Or(IsStr), 1
+ yield _path_is_displayed_in_exception, Xor(IsStr, IsInt), True
+ yield _path_is_displayed_in_exception, Not(IsInt), 1
+ yield _path_is_displayed_in_exception, StringIn("abc"), 1
+ yield _path_is_displayed_in_exception, StringsIntersect("abc"), [1]
+ yield _path_is_displayed_in_exception, StringsSubsetOf("abc"), [1]
+ yield _path_is_displayed_in_exception, StringIsUppercase(), 1
+ yield _path_is_displayed_in_exception, StringStartsWith("FOO"), 1
+ yield _path_is_displayed_in_exception, StringEndsWith("FOO"), 1
+ yield _path_is_displayed_in_exception, IsListOf(IsInt), "foo"
+ yield _path_is_displayed_in_exception, IsDictOf(IsInt, IsInt), 1
+
+
+###############################################################################
+###############################################################################
+# process_makefile
+
+def test_process_makefile__dict_keys_found():
+ def _dict_keys_found(current, specs):
+ process_makefile(current, specs)
+
+ # String keys
+ yield _dict_keys_found, {"B": 7}, {"A": IsInt, "B": IsInt}
+ # Spec keys
+ yield _dict_keys_found, {1: "Abc"}, {IsStr: IsInt, IsInt: IsStr}
+ # Spec keys, instantiated
+ yield _dict_keys_found, {1: "Abc"}, {IsStr(): IsInt, IsInt: IsStr()}
+ # Mixed keys, spec matches
+ yield _dict_keys_found, {3: 14}, {IsInt: IsInt, "A": IsInt}
+ # Mixed keys, key matches
+ yield _dict_keys_found, {"A": 23}, {IsInt: IsInt, "A": IsInt}
+
+
+def test_process_makefile__dict_keys_not_found():
+ def _dict_keys_missing(current, specs):
+ assert_raises(MakefileError, process_makefile, current, specs)
+
+ # String keys
+ yield _dict_keys_missing, {"C": 7}, {"A": IsInt, "B": IsInt}
+ # Spec keys
+ yield _dict_keys_missing, {1.3: "Abc"}, {IsStr: IsInt, IsInt: IsStr}
+ # Spec keys, instantiated
+ yield _dict_keys_missing, {1.3: "Abc"}, {IsStr(): IsInt, IsInt: IsStr()}
+ # Mixed keys, spec matches
+ yield _dict_keys_missing, {"C": 14}, {IsInt: IsInt, "A": IsInt}
+ yield _dict_keys_missing, {"A": 23}, {}
+
+
+def test_validate_makefile__unexpected_type_in_reference():
+ current = {1: 2}
+ specs = {IsInt: 2}
+ assert_raises(TypeError, process_makefile, current, specs)
+
+
+def test_validate_makefile__unexpected_type_in_current():
+ current = {1: []}
+ specs = {IsInt: {IsInt: IsInt}}
+ assert_raises(MakefileError, process_makefile, current, specs)
+
+
+def test_process_makefile__sets_missing_keys():
+ current = {"A": 1}
+ specs = {"A": IsInt(default=0),
+ "B": IsInt(default=-1),
+ "C": IsInt(default=-2)}
+ expected = {"A": 1, "B": -1, "C": -2}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__mixed_keys():
+ current = {"A": 1}
+ specs = {IsStr: IsInt,
+ "B": IsInt(default=-1),
+ "C": IsInt(default=-2)}
+ expected = {"A": 1, "B": -1, "C": -2}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__sets_missing_recursive():
+ current = {"A": 1, "B": {"C": 2}}
+ specs = {"A": IsInt(default=0),
+ "B": {"C": IsInt(default=-1),
+ "D": IsInt(default=-2)}}
+ expected = {"A": 1, "B": {"C": 2, "D": -2}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__sets_missing_recursive__with_missing_substructure():
+ current = {"A": 1}
+ specs = {"A": IsInt(default=0),
+ "B": {"C": IsInt(default=-1),
+ "D": IsInt(default=-2)}}
+ expected = {"A": 1, "B": {"C": -1, "D": -2}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__shared_subtrees_with_defaults():
+ subtree = {"A": IsInt(default=1234),
+ "B": IsInt(default=5678)}
+ specs = {"A": subtree,
+ "B": subtree}
+ current = {"A": {"B": 17},
+ "B": {"A": 71}}
+ expected = {"A": {"A": 1234, "B": 17},
+ "B": {"A": 71, "B": 5678}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__shared_subtrees_with_defaults__defaults_disabled():
+ subtree = {"A": IsInt(default=1234),
+ "B": IsInt(default=5678)}
+ specs = {"A": subtree,
+ "B": WithoutDefaults(subtree)}
+ current = {"A": {"B": 17},
+ "B": {"A": 71}}
+ expected = {"A": {"A": 1234, "B": 17},
+ "B": {"A": 71}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__accept_when_required_value_is_set():
+ current = {"A": 1, "B": {"C": 3}}
+ expected = {"A": 1, "B": {"C": 3}}
+ specs = {"A": IsInt, "B": {"C": IsInt(default=REQUIRED_VALUE)}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__fails_when_required_value_not_set():
+ current = {"A": 1}
+ specs = {"A": IsInt, "B": {"C": IsInt(default=REQUIRED_VALUE)}}
+ assert_raises(MakefileError, process_makefile, current, specs)
+
+
+def test_process_makefile__fails_required_value_not_set_in_dynamic_subtree():
+ current = {"A": 1, "B": {}}
+ specs = {"A": IsInt, IsStr: {"C": IsInt(default=REQUIRED_VALUE)}}
+ assert_raises(MakefileError, process_makefile, current, specs)
+
+
+def test_process_makefile__accept_missing_value_if_in_implicit_subtree():
+ current = {"A": 1}
+ expected = {"A": 1}
+ specs = {"A": IsInt, IsStr: {"C": IsInt(default=REQUIRED_VALUE)}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__path_shown_in_exception_for_list():
+ assert_raises_regexp(MakefileError, _DUMMY_PATH_STR,
+ process_makefile, {}, [], _DUMMY_PATH)
+
+
+def test_process_makefile__path_shown_in_exception_for_dict():
+ assert_raises_regexp(MakefileError, _DUMMY_PATH_STR,
+ process_makefile, [], {}, _DUMMY_PATH)
+
+
+def test_process_makefile__implicit_subdict_is_allowed():
+ current = {"A": 1, "B": None}
+ expected = {"A": 1, "B": {"C": 3}}
+ specs = {"A": IsInt, "B": {"C": IsInt(default=3)}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+###############################################################################
+###############################################################################
+# process_makefile -- lists
+
+def test_process_makefile__list_types_accepted():
+ current = {"A": 1, "B": [17, "Foo"]}
+ expected = {"A": 1, "B": [17, "Foo"]}
+ specs = {"A": IsInt, "B": [IsInt, IsStr]}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__wrong_list_types():
+ current = {"A": 1, "B": [17, "foo"]}
+ specs = {"A": IsInt, "B": [IsInt]}
+ assert_raises(MakefileError, process_makefile, current, specs)
+
+
+def test_process_makefile__missing_list_defaults_to_empty():
+ current = {"A": 1}
+ expected = {"A": 1, "B": {"C": []}}
+ specs = {"A": IsInt, "B": {"C": [IsInt]}}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__missing_list_default_value():
+ current = {"A": 1}
+ expected = {"A": 1, "B": [1, 2, 3]}
+ specs = {"A": IsInt, "B": IsListOf(IsInt, default=[1, 2, 3])}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__key_specified_but_no_entries():
+ current = {"A": 1, "B": None}
+ expected = {"A": 1, "B": []}
+ specs = {"A": IsInt, "B": [IsInt]}
+ result = process_makefile(current, specs)
+ assert_equal(result, expected)
+
+
+def test_process_makefile__list_spec_must_contain_specs():
+ specs = {"A": IsInt, "B": [1, 2, 3]}
+ assert_raises(TypeError, process_makefile, {}, specs)
+
+
+def test_process_makefile__list_spec_must_contain_only_specs():
+ specs = {"A": IsInt, "B": [1, 2, IsStr]}
+ assert_raises(TypeError, process_makefile, {}, specs)
+
+
+###############################################################################
+###############################################################################
+# read_makefile
+
+def test_read_makefile__missing_file():
+ assert_raises(IOError, read_makefile, "does_not_exist.yaml", {})
+
+
+def test_read_makefile__not_a_yaml_file():
+ fpath = test_file("fasta_file.fasta")
+ assert_raises(MakefileError, read_makefile, fpath, {})
+
+
+def test_read_makefile__missing_simple_file():
+ specs = {"Defaults": {"First": IsFloat, "Second": IsStr}}
+ expected = {
+ "Makefile": {"Defaults": {"First": 1e-4,
+ "Second": "a string"}},
+ "Statistics": {
+ "Filename": test_file("simple.yaml"),
+ "Hash": "563a2052b67dcde9f193fbe8d51fa2b6f0806505",
+ "MTime": "2005-07-07 08:50:00",
+ }
+ }
+ result = read_makefile(test_file("simple.yaml"), specs)
+ assert_equal(expected, result)
+
+
+###############################################################################
+###############################################################################
+# PreProcessMakefile
+
+class _PreProcess(PreProcessMakefile):
+ def __call__(self, path, value):
+ if isinstance(value, types.StringTypes):
+ return int(value), IsInt
+
+ return value, IsInt
+
+
+def test__preprocess_makefile__missing_value():
+ spec = {"Key": _PreProcess()}
+ assert_equal({}, process_makefile({}, spec))
+
+
+def test__preprocess_makefile__expected_value():
+ spec = {"Key": _PreProcess()}
+ assert_equal({"Key": 13}, process_makefile({"Key": 13}, spec))
+
+
+def test__preprocess_makefile__processed_value():
+ spec = {"Key": _PreProcess()}
+ assert_equal({"Key": 14}, process_makefile({"Key": "14"}, spec))
+
+
+def test__preprocess_makefile__invalid_value():
+ spec = {"Key": _PreProcess()}
+ assert_raises(MakefileError, process_makefile, {"Key": False}, spec)
+
+
+def test__preprocess_makefile__invalid_string():
+ spec = {"Key": _PreProcess()}
+ # Failures in processing should propagate out
+ assert_raises(ValueError, process_makefile, {"Key": "x14"}, spec)
+
+
+class _PreProcessWithDefault(PreProcessMakefile):
+ def __init__(self, default):
+ self._default = default
+
+ def __call__(self, path, value):
+ if isinstance(value, types.StringTypes):
+ return int(value), IsInt
+
+ return value, IsInt(default=self._default)
+
+
+def test__preprocess_makefile__with_default__missing_value():
+ spec = {"Key": _PreProcessWithDefault(314)}
+ assert_equal({"Key": 314}, process_makefile({}, spec))
+
+
+def test__preprocess_makefile__with_default__expected_value():
+ spec = {"Key": _PreProcessWithDefault(314)}
+ assert_equal({"Key": 14}, process_makefile({"Key": 14}, spec))
diff --git a/tests/common_tests/sampling_tests.py b/tests/common_tests/sampling_tests.py
new file mode 100644
index 0000000..f9f40e9
--- /dev/null
+++ b/tests/common_tests/sampling_tests.py
@@ -0,0 +1,132 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+from nose.tools import \
+ assert_equal, \
+ assert_raises
+from flexmock import \
+ flexmock
+
+import paleomix.common.sampling as sampling
+
+
+###############################################################################
+###############################################################################
+# weighted_sampling
+
def test_weighted_sampling__select_by_weight():
    def _check(value, expectation):
        # Deterministic RNG: every call to 'random()' returns 'value'.
        rng = flexmock(random=lambda: value)
        sampler = sampling.weighted_sampling("abc", (1, 2, 3), rng)
        assert_equal(sampler.next(), expectation)

    # With weights (1, 2, 3) the cumulative boundaries lie at 1/6 and 3/6.
    cases = ((0.00000, 'a'),
             (0.16666, 'a'),  # just below 1/6
             (1 / 6.0, 'b'),
             (0.49999, 'b'),  # just below 3/6
             (3 / 6.0, 'c'),
             (0.99999, 'c'))
    for value, expectation in cases:
        yield _check, value, expectation
+
+
def test_weighted_sampling__empty_input_raises_value_error_for_lists():
    def _check(choices, weights):
        # The ValueError is only raised lazily, on the first draw.
        sampler = sampling.weighted_sampling(choices, weights)
        assert_raises(ValueError, sampler.next)

    for choices, weights in (([], []), ([], [1, 2]), ([1, 2], [])):
        yield _check, choices, weights
+
+
def test_weighted_sampling__different_length_input_raises_value_error():
    def _check(choices, weights):
        # Length mismatches are detected on the first draw.
        sampler = sampling.weighted_sampling(choices, weights)
        assert_raises(ValueError, sampler.next)

    yield _check, [0, 1], [1, 2, 3]
    yield _check, [0, 1, 2], [1, 2]
    # The same holds when either or both inputs are plain iterators:
    yield _check, iter([0, 1]), [1, 2, 3]
    yield _check, [0, 1], iter([1, 2, 3])
    yield _check, iter([0, 1]), iter([1, 2, 3])
+
+
def test_weighted_sampling__negative_weight_value_error():
    # Negative weights are invalid and rejected on the first draw.
    sampler = sampling.weighted_sampling(range(3), [1, -2, 3])
    assert_raises(ValueError, sampler.next)
+
+
def test_weighted_sampling__zero_weight_raises_value_error():
    # Zero weights are invalid as well, and rejected on the first draw.
    sampler = sampling.weighted_sampling(range(3), [1, 0, 3])
    assert_raises(ValueError, sampler.next)
+
+
def test_weighted_sampling__non_numerical_weight_raises_type_error():
    # Weights must be numeric; a string weight triggers a TypeError.
    sampler = sampling.weighted_sampling(range(3), [1, "foo", 3])
    assert_raises(TypeError, sampler.next)
+
+
+###############################################################################
+###############################################################################
+# reservoir_sampling
+
def test_reservoir_sampling__select_first_item():
    # randint is pinned to 1, so the initial reservoir item is retained.
    rng = flexmock(randint=lambda _min, _max: 1)
    assert_equal([1], sampling.reservoir_sampling([1, 2], 1, rng))
+
+
def test_reservoir_sampling__select_second_item():
    # randint is pinned to 0, so the second item replaces the first.
    rng = flexmock(randint=lambda _min, _max: 0)
    assert_equal([2], sampling.reservoir_sampling([1, 2], 1, rng))
+
+
def test_reservoir_sampling__upsample_equals_input():
    # Requesting more items than are available returns the full input.
    assert_equal(range(5), sampling.reservoir_sampling(range(5), 10))
+
+
def test_reservoir_sampling__downsample_to_zero():
    # A sample size of zero yields an empty selection.
    assert_equal([], sampling.reservoir_sampling(range(5), 0))
+
+
def test_reservoir_sampling__downsample_to_negative_raises_value_error():
    # Negative sample sizes are rejected outright.
    args = (sampling.reservoir_sampling, range(5), -1)
    assert_raises(ValueError, *args)
+
+
def test_reservoir_sampling__downsample_to_float_raises_type_error():
    # The sample size must be an integer, not a float.
    args = (sampling.reservoir_sampling, range(5), 1.0)
    assert_raises(TypeError, *args)
+
+
def test_reservoir_sampling__downsample_to_non_number_raises_type_error():
    # Non-numeric sample sizes are likewise rejected.
    args = (sampling.reservoir_sampling, range(5), "Eh?")
    assert_raises(TypeError, *args)
diff --git a/tests/common_tests/sequences_test.py b/tests/common_tests/sequences_test.py
new file mode 100644
index 0000000..04ca185
--- /dev/null
+++ b/tests/common_tests/sequences_test.py
@@ -0,0 +1,228 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import itertools
+import nose.tools
+from nose.tools import assert_equal
+
+from paleomix.common.sequences import \
+ complement, \
+ reverse_complement, \
+ encode_genotype, \
+ count_nts, \
+ count_gc_diploid, \
+ split
+
+
+###############################################################################
+###############################################################################
+# Tests for 'complement'
+
+_REF_SRC = "ACGTMRWSYKVHDBNX"
+_REF_DST = "TGCAKYWSRMBDHVNX"
+_REF_PAIRS = zip(_REF_SRC, _REF_DST)
+
+
def test_complement__single_nt():
    def _check(source, destination):
        assert_equal(complement(source), destination)

    # Every IUPAC code maps to its complement, in both upper and lower case.
    for src, dest in _REF_PAIRS:
        yield _check, src, dest
        yield _check, src.lower(), dest.lower()
+
+
+def test_complement__multiple_nts_upper():
+ assert_equal(complement(_REF_SRC), _REF_DST)
+
+
+def test_complement__multiple_nts_lower():
+ assert_equal(complement(_REF_SRC.lower()), _REF_DST.lower())
+
+
def test_complement__multiple_nts_mixed_case():
    # Each position keeps its original case when complemented.
    assert_equal("tCaGn", complement("aGtCn"))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'reverse_complement'
+
def test_reverse_complement():
    # The reverse complement is the complement read back-to-front.
    assert_equal(_REF_DST[::-1], reverse_complement(_REF_SRC))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'encode_genotype'
+
+_IUB_SRC = ("A", "C", "G", "T",
+ "AC", "AG", "AT", "CG", "CT", "GT",
+ "ACG", "ACT", "AGT", "CGT", "ACGT")
+_IUB_DST = "ACGTMRWSYKVHDB"
+_IUB_PAIRS = zip(_IUB_SRC, _IUB_DST)
+
+
def test_genotype__permutations():
    def _check(nucleotides, encoding):
        assert_equal(encode_genotype(nucleotides), encoding)

    # Encoding is order-independent: every permutation of the source
    # nucleotides maps to the same IUB code.
    for src, dst in _IUB_PAIRS:
        for permutation in itertools.permutations(src):
            yield _check, "".join(permutation), dst
+
+
def test_genotype__bad_input__lowercase():
    # Lower-case nucleotides are not accepted.
    nose.tools.assert_raises(ValueError, encode_genotype, "a")
+
+
+ at nose.tools.raises(ValueError)
+def test_genotype__bad_input__mixedcase():
+ encode_genotype("At")
+
+
def test_genotype__bad_input__unknown_nucleotide():
    # 'Z' is not a nucleotide code.
    nose.tools.assert_raises(ValueError, encode_genotype, "Z")
+
+
+ at nose.tools.raises(ValueError)
+def test_genotype__bad_input__non_nucleotide():
+ encode_genotype("+")
+
+
def test_comma_or_not():
    def _check(sequence):
        assert_equal(encode_genotype(sequence), "Y")

    # Separating commas, including leading/trailing ones, are ignored.
    for sequence in ("CT", "C,T", ",C,T", "C,T,", ",C,T,"):
        yield _check, sequence
+
+
+###############################################################################
+###############################################################################
+# Tests for 'count_nts'
+
def test_count_nts__empty_str():
    # No nucleotides, no counts.
    assert_equal({}, count_nts(""))
+
+
+def test_count_nts__simple():
+ assert_equal(count_nts("TTATGTGTCT"), {"A": 1, "C": 1, "G": 2, "T": 6})
+
+
def test_count_nts__simple_mixed_case():
    # Counting is case-insensitive; the result uses upper-case keys.
    expected = {"A": 3, "C": 3, "G": 1, "T": 2, "N": 1}
    assert_equal(expected, count_nts("atATNcaGCC"))
+
+
def test_count_nts__complex():
    # IUPAC ambiguity codes are counted like any other symbol.
    expected = {"N": 2, "B": 1, "K": 1, "V": 2, "G": 1, "R": 1, "M": 2}
    assert_equal(expected, count_nts("NNBKVGRVMM"))
+
+
def test_count_nts__complex_mixed_case():
    # Mixed-case variant of test_count_nts__complex; counting is
    # case-insensitive, so the totals match the all-upper-case input.
    # (The original test mistakenly re-checked the empty string,
    # duplicating test_count_nts__empty_str instead of testing this.)
    expected = {"N": 2, "B": 1, "K": 1, "V": 2, "G": 1, "R": 1, "M": 2}
    assert_equal(expected, count_nts("nnBKvGRvMM"))
+
+
def test_count_nts__bad_nt():
    # 'P' is not a valid IUPAC nucleotide code.
    nose.tools.assert_raises(ValueError, count_nts, "TTAPGTGTCT")
+
+
+ at nose.tools.raises(ValueError)
+def test_count_nts__bad_nt_lowercase():
+ count_nts("atATNcoGCC") # 'o' not UIPAC
+
+
+###############################################################################
+###############################################################################
+# Tests for 'count_gc_diploid'
+
def test_count_gc_diploid__empty_string():
    # Empty input: zero GC out of a zero total.
    assert_equal((0, 0), count_gc_diploid(""))
+
+
def test_count_gc_diploid__simple():
    # 5 of 10 positions are G/C; diploid counting doubles both numbers.
    assert_equal((10, 20), count_gc_diploid("ACGGGTATCA"))
+
+
+def test_count_gc_diploid__simple_mixed_case():
+ assert_equal(count_gc_diploid("AcGGgtAtcA"), (10, 20))
+
+
+def test_count_gc_diploid__complex():
+ assert_equal(count_gc_diploid("RTGTKMGTCA"), (9, 20))
+
+
+def test_count_gc_diploid__complex_mixed_case():
+ assert_equal(count_gc_diploid("rtGTKMgTcA"), (9, 20))
+
+
+def test_count_gc_dipoid__ambigious():
+ assert_equal(count_gc_diploid("ACGGNTATCA"), (8, 18))
+
+
+def test_count_gc_diploid__ambigous_mixed_case():
+ assert_equal(count_gc_diploid("AcGGntAtcA"), (8, 18))
+
+
def test_count_gc_diploid__triploid():
    # Codes spanning three nucleotides (e.g. 'B') cannot be resolved
    # for a diploid count and are rejected.
    nose.tools.assert_raises(ValueError, count_gc_diploid, "B")
+
+
+###############################################################################
+###############################################################################
+# Tests for 'split'
+
def test_split__empty_sequence():
    # An empty sequence still produces all three (empty) groups.
    assert_equal({"1": "", "2": "", "3": ""}, split(""))
+
+
def test_split__no_split_by():
    # An empty 'split_by' key is meaningless and rejected.
    nose.tools.assert_raises(TypeError, split, "", "")
+
+
+def test_split__single_group():
+ assert_equal(split("ACGCAT", "111"), {'1': 'ACGCAT'})
+
+
+def test_split__two_groups():
+ assert_equal(split("ACGCAT", "112"), {"1": "ACCA", "2": "GT"})
+
+
def test_split__three_groups():
    # "123" is also the default grouping when 'split_by' is omitted.
    expected = {"1": "AC", "2": "CA", "3": "GT"}
    assert_equal(expected, split("ACGCAT", "123"))
    assert_equal(expected, split("ACGCAT"))
+
+
+def test_split__empty_group():
+ expected = {"1": "A", "2": "C", "3": ""}
+ assert_equal(split("AC"), expected)
+
+
+def test_split__partial_group():
+ expected = {"1": "AA", "2": "CA", "3": "G"}
+ assert_equal(split("ACGAA"), expected)
diff --git a/tests/common_tests/signals_test.py b/tests/common_tests/signals_test.py
new file mode 100644
index 0000000..4bc76a8
--- /dev/null
+++ b/tests/common_tests/signals_test.py
@@ -0,0 +1,56 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+
+import signal
+import paleomix.common.signals as signals
+
+import nose
+from nose.tools import assert_equal
+
+
def test_signal__sigterm_to_str():
    # Known signal numbers map to their canonical names.
    assert_equal("SIGTERM", signals.to_str(signal.SIGTERM))
+
+
def test_signal__str_to_sigterm():
    # Canonical names map back to their signal numbers.
    assert_equal(signal.SIGTERM, signals.from_str("SIGTERM"))
+
+
@nose.tools.raises(KeyError)
def test_signal__to_str__unknown_signal():
    # 1024 does not correspond to any known signal.
    signals.to_str(1024)
+
+
@nose.tools.raises(KeyError)
def test_signal__from_str__unknown_signal():
    # "SIGSMURF" is not a known signal name.
    signals.from_str("SIGSMURF")
+
+
@nose.tools.raises(TypeError)
def test_signal__to_str__wrong_type():
    # to_str expects a signal number, not a name.
    signals.to_str("SIGTERM")
+
+
@nose.tools.raises(TypeError)
def test_signal__from_str__wrong_type():
    # from_str expects a signal name, not a number.
    signals.from_str(signal.SIGTERM)
diff --git a/tests/common_tests/text_tests.py b/tests/common_tests/text_tests.py
new file mode 100644
index 0000000..26c739b
--- /dev/null
+++ b/tests/common_tests/text_tests.py
@@ -0,0 +1,289 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Disable warnings on strange function names
+# pylint: disable=C0103
+
+
+import collections
+
+import nose.tools
+from nose.tools import assert_equal
+
+from paleomix.common.text import \
+ TableError, \
+ padded_table, \
+ parse_padded_table, \
+ parse_lines, \
+ parse_lines_by_contig
+
+
+###############################################################################
+###############################################################################
+# Tests for 'padded_table'
+
def _padded_table(*args, **kwargs):
    # Materialize the lazy padded_table generator so asserts can
    # compare plain lists.
    return list(padded_table(*args, **kwargs))
+
+
def test_padded_table__empty():
    # No rows in, no lines out.
    assert_equal([], _padded_table(()))
+
+
+def test_padded_table__single_line():
+ table = [(1, 20, 3000)]
+ expected = ["1 20 3000"]
+ assert_equal(expected, _padded_table(table))
+
+
+def test_padded_table__two_lines():
+ table = [(1, 20, 3000),
+ (3000, 20, 1)]
+ expected = ["1 20 3000",
+ "3000 20 1"]
+ assert_equal(expected, _padded_table(table))
+
+
+def test_padded_table__three_lines():
+ table = [(1, 20, 3000),
+ (3000, 20, 1),
+ (1, 2, 30)]
+ expected = ["1 20 3000",
+ "3000 20 1",
+ "1 2 30"]
+ assert_equal(expected, _padded_table(table))
+
+
+def test_padded_table__with_text():
+ row_1, line_1 = (1, 20, 3000), "1 20 3000"
+ row_2, line_2 = (3000, 20, 1), "3000 20 1"
+ comment = "# An insightful comment goes here"
+
+ def _do_test_padded_table__padding__with_text(table, expected):
+ assert_equal(expected, _padded_table(table))
+ yield _do_test_padded_table__padding__with_text, \
+ [comment, row_1, row_2], [comment, line_1, line_2]
+ yield _do_test_padded_table__padding__with_text, \
+ [row_1, comment, row_2], [line_1, comment, line_2]
+ yield _do_test_padded_table__padding__with_text, \
+ [row_1, row_2, comment], [line_1, line_2, comment]
+
+
@nose.tools.raises(TableError)
def test_padded_table__misshapen_table_1():
    # A row with too few columns is rejected.
    rows = [(1, 20, 3000),
            (3000, 20),
            (1, 2, 3)]
    _padded_table(rows)
+
+
+ at nose.tools.raises(TableError)
+def test_padded_table__misshapen_table_2():
+ _padded_table([(1, 20, 3000),
+ (3000, 20, 1, 0),
+ (1, 2, 3)])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'parse_padded_table'
+
def _parse_padded_table(*args, **kwargs):
    # Materialize the lazy parse_padded_table generator for comparison.
    return list(parse_padded_table(*args, **kwargs))
+
+
+def test_parse_padded_table__empty():
+ assert_equal([], _parse_padded_table([]))
+
+
+def test_parse_padded_table__header_only():
+ assert_equal([], _parse_padded_table(["A B C D"]))
+
+
+def test_parse_padded_table__single_row():
+ table = ["A B C D",
+ "4 3 2 1"]
+ expected = [{"A": '4', "B": '3', "C": '2', "D": '1'}]
+ assert_equal(expected, _parse_padded_table(table))
+
+
+def test_parse_padded_table__two_rows():
+ table = ["A B C D",
+ "4 3 2 1",
+ "AB CD EF GH"]
+ expected = [{"A": '4', "B": '3', "C": '2', "D": '1'},
+ {"A": "AB", "B": "CD", "C": "EF", "D": "GH"}]
+ assert_equal(expected, _parse_padded_table(table))
+
+
+# Any amount of whitespace is allowed
+def test_parse_padded_table__single_row__with_whitespace():
+ table = ["A B C E F",
+ "1 0 1 2 3"]
+ expected = [{"A": '1', "B": '0', "C": '1', "E": '2', "F": '3'}]
+ assert_equal(expected, _parse_padded_table(table))
+
+
+# Other whitespace should be ignored
+def test_parse_padded_table__single_row__with_tabs():
+ table = ["A\t\t\t\tB",
+ "1\t\t\t\t0"]
+ expected = [{"A": '1', "B": '0'}]
+ assert_equal(expected, _parse_padded_table(table))
+
+
+def test_padded_table__comments_and_empty_lines():
+ def _do_test_padded_table__comments(lines):
+ expected = [{"A": '3000', "B": '20', "C": '1'}]
+ assert_equal(expected, _parse_padded_table(lines))
+ line_1 = "A B C"
+ line_2 = "3000 20 1"
+
+ yield _do_test_padded_table__comments, ["# comment", line_1, line_2]
+ yield _do_test_padded_table__comments, [line_1, " # comment", line_2]
+ yield _do_test_padded_table__comments, [line_1, line_2, " # comment"]
+
+ yield _do_test_padded_table__comments, ["", line_1, line_2]
+ yield _do_test_padded_table__comments, [line_1, " ", line_2]
+ yield _do_test_padded_table__comments, [line_1, line_2, " "]
+
+
+def test_padded_table__newlines():
+ expected = [{"A": '3000', "B": '20', "C": '1'}]
+
+ def _do_test_padded_table__padding__comments(postfix):
+ line_1 = "A B C" + postfix
+ line_2 = "3000 20 1" + postfix
+ assert_equal(expected, _parse_padded_table([line_1, line_2]))
+
+ yield _do_test_padded_table__padding__comments, "\r"
+ yield _do_test_padded_table__padding__comments, "\n"
+ yield _do_test_padded_table__padding__comments, "\r\n"
+
+
+def test_padded_table__padding__comments__whitespace():
+ expected = [{"A": '3000', "B": '20', "C": '1'}]
+ lines = ["A B C",
+ "3000 20 1",
+ " # useless comment"]
+ assert_equal(expected, _parse_padded_table(lines))
+
+
+ at nose.tools.raises(TableError)
+def test_parse_padded_table__malformed_table_0():
+ table = ["A B C D",
+ "4 3 2"]
+ _parse_padded_table(table)
+
+
+ at nose.tools.raises(TableError)
+def test_parse_padded_table__malformed_table_1():
+ table = ["A B C D",
+ "4 3 2 1 0"]
+ _parse_padded_table(table)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'parse_lines'
+def _this(*args):
+ return args
+
+
def _parse_lines(*args, **kwargs):
    # Materialize the lazy parse_lines generator for comparison.
    return list(parse_lines(*args, **kwargs))
+
+
+def test_parse_lines__empty_file():
+ def _assert_false():
+ assert False # pragma: no coverage
+ assert_equal(_parse_lines([], _assert_false), [])
+
+
def test_parse_lines__single():
    # The parser receives the line stripped of trailing whitespace,
    # together with the stripped length.
    assert_equal([("abc line1", 9)], _parse_lines(["abc line1 \n"], _this))
+
+
+def test_parse_lines__comments_and_empty_lines():
+ def _do_test_parse_lines__comments(lines):
+ expected = [("abc line1", 9), ("def line2", 9)]
+ assert_equal(_parse_lines(lines, _this), expected)
+ yield _do_test_parse_lines__comments, \
+ ["# comment\n", "abc line1 \n", "def line2 \n"]
+ yield _do_test_parse_lines__comments, \
+ ["abc line1 \n", " # comment\n", "def line2 \n"]
+ yield _do_test_parse_lines__comments, \
+ ["abc line1 \n", "def line2 \n", " # comment\n"]
+
+ yield _do_test_parse_lines__comments, \
+ ["\n", "abc line1 \n", "def line2 \n"]
+ yield _do_test_parse_lines__comments, \
+ ["abc line1 \n", " \n", "def line2 \n"]
+ yield _do_test_parse_lines__comments, \
+ ["abc line1 \n", "def line2 \n", " \n"]
+
+
+def test_parse_lines__padding__newlines():
+ expected = [("abc line1", 9), ("def line2", 9)]
+
+ def _do_test_padded_table__padding__comments(postfix):
+ line_1 = "abc line1 " + postfix
+ line_2 = "def line2 " + postfix
+ assert_equal(expected, _parse_lines([line_1, line_2], _this))
+
+ yield _do_test_padded_table__padding__comments, "\r"
+ yield _do_test_padded_table__padding__comments, "\n"
+ yield _do_test_padded_table__padding__comments, "\r\n"
+
+
@nose.tools.raises(TypeError)
def test_parse_lines__uncallable():
    # The parser argument must be callable.
    _parse_lines([], 1)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'parse_lines_by_contig'
+_RecordMock = collections.namedtuple("_RecordMock", "contig value")
+
+
+def test_parse_lines_by_contig__single_contig():
+ lines = ["abc line1 \n", "abc line2 \n"]
+
+ def _parse(line, length):
+ assert_equal(len(line), length)
+ return _RecordMock(*line.split())
+
+ expected = {"abc": [_RecordMock("abc", "line1"),
+ _RecordMock("abc", "line2")]}
+ assert_equal(parse_lines_by_contig(lines, _parse), expected)
+
+
+def test_parse_lines__two_contigs():
+ lines = ["abc line1 \n", "def line2 \n"]
+
+ def _parse(line, length):
+ assert_equal(len(line), length)
+ return _RecordMock(*line.split())
+
+ expected = {"abc": [_RecordMock("abc", "line1")],
+ "def": [_RecordMock("def", "line2")]}
+ assert_equal(parse_lines_by_contig(lines, _parse), expected)
diff --git a/tests/common_tests/utilities_test.py b/tests/common_tests/utilities_test.py
new file mode 100644
index 0000000..af730dd
--- /dev/null
+++ b/tests/common_tests/utilities_test.py
@@ -0,0 +1,727 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import random
+import pickle
+import operator
+import nose.tools
+from nose.tools import \
+ assert_equal, \
+ assert_raises
+
+
+import paleomix.common.utilities as utils
+
+
+################################################################################
+################################################################################
+# Tests for 'safe_coerce_to_tuple'
+
def test_safe_coerce_to_tuple__str():
    # Strings are treated as scalars, not as sequences of characters.
    assert_equal(("foo",), utils.safe_coerce_to_tuple("foo"))
+
+
+def test_safe_coerce_to_tuple__unicode():
+ assert_equal(utils.safe_coerce_to_tuple(u"foo"), (u"foo",))
+
+
+def test_safe_coerce_to_tuple__int():
+ assert_equal(utils.safe_coerce_to_tuple(17), (17,))
+
+
+def test_safe_coerce_to_tuple__list():
+ assert_equal(utils.safe_coerce_to_tuple([1, 3, 2]), (1, 3, 2))
+
+
+def test_safe_coerce_to_tuple__tuple():
+ assert_equal(utils.safe_coerce_to_tuple((1, 3, 2)), (1, 3, 2))
+
+
+def test_safe_coerce_to_tuple__iterable():
+ assert_equal(utils.safe_coerce_to_tuple(xrange(3)), (0, 1, 2))
+
+
+def test_safe_coerce_to_tuple__dict():
+ assert_equal(utils.safe_coerce_to_tuple({1: 2, 3: 4}), ({1: 2, 3: 4},))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'safe_coerce_to_frozenset'
+
+def test_safe_coerce_to_frozenset__str():
+ assert_equal(utils.safe_coerce_to_frozenset("foo"), frozenset(("foo",)))
+
+
+def test_safe_coerce_to_frozenset__unicode():
+ assert_equal(utils.safe_coerce_to_frozenset(u"foo"), frozenset((u"foo",)))
+
+
+def test_safe_coerce_to_frozenset__int():
+ assert_equal(utils.safe_coerce_to_frozenset(17), frozenset((17,)))
+
+
+def test_safe_coerce_to_frozenset__list():
+ assert_equal(utils.safe_coerce_to_frozenset([1, 3, 2]), frozenset((1, 3, 2)))
+
+
+def test_safe_coerce_to_frozenset__tuple():
+ assert_equal(utils.safe_coerce_to_frozenset((1, 3, 2)), frozenset(((1, 3, 2))))
+
+
+def test_safe_coerce_to_frozenset__iterable():
+ assert_equal(utils.safe_coerce_to_frozenset(xrange(3)), frozenset((0, 1, 2)))
+
+
+ at nose.tools.raises(TypeError)
+def test_safe_coerce_to_frozenset__dict():
+ utils.safe_coerce_to_frozenset({1: 2, 3: 4})
+
+
+###############################################################################
+###############################################################################
+# Tests for 'try_cast'
+
+def test_try_cast__int_to_int():
+ assert_equal(utils.try_cast(17, int), 17)
+
+
+def test_try_cast__float_to_int():
+ assert_equal(utils.try_cast(17.3, int), 17)
+
+
+def test_try_cast__good_str_to_int():
+ assert_equal(utils.try_cast("17", int), 17)
+
+
def test_try_cast__bad_str_to_int():
    # Values that cannot be cast are returned unchanged.
    assert_equal("x17", utils.try_cast("x17", int))
+
+
+def test_try_cast__list_to_int():
+ assert_equal(utils.try_cast([1, 2, 3], int), [1, 2, 3])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'crc32'
+
def test_crc32_is_unsigned():
    # This input is known to yield a negative zlib.crc32 value on
    # Python 2.7.2; the wrapper must normalize to an unsigned result.
    assert utils.crc32("Nobody inspects the spammish repetition") >= 0
+
+
+###############################################################################
+###############################################################################
+# Tests for 'set_in'
+
def test_set_in__single_kw_in_empty_dictionary():
    # A single-keyword path behaves like a plain item assignment.
    dictionary = {}
    utils.set_in(dictionary, ["Foo"], 17)
    assert_equal({"Foo": 17}, dictionary)
+
+
+def test_set_in__two_kws_in_empty_dictionary():
+ value = {}
+ utils.set_in(value, ["Foo", 13], 17)
+ assert_equal(value, {"Foo": {13: 17}})
+
+
+def test_set_in__three_kws_in_empty_dictionary():
+ value = {}
+ utils.set_in(value, ["Foo", 13, (1, 2)], 17)
+ assert_equal(value, {"Foo": {13: {(1, 2): 17}}})
+
+
+def test_set_in__three_kws_in_partial_dictionary():
+ value = {"Foo": {12: 0}}
+ utils.set_in(value, ["Foo", 13, (1, 2)], 17)
+ assert_equal(value, {"Foo": {12: 0, 13: {(1, 2): 17}}})
+
+ value = {"Foo": {13: {"Bar": None}}}
+ utils.set_in(value, ["Foo", 13, (1, 2)], 17)
+ assert_equal(value, {"Foo": {13: {(1, 2): 17, "Bar": None}}})
+
+
+def test_set_in__update_value_one_kw():
+ value = {1: None}
+ utils.set_in(value, [1], 3.14)
+ assert_equal(value, {1: 3.14})
+
+
+def test_set_in__update_value_two_kw():
+ value = {1: {2: 3}}
+ utils.set_in(value, [1, 2], 365)
+ assert_equal(value, {1: {2: 365}})
+
+
+ at nose.tools.raises(ValueError)
+def test_set_in__fail_on_no_kws():
+ utils.set_in({}, [], 17)
+
+
+ at nose.tools.raises(TypeError)
+def test_set_in__fail_on_invalid_sub_dictionary_first_level():
+ utils.set_in(None, [1], 17)
+
+
+ at nose.tools.raises(TypeError)
+def test_set_in__fail_on_invalid_sub_dictionary_second_level():
+ utils.set_in({1: None}, [1, 2], 17)
+
+
+ at nose.tools.raises(TypeError)
+def test_set_in__fail_on_invalid_sub_dictionary_third_level():
+ utils.set_in({1: {2: None}}, [1, 2, 3], 17)
+
+
+def test_set_in__iteratable_keywords():
+ value = {}
+ utils.set_in(value, iter(["Foo", 13, (1, 2)]), 17)
+ assert_equal(value, {"Foo": {13: {(1, 2): 17}}})
+
+
+###############################################################################
+###############################################################################
+# Tests for 'get_in'
+
+def test_get_in__get_value_one_keyword():
+ assert_equal(utils.get_in({1: 2}, [1]), 2)
+
+
+def test_get_in__get_value_two_keywords():
+ assert_equal(utils.get_in({1: {2: 3}}, [1, 2]), 3)
+
+
+def test_get_in__get_value_three_keywords():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 2, 3]), 4)
+
+
+def test_get_in__get_default_one_keyword():
+ assert_equal(utils.get_in({1: 2}, [2]), None)
+
+
+def test_get_in__get_default_one_keyword_with_default():
+ assert_equal(utils.get_in({1: 2}, [2], "other"), "other")
+
+
+def test_get_in__get_default_three_keywords_fail_at_first():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [2, 2, 4]), None)
+
+
+def test_get_in__get_default_three_keywords_fail_at_first_with_default():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [2, 2, 4], "other"), "other")
+
+
+def test_get_in__get_default_three_keywords_fail_at_second():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 3, 4]), None)
+
+
+def test_get_in__get_default_three_keywords_fail_at_second_with_default():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 3, 4], "other"), "other")
+
+
+def test_get_in__get_default_three_keywords_fail_at_third():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 2, 4]), None)
+
+
+def test_get_in__get_default_three_keywords_fail_at_third_with_default():
+ assert_equal(utils.get_in({1: {2: {3: 4}}}, [1, 2, 4], "other"), "other")
+
+
def test_get_in__iterator_keywords():
    # Keyword paths may be arbitrary iterables, not just lists.
    assert_equal(4, utils.get_in({1: {2: {3: 4}}}, iter([1, 2, 3])))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'split_before'
+
def _do_split(lst, key):
    # Conversion to a list allows the implementation to be lazy,
    # while making comparisons in the asserts straightforward.
    return list(utils.split_before(lst, key))
+
+
+def test_split_before__split_empty_list():
+ assert_equal(_do_split([], None), [])
+
+
+def test_split_before__split_list_with_no_true_pred():
+ assert_equal(_do_split(range(10), lambda x: False), [range(10)])
+
+
+def test_split_before__split_list_true_pred_at_first_position():
+ assert_equal(_do_split(range(4), lambda x: x % 2 == 0), [[0, 1], [2, 3]])
+
+
+def test_split_before__split_list_true_pred_at_second_position():
+ assert_equal(_do_split(range(4), lambda x: x % 2 == 1), [[0], [1, 2], [3]])
+
+
+def test_split_before__split_consequtive_true_pred():
+ assert_equal(_do_split(range(0, 5, 2), lambda x: x % 2 == 0), [[0], [2], [4]])
+
+
+def test_split_before__no_hits():
+ assert_equal(_do_split(range(1, 5), lambda x: x % 5 == 0), [range(1, 5)])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'is_strictly_increasing'
+
+def test_is_strictly_increasing__increasing_sequence():
+ assert utils.is_strictly_increasing(range(100))
+
+
+def test_is_strictly_increasing__non_increasing_sequence():
+ lst = range(100)
+ first, second = random.sample(lst, 2)
+ lst[first], lst[second] = lst[second], lst[first]
+
+ assert not utils.is_strictly_increasing(lst)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'grouper'
+
+def test_grouper__empty_list():
+ result = utils.grouper(3, [])
+ assert_equal(list(result), [])
+
+
+def test_grouper__non_empty_list():
+ result = utils.grouper(3, range(6))
+ expected = [(0, 1, 2), (3, 4, 5)]
+ assert_equal(list(result), expected)
+
+
+def test_grouper__non_empty_list_with_trailing():
+ result = utils.grouper(3, range(7))
+ expected = [(0, 1, 2), (3, 4, 5), (6, None, None)]
+ assert_equal(list(result), expected)
+
+
+def test_grouper__non_empty_list_with_trailing_fill_value():
+ result = utils.grouper(3, range(7), fillvalue=r'\0')
+ expected = [(0, 1, 2), (3, 4, 5), (6, r'\0', r'\0')]
+ assert_equal(list(result), expected)
+
+
+###############################################################################
+###############################################################################
+# Tests for 'group_by_pred'
+
+def test_group_by_pred__empty_list():
+ assert_equal(utils.group_by_pred(id, []), ([], []))
+
+
+def test_group_by_pred__always_false():
+ assert_equal(utils.group_by_pred(lambda x: False, [1, 2, 3]), ([], [1, 2, 3]))
+
+
+def test_group_by_pred__always_true():
+ assert_equal(utils.group_by_pred(lambda x: True, [1, 2, 3]), ([1, 2, 3], []))
+
+
+def test_group_by_pred__is_even():
+ assert_equal(utils.group_by_pred(lambda x: x % 2 == 0, [1, 2, 3]), ([2], [1, 3]))
+
+
+def test_group_by_pred__iterable():
+ assert_equal(utils.group_by_pred(lambda x: x % 2 == 0, xrange(1, 4)), ([2], [1, 3]))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'fragment'
+
+def test_fragment__empty():
+ assert_equal(list(utils.fragment(5, "")), [])
+ assert_equal(list(utils.fragment(5, [])), [])
+
+
+def test_fragment__partial_fragment():
+ assert_equal(list(utils.fragment(3, "ab")), ["ab"])
+ assert_equal(list(utils.fragment(3, ["a", "b"])), [["a", "b"]])
+
+
+def test_fragment__single_fragment():
+ assert_equal(list(utils.fragment(3, "abc")), ["abc"])
+ assert_equal(list(utils.fragment(3, ["a", "b", "c"])), [["a", "b", "c"]])
+
+
+def test_fragment__multiple_fragments():
+ assert_equal(list(utils.fragment(3, "abcdef")), ["abc", "def"])
+ assert_equal(list(utils.fragment(3, list("abcdef"))), [list("abc"), list("def")])
+
+
+def test_fragment__multiple_fragments_partial():
+ assert_equal(list(utils.fragment(3, "abcdefgh")), ["abc", "def", "gh"])
+ assert_equal(list(utils.fragment(3, list("abcdefgh"))), [list("abc"), list("def"), list("gh")])
+
+
+@nose.tools.raises(TypeError)
+def test_fragment__iterable():
+ list(utils.fragment(3, xrange(6)))
+
+
+@nose.tools.raises(TypeError)
+def test_fragment__set():
+ list(utils.fragment(3, set(range(6))))
+
+
+###############################################################################
+###############################################################################
+# Tests for 'cumsum'
+
+def test_cumsum__empty():
+ assert_equal(list(utils.cumsum([])), [])
+
+
+def test_cumsum__integers():
+ assert_equal(list(utils.cumsum(range(-4, 5))), [-4, -7, -9, -10, -10, -9, -7, -4, 0])
+
+
+def test_cumsum__float():
+ assert_equal(list(utils.cumsum((1.0, 2.0, 3.0))), [1.0, 3.0, 6.0])
+
+
+def test_cumsum__initial():
+ assert_equal(list(utils.cumsum(range(5), -10)), [-10, -9, -7, -4, 0])
+
+
+###############################################################################
+###############################################################################
+# Tests for 'fast_pickle_test'
+
+def test_fast_pickle_test__picklable():
+ utils.fast_pickle_test(1)
+ utils.fast_pickle_test({})
+ utils.fast_pickle_test(test_cumsum__empty)
+
+
+@nose.tools.raises(pickle.PicklingError)
+def test_fast_pickle_test__unpicklable_1():
+ _func = lambda: None # pragma: no coverage
+ utils.fast_pickle_test(_func)
+
+
+@nose.tools.raises(pickle.PicklingError)
+def test_fast_pickle_test__unpicklable_2():
+ def _func():
+ return None # pragma: no coverage
+ utils.fast_pickle_test(_func)
+
+
+###############################################################################
+###############################################################################
+# fill_dict
+
+def test_fill_dict__empty_dicts():
+ result = utils.fill_dict({}, {})
+ assert_equal(result, {})
+
+
+def test_fill_dict__filling_empty_dict():
+ source = {"a": 1, "b": {"c": 2, "d": 3}}
+ expected = {"a": 1, "b": {"c": 2, "d": 3}}
+ result = utils.fill_dict({}, source)
+ assert_equal(result, expected)
+
+
+def test_fill_dict__filling_full_dict():
+ source = {"a": 1, "b": {"c": 2, "d": 3}}
+ destination = {"a": 2, "b": {"c": 3, "d": 4}}
+ expected = {"a": 2, "b": {"c": 3, "d": 4}}
+ result = utils.fill_dict(destination, source)
+ assert_equal(result, expected)
+
+
+def test_fill_dict__destination_not_modified():
+ source = {"a": 1, "b": {"c": 2, "d": 3}}
+ destination = {"b": {"d": 0}}
+ utils.fill_dict(destination, source)
+ assert_equal(destination, {"b": {"d": 0}})
+
+
+def test_fill_dict__source_not_modified():
+ expected = {"a": 1, "b": {"c": 2, "d": 3}}
+ source = {"a": 1, "b": {"c": 2, "d": 3}}
+ destination = {"b": {"d": 0}}
+ utils.fill_dict(destination, source)
+ assert_equal(source, expected)
+
+
+def test_fill_dict__destination_must_be_dict():
+ assert_raises(TypeError, utils.fill_dict, [], {})
+
+
+def test_fill_dict__source_must_be_dict():
+ assert_raises(TypeError, utils.fill_dict, {}, [])
+
+
+###############################################################################
+###############################################################################
+# chain_sorted
+
+def test_chain_sorted__no_sequences():
+ expected = ()
+ result = tuple(utils.chain_sorted())
+ assert_equal(expected, result)
+
+
+def test_chain_sorted__single_sequence():
+ sequence = (1, 2, 3)
+ result = tuple(utils.chain_sorted(sequence))
+ assert_equal(sequence, result)
+
+
+def test_chain_sorted__sequential_contents():
+ def _sequential_contents(seq_a, seq_b):
+ expected = (1, 2, 3, 4, 5, 6)
+ result = tuple(utils.chain_sorted(seq_a, seq_b))
+ assert_equal(expected, result)
+
+ sequence_a = (1, 2, 3)
+ sequence_b = (4, 5, 6)
+ yield _sequential_contents, sequence_a, sequence_b
+ yield _sequential_contents, sequence_b, sequence_a
+
+
+def test_chain_sorted__mixed_contents():
+ sequence_a = (3, 4, 8)
+ sequence_c = (0, 1, 6)
+ sequence_b = (2, 5, 7)
+ expected = (0, 1, 2, 3, 4, 5, 6, 7, 8)
+ result = tuple(utils.chain_sorted(sequence_a, sequence_b, sequence_c))
+ assert_equal(expected, result)
+
+
+def test_chain_sorted__mixed_length_contents():
+ sequence_a = (1,)
+ sequence_c = (0, 2)
+ sequence_b = ()
+ expected = (0, 1, 2)
+ result = tuple(utils.chain_sorted(sequence_a, sequence_b, sequence_c))
+ assert_equal(expected, result)
+
+
+def test_chain_sorted__mixed_contents__key():
+ sequence_a = (-2, -3, -5)
+ sequence_b = (0, -1, -4)
+ expected = (0, -1, -2, -3, -4, -5)
+ result = tuple(utils.chain_sorted(sequence_a, sequence_b, key=abs))
+ assert_equal(expected, result)
+
+
+def test_chain_sorted__identical_objects_are_preserved():
+ object_a = [1]
+ object_b = [1]
+ assert object_a is not object_b
+ expected = (object_a, object_b)
+ result = tuple(utils.chain_sorted([object_a], [object_b]))
+ assert_equal(expected, result)
+ assert(object_a is result[0] or object_a is result[1])
+ assert(object_b is result[0] or object_b is result[1])
+
+
+def test_chain_sorted__stable_sort():
+ object_a = [1]
+ object_b = [1]
+ object_c = [2]
+ object_d = [2]
+ seq_a = [object_a, object_c]
+ seq_b = [object_b, object_d]
+
+ expected = (object_a, object_b, object_c, object_d)
+ result = tuple(utils.chain_sorted(seq_a, seq_b))
+ assert_equal(expected, result)
+ assert(all(a is b for (a, b) in zip(expected, result)))
+
+ expected = (object_b, object_a, object_d, object_c)
+ result = tuple(utils.chain_sorted(seq_b, seq_a))
+ assert_equal(expected, result)
+ assert(all(a is b for (a, b) in zip(expected, result)))
+
+
+def test_chain_sorted__runs_of_values():
+ object_a = [1]
+ object_b = [1]
+ object_c = [2]
+ object_d = [2]
+ seq_a = [object_a, object_b]
+ seq_b = [object_c, object_d]
+
+ expected = (object_a, object_b, object_c, object_d)
+ result = tuple(utils.chain_sorted(seq_a, seq_b))
+ assert_equal(expected, result)
+ assert(all(a is b for (a, b) in zip(expected, result)))
+
+
+def test_chain_sorted__invalid_keywords():
+ assert_raises(TypeError, tuple, utils.chain_sorted((1, 2, 3), foobar=None))
+
+
+###############################################################################
+###############################################################################
+# Immutable
+
+def test_immutable__properties_set():
+ class ImmutableCls(utils.Immutable):
+ def __init__(self, value):
+ utils.Immutable.__init__(self, value=value)
+
+ obj = ImmutableCls(17)
+ assert_equal(obj.value, 17)
+
+
+def test_immutable__properties_immutable():
+ def _test_immutable_property(key, value):
+ class ImmutableCls(utils.Immutable):
+ def __init__(self, value):
+ utils.Immutable.__init__(self, value=value)
+
+ obj = ImmutableCls(17)
+ assert_raises(NotImplementedError, setattr, obj, key, value)
+
+ yield _test_immutable_property, "value", 13
+ yield _test_immutable_property, "new_value", "foo"
+
+
+def test_immutable__properties_del():
+ class ImmutableCls(utils.Immutable):
+ def __init__(self, value):
+ utils.Immutable.__init__(self, value=value)
+
+ def _del_property(obj):
+ del obj.value
+
+ obj = ImmutableCls(17)
+ assert_raises(NotImplementedError, _del_property, obj)
+
+
+###############################################################################
+###############################################################################
+# TotallyOrdered
+
+class SomethingOrdered(utils.TotallyOrdered):
+ def __init__(self, value):
+ self.value = value
+
+ def __lt__(self, other):
+ if not isinstance(other, SomethingOrdered):
+ return NotImplemented
+
+ return self.value < other.value
+
+
+# Less than
+def test_totally_ordered__lt_vs_lt():
+ assert SomethingOrdered(1) < SomethingOrdered(2)
+
+
+def test_totally_ordered__lt_vs_gt():
+ assert not (SomethingOrdered(1) < SomethingOrdered(0))
+
+
+def test_totally_ordered__lt_vs_wrong_type():
+ assert_equal(SomethingOrdered(1).__lt__("Foo"), NotImplemented)
+
+
+# Less than or equal
+def test_totally_ordered__le_vs_le():
+ assert SomethingOrdered(1) <= SomethingOrdered(1)
+
+
+def test_totally_ordered__le_vs_gt():
+ assert not (SomethingOrdered(1) <= SomethingOrdered(0))
+
+
+def test_totally_ordered__le_vs_wrong_type():
+ assert_equal(SomethingOrdered(1).__le__("Foo"), NotImplemented)
+
+
+# Greater than or equal
+def test_totally_ordered__ge_vs_ge():
+ assert SomethingOrdered(1) >= SomethingOrdered(1)
+
+
+def test_totally_ordered__ge_vs_lt():
+ assert not (SomethingOrdered(0) >= SomethingOrdered(1))
+
+
+def test_totally_ordered__ge_vs_wrong_type():
+ assert_equal(SomethingOrdered(1).__ge__("Foo"), NotImplemented)
+
+
+# Greater than
+def test_totally_ordered__gt_vs_gt():
+ assert SomethingOrdered(1) > SomethingOrdered(0)
+
+
+def test_totally_ordered__gt_vs_eq():
+ assert not (SomethingOrdered(0) > SomethingOrdered(0))
+
+
+def test_totally_ordered__gt_vs_wrong_type():
+ assert_equal(SomethingOrdered(1).__gt__("Foo"), NotImplemented)
+
+
+# Equal to
+def test_totally_ordered__eq_vs_eq():
+ assert SomethingOrdered(1) == SomethingOrdered(1)
+
+
+def test_totally_ordered__eq_vs_ne():
+ assert not (SomethingOrdered(1) == SomethingOrdered(2))
+
+
+def test_totally_ordered__eq_vs_wrong_type():
+ assert_equal(SomethingOrdered(1).__eq__("Foo"), NotImplemented)
+
+
+# Not equal to
+def test_totally_ordered__ne_vs_ne():
+ assert SomethingOrdered(1) != SomethingOrdered(2)
+
+
+def test_totally_ordered__ne_vs_eq():
+ assert not (SomethingOrdered(1) != SomethingOrdered(1))
+
+
+def test_totally_ordered__ne_vs_wrong_type():
+ assert_equal(SomethingOrdered(1).__ne__("Foo"), NotImplemented)
+
+
+class SomethingBadlyOrdered(utils.TotallyOrdered):
+ def __init__(self, value):
+ self.value = value
+
+
+def test_totally_ordered__missing_implementation():
+ obj_a = SomethingBadlyOrdered(1)
+ obj_b = SomethingBadlyOrdered(2)
+ assert_raises(NotImplementedError, operator.gt, obj_a, obj_b)
diff --git a/tests/common_tests/versions_tests.py b/tests/common_tests/versions_tests.py
new file mode 100644
index 0000000..8275764
--- /dev/null
+++ b/tests/common_tests/versions_tests.py
@@ -0,0 +1,724 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Disable warnings on strange function names
+# pylint: disable=C0103
+import pickle
+import operator
+
+from nose.tools import \
+ assert_is, \
+ assert_is_not, \
+ assert_in, \
+ assert_equal, \
+ assert_not_equal, \
+ assert_raises
+
+import paleomix.common.versions as versions
+
+
+###############################################################################
+###############################################################################
+# Check class
+
+def test_check__func_must_be_callable():
+ assert_raises(TypeError, versions.Check, "FooBar", 3, 7, 5)
+
+
+def test_check_str():
+ obj = versions.Check("FooBar", operator.lt, 3, 7, 5)
+ assert_equal(str(obj), "FooBar")
+
+
+###############################################################################
+###############################################################################
+## Check class -- hash and comparisons
+
+def test_check__eq_same_func_desc_and_version():
+ obj_1 = versions.Check("Desc {}", operator.lt, 1, 2, 3)
+ obj_2 = versions.Check("Desc {}", operator.lt, 1, 2, 3)
+ assert_equal(hash(obj_1), hash(obj_2))
+ assert_equal(obj_1, obj_2)
+
+
+def test_check__not_eq_for_diff_func_same_desc_and_version():
+ obj_1 = versions.Check("Desc {}", operator.gt, 1, 2, 3)
+ obj_2 = versions.Check("Desc {}", operator.lt, 1, 2, 3)
+ assert_not_equal(hash(obj_1), hash(obj_2))
+ assert_not_equal(obj_1, obj_2)
+
+
+def test_check__not_eq_for_diff_desc_same_func_and_version():
+ obj_1 = versions.Check("Desc1 {}", operator.lt, 1, 2, 3)
+ obj_2 = versions.Check("Desc2 {}", operator.lt, 1, 2, 3)
+ assert_not_equal(hash(obj_1), hash(obj_2))
+ assert_not_equal(obj_1, obj_2)
+
+
+def test_check__not_eq_for_same_func_desc_diff_version():
+ obj_1 = versions.Check("Desc {}", operator.lt, 1, 2, 3)
+ obj_2 = versions.Check("Desc {}", operator.lt, 1, 3, 3)
+ assert_not_equal(hash(obj_1), hash(obj_2))
+ assert_not_equal(obj_1, obj_2)
+
+
+###############################################################################
+###############################################################################
+## EQ class
+
+def test_eq__str__one_value():
+ obj = versions.EQ(1)
+ assert_equal(str(obj), "v1.x")
+
+
+def test_eq__str__two_values():
+ obj = versions.EQ(2, 1)
+ assert_equal(str(obj), "v2.1.x")
+
+
+def test_eq__check_values__equal():
+ obj = versions.EQ(2, 3)
+ assert obj((2, 3))
+
+
+def test_eq__check_values__not_equal():
+ obj = versions.EQ(2, 3)
+ assert not obj((1, 3))
+ assert not obj((2, 2))
+ assert not obj((1, 4))
+
+
+def test_eq__check_values__equal_truncated():
+ obj = versions.EQ(2, 3)
+ assert obj((2, 3, 1))
+
+
+def test_eq__check_values__equal_too_few_values():
+ obj = versions.EQ(2, 3)
+ assert_raises(ValueError, obj, (2,))
+
+
+def test_eq__check_values__not_equal_too_few_values():
+ obj = versions.EQ(2, 3)
+ assert_raises(ValueError, obj, (1,))
+
+
+###############################################################################
+###############################################################################
+## GE class
+
+def test_ge__str__one_value():
+ obj = versions.GE(1)
+ assert_equal(str(obj), "at least v1.x")
+
+
+def test_ge__str__two_values():
+ obj = versions.GE(2, 1)
+ assert_equal(str(obj), "at least v2.1.x")
+
+
+def test_ge__check_values__greater_than_or_equal():
+ obj = versions.GE(2, 3)
+ assert obj((2, 3))
+ assert obj((2, 4))
+ assert obj((3, 0))
+
+
+def test_ge__check_values__not_greater_than_or_equal():
+ obj = versions.GE(2, 3)
+ assert not obj((1, 3))
+ assert not obj((2, 2))
+
+
+def test_ge__check_values__greater_than_or_equal_truncated():
+ obj = versions.GE(2, 3)
+ assert obj((2, 3, 1))
+ assert obj((2, 4, 2))
+
+
+def test_ge__check_values__equal_too_few_values():
+ obj = versions.GE(2, 3)
+ assert_raises(ValueError, obj, (2,))
+
+
+def test_ge__check_values__not_equal_too_few_values():
+ obj = versions.GE(2, 3)
+ assert_raises(ValueError, obj, (1,))
+
+
+###############################################################################
+###############################################################################
+## LT class
+
+def test_lt__str__one_value():
+ obj = versions.LT(1)
+ assert_equal(str(obj), "prior to v1.x")
+
+
+def test_lt__str__two_values():
+ obj = versions.LT(2, 1)
+ assert_equal(str(obj), "prior to v2.1.x")
+
+
+def test_lt__check_values__less_than():
+ obj = versions.LT(2, 3)
+ assert obj((2, 2))
+ assert obj((1, 9))
+
+
+def test_lt__check_values__not_less_than():
+ obj = versions.LT(2, 3)
+ assert not obj((2, 3))
+ assert not obj((2, 4))
+
+
+def test_lt__check_values__less_than_truncated():
+ obj = versions.LT(2, 3)
+ assert obj((2, 2, 1))
+ assert obj((2, 1, 2))
+
+
+def test_lt__check_values__less_than_too_few_values():
+ obj = versions.LT(2, 3)
+ assert_raises(ValueError, obj, (1,))
+
+
+def test_lt__check_values__not_less_than_too_few_values():
+ obj = versions.LT(2, 3)
+ assert_raises(ValueError, obj, (3,))
+
+
+###############################################################################
+###############################################################################
+## Any class
+
+def test_any__str():
+ obj = versions.Any()
+ assert_equal(str(obj), "any version")
+
+
+def test_lt__check_values__always_true():
+ obj = versions.Any()
+ assert obj((1,))
+ assert obj((2, 3))
+ assert obj((4, 5, 6))
+ assert obj((5, 6, 7, 8))
+
+
+###############################################################################
+###############################################################################
+## And class
+
+def test_and__init__non_check_value():
+ assert_raises(ValueError, versions.And, versions.LT(2), None)
+
+
+###############################################################################
+###############################################################################
+## And class -- str
+
+def test_and__str__single_item():
+ obj = versions.And(versions.GE(1))
+ assert_equal(str(obj), "at least v1.x")
+
+
+def test_and__str__two_items():
+ obj_ge = versions.GE(1, 2)
+ obj_lt = versions.LT(3, 4)
+ obj = versions.And(obj_ge, obj_lt)
+
+ assert_equal(str(obj), "at least v1.2.x and prior to v3.4.x")
+
+
+def test_and__str__two_items__first_is_operator():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.LT(3, 4)
+ obj = versions.And(obj_1, obj_2)
+
+ assert_equal(str(obj),
+ "(at least v1.2.x and prior to v2.0.x) and prior to v3.4.x")
+
+
+def test_and__str__two_items__second_is_operator():
+ obj_1 = versions.GE(1, 2)
+ obj_2 = versions.Or(versions.GE(2, 0), versions.LT(3, 4))
+ obj = versions.And(obj_1, obj_2)
+
+ assert_equal(str(obj),
+ "at least v1.2.x and (at least v2.0.x or prior to v3.4.x)")
+
+
+###############################################################################
+###############################################################################
+## And class -- check_version
+
+def test_and__check_version__both_true():
+ obj_1 = versions.GE(1, 2)
+ obj_2 = versions.LT(2, 0)
+ obj = versions.And(obj_1, obj_2)
+ assert obj((1, 3))
+
+
+def test_and__check_version__first_true():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.And(versions.GE(2, 3), versions.LT(3, 0))
+ obj = versions.And(obj_1, obj_2)
+ assert not obj((1, 3))
+
+
+def test_and__check_version__second_true():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.And(versions.GE(2, 3), versions.LT(3, 0))
+ obj = versions.And(obj_1, obj_2)
+ assert not obj((2, 3))
+
+
+def test_and__check_version__neither_true():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.And(versions.GE(2, 3), versions.LT(3, 0))
+ obj = versions.And(obj_1, obj_2)
+ assert not obj((2, 2))
+
+
+def test_and__check_version__truncated():
+ def _do_and_check_truncated(obj_1, obj_2):
+ obj = versions.And(obj_1, obj_2)
+ assert obj((1, 3, 3))
+
+ yield _do_and_check_truncated, versions.GE(1, 2), versions.LT(2, 0)
+ yield _do_and_check_truncated, versions.GE(1, 2, 2), versions.LT(2, 0)
+ yield _do_and_check_truncated, versions.GE(1, 2), versions.LT(2, 0, 1)
+
+
+def test_and__check_version__insufficient_number_of_values():
+ def _do_and_check_num_values(obj_1, obj_2):
+ obj = versions.And(obj_1, obj_2)
+ assert_raises(ValueError, obj, (1, 3))
+
+ yield _do_and_check_num_values, versions.GE(1, 2, 2), versions.LT(2, 0)
+ yield _do_and_check_num_values, versions.GE(1, 2), versions.LT(2, 0, 1)
+ yield _do_and_check_num_values, versions.GE(1, 2, 2), versions.LT(2, 0, 1)
+
+
+###############################################################################
+###############################################################################
+## Or class
+
+def test_or__init__non_check_value():
+ assert_raises(ValueError, versions.Or, versions.LT(2), None)
+
+
+###############################################################################
+###############################################################################
+## Or class -- str
+
+def test_or__str__single_item():
+ obj = versions.Or(versions.GE(1))
+ assert_equal(str(obj), "at least v1.x")
+
+
+def test_or__str__two_items():
+ obj_ge = versions.GE(1, 2)
+ obj_lt = versions.LT(3, 4)
+ obj = versions.Or(obj_ge, obj_lt)
+
+ assert_equal(str(obj), "at least v1.2.x or prior to v3.4.x")
+
+
+def test_or__str__two_items__first_is_operator():
+ obj_1 = versions.Or(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.LT(3, 4)
+ obj = versions.Or(obj_1, obj_2)
+
+ assert_equal(str(obj),
+ "(at least v1.2.x or prior to v2.0.x) or prior to v3.4.x")
+
+
+def test_or__str__two_items__second_is_operator():
+ obj_1 = versions.GE(1, 2)
+ obj_2 = versions.And(versions.GE(2, 0), versions.LT(3, 4))
+ obj = versions.Or(obj_1, obj_2)
+
+ assert_equal(str(obj),
+ "at least v1.2.x or (at least v2.0.x and prior to v3.4.x)")
+
+
+###############################################################################
+###############################################################################
+## Or class -- check_version
+
+def test_or__check_version__both_true():
+ obj_1 = versions.GE(1, 2)
+ obj_2 = versions.LT(2, 0)
+ obj = versions.Or(obj_1, obj_2)
+ assert obj((1, 3))
+
+
+def test_or__check_version__first_true():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.And(versions.GE(2, 3), versions.LT(3, 0))
+ obj = versions.Or(obj_1, obj_2)
+ assert obj((1, 3))
+
+
+def test_or__check_version__second_true():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.And(versions.GE(2, 3), versions.LT(3, 0))
+ obj = versions.Or(obj_1, obj_2)
+ assert obj((2, 3))
+
+
+def test_or__check_version__neither_true():
+ obj_1 = versions.And(versions.GE(1, 2), versions.LT(2, 0))
+ obj_2 = versions.And(versions.GE(2, 3), versions.LT(3, 0))
+ obj = versions.Or(obj_1, obj_2)
+ assert not obj((2, 2))
+
+
+def test_or__check_version__truncated():
+ def _do_or_check_truncated(obj_1, obj_2):
+ obj = versions.Or(obj_1, obj_2)
+ assert obj((1, 3, 3))
+
+ yield _do_or_check_truncated, versions.GE(1, 2), versions.LT(2, 0)
+ yield _do_or_check_truncated, versions.GE(1, 2, 2), versions.LT(2, 0)
+ yield _do_or_check_truncated, versions.GE(1, 2), versions.LT(2, 0, 1)
+
+
+def test_or__check_version__insufficient_number_of_values():
+ def _do_or_check_num_values(obj_1, obj_2):
+ obj = versions.Or(obj_1, obj_2)
+ assert_raises(ValueError, obj, (1, 3))
+
+ yield _do_or_check_num_values, versions.GE(1, 2, 2), versions.LT(2, 0)
+ yield _do_or_check_num_values, versions.GE(1, 2, 2), versions.LT(2, 0, 1)
+
+
+def test_or__check_version__insufficient_number_of_values__is_lazy():
+ obj_1 = versions.GE(1, 2)
+ obj_2 = versions.LT(2, 0, 1)
+ obj = versions.Or(obj_1, obj_2)
+ assert obj((1, 3))
+
+
+###############################################################################
+###############################################################################
+## RequirementObj -- constructor
+
+def test_requirementobj__init__defaults():
+ obj = versions.RequirementObj(call=("echo", "foo"),
+ search=r"(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ assert_equal(obj.name, "echo")
+ assert_equal(obj.priority, 0)
+
+
+def test_requirementobj__init__non_defaults():
+ obj = versions.RequirementObj(call=("bash", "foo"),
+ search=r"(\d+)\.(\d+)",
+ checks=versions.Any(),
+ name="A name",
+ priority=17)
+
+ assert_equal(obj.name, "A name")
+ assert_equal(obj.priority, 17)
+
+
+###############################################################################
+###############################################################################
+## RequirementObj -- version
+
+def _echo_version(version, to="stdout", returncode=0):
+ tmpl = "import sys; sys.%s.write(%r); sys.exit(%s);"
+ return ("/usr/bin/python", "-c", tmpl % (to, version, returncode))
+_PIPES = ("stderr", "stdout")
+
+
+def test_requirementobj__version__call():
+ def _do_test_version__single_digit(pipe, regexp, equals):
+ call = _echo_version("v3.5.2\n", to=pipe)
+ obj = versions.RequirementObj(call=call,
+ search=regexp,
+ checks=versions.Any())
+ assert_equal(obj.version, equals)
+
+ for pipe in _PIPES:
+ yield _do_test_version__single_digit, pipe, r"v(\d+)", (3,)
+ yield _do_test_version__single_digit, pipe, r"v(\d+)\.(\d+)", (3, 5)
+ yield _do_test_version__single_digit, pipe, r"v(\d+)\.(\d+)\.(\d+)", \
+ (3, 5, 2)
+
+
+def test_requirementobj__version__version_str_not_found():
+ call = _echo_version("A typical error\n")
+ obj = versions.RequirementObj(call=call,
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ assert_raises(versions.VersionRequirementError, getattr, obj, "version")
+
+
+def test_requirementobj__version__command_not_found():
+ obj = versions.RequirementObj(call=("xyzabcdefoo",),
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ try:
+ obj.version # pylint: disable=
+ assert False # pragma: no coverage
+ except versions.VersionRequirementError, error:
+ # Should include OSError message
+ assert_in("No such file or directory", str(error))
+
+
+def test_requirementobj__version__return_code_is_ignored():
+ obj = versions.RequirementObj(_echo_version("v1.2.3", returncode=1),
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+ assert_equal(obj.version, (1, 2))
+
+
+def test_requirementobj__version__func_call():
+ def _return_version():
+ return "This is v5.3!"
+
+ obj = versions.RequirementObj(call=_return_version,
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+ assert_equal(obj.version, (5, 3))
+
+
+def test_requirementobj__version__func_call_with_arguments():
+ def _return_version(arg1, arg2):
+ assert_equal((arg1, arg2), (2, "foo"))
+ return "This is v5.3!"
+
+ obj = versions.RequirementObj(call=(_return_version, 2, "foo"),
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+ assert_equal(obj.version, (5, 3))
+
+
+def test_requirementobj__version__outdated_jre__with_or_without_version_str():
+ error_msg = "upgrade your version of Java"
+
+ def _do_test_outdated_jre(message):
+ obj = versions.RequirementObj(call=lambda: message,
+ search=r"v(\d+)\.(\d+)",
+ checks=versions.Any())
+
+ try:
+ obj.version
+ assert False # pragma: no coverage
+ except versions.VersionRequirementError, error:
+ assert_in(error_msg, str(error))
+
+ messages = [
+ "UnsupportedClassVersionError",
+ "UnsupportedClassVersionError v1.2.3"]
+
+ for message in messages:
+ yield _do_test_outdated_jre, message
+
+
+###############################################################################
+###############################################################################
+## RequirementObj -- __call__
+
+class CheckCounted(versions.Check):
+ def __init__(self, return_value=True, expected=(1, 1)):
+ versions.Check.__init__(self, "counted {}", operator.eq, *expected)
+ object.__setattr__(self, "count", 0)
+ object.__setattr__(self, "return_value", return_value)
+
+ def _do_check_version(self, values, current):
+ assert_equal(values, current)
+ object.__setattr__(self, "count", self.count + 1)
+ return self.return_value
+
+
+def test_requirementobj__call__result_is_cached():
+ counter = CheckCounted()
+ obj = versions.RequirementObj(call=lambda: "v1.1.3",
+ search=r"(\d)\.(\d)",
+ checks=counter)
+
+ obj()
+ obj()
+
+ assert_equal(counter.count, 1)
+
+
+def test_requirementobj__call__result_is_cached_unless_forced():
+ counter = CheckCounted()
+ obj = versions.RequirementObj(call=lambda: "v1.1.3",
+ search=r"(\d)\.(\d)",
+ checks=counter)
+
+ obj()
+ obj(force=True)
+
+ assert_equal(counter.count, 2)
+
+
+def test_requirementobj__call__check_fails__function():
+ expected = \
+ "Version requirements not met for test#1; please refer\n" \
+ "to the PALEOMIX documentation for more information.\n" \
+ "\n" \
+ " Version: v1.0.x\n" \
+ " Required: at least v1.1.x"
+
+ obj = versions.RequirementObj(call=lambda: "v1.0.3",
+ search=r"(\d)\.(\d)",
+ checks=versions.GE(1, 1),
+ name="test#1")
+ try:
+ obj()
+ assert False # pragma: no coverage
+ except versions.VersionRequirementError, error:
+ assert_equal(str(error), expected)
+
+
+def test_requirementobj__call__check_fails():
+ expected = \
+ "Version requirements not met for test#1; please refer\n" \
+ "to the PALEOMIX documentation for more information.\n" \
+ "\n" \
+ " Executable: /usr/bin/python\n" \
+ " Call: /usr/bin/python -c import sys; " \
+ "sys.stdout.write('v1.0.2'); sys.exit(0);\n" \
+ " Version: v1.0.x\n" \
+ " Required: at least v1.1.x"
+
+ obj = versions.RequirementObj(call=_echo_version("v1.0.2"),
+ search=r"(\d)\.(\d)",
+ checks=versions.GE(1, 1),
+ name="test#1")
+ try:
+ obj()
+ assert False # pragma: no coverage
+ except versions.VersionRequirementError, error:
+ assert_equal(str(error), expected)
+
+
+def test_requirementobj__call__check_fails__jre_outdated():
+ expected = \
+ "Version could not be determined for test#1:\n" \
+ "\n" \
+ " Executable: /usr/bin/python\n" \
+ " Call: /usr/bin/python -c import sys; " \
+ "sys.stdout.write('UnsupportedClassVersionError'); sys.exit(0);\n" \
+ "\n" \
+ "The version of the Java Runtime Environment on this\n" \
+ "system is too old; please check the the requirement\n" \
+ "for the program and upgrade your version of Java.\n" \
+ "\n" \
+ "See the documentation for more information."
+
+ value = "UnsupportedClassVersionError"
+ obj = versions.RequirementObj(call=_echo_version(value),
+ search=r"(\d)\.(\d)",
+ checks=versions.GE(1, 1),
+ name="test#1")
+ try:
+ obj()
+ assert False # pragma: no coverage
+ except versions.VersionRequirementError, error:
+ assert_equal(str(error), expected)
+
+
+###############################################################################
+###############################################################################
+## Pickling of checks
+
+def test_check__can_pickle():
+ def _do_test_can_pickle(obj):
+ pickle.dumps(obj)
+
+ yield _do_test_can_pickle, versions.EQ(1, 2, 3)
+ yield _do_test_can_pickle, versions.GE(1, 2, 3)
+ yield _do_test_can_pickle, versions.LT(1, 2, 3)
+ yield _do_test_can_pickle, versions.Any()
+ yield _do_test_can_pickle, versions.And(versions.EQ(1, 2, 3))
+ yield _do_test_can_pickle, versions.Or(versions.GE(1, 2, 3))
+
+
+###############################################################################
+###############################################################################
+## Requirement
+
+def test_requirement__obj_is_cached_for_same_values():
+ obj1 = versions.Requirement("echo", "", versions.LT(1))
+ obj2 = versions.Requirement("echo", "", versions.LT(1))
+ assert_is(obj1, obj2)
+
+
+def test_requirement__new_obj_if_call_differ():
+ obj1 = versions.Requirement("echo", "", versions.LT(1))
+ obj2 = versions.Requirement("true", "", versions.LT(1))
+ assert_is_not(obj1, obj2)
+
+
+def test_requirement__new_obj_if_search_differ():
+ obj1 = versions.Requirement("echo", r"(\d+)", versions.LT(1))
+ obj2 = versions.Requirement("echo", "", versions.LT(1))
+ assert_is_not(obj1, obj2)
+
+
+def test_requirement__new_obj_if_checks_differ():
+ obj1 = versions.Requirement("echo", "", versions.GE(1))
+ obj2 = versions.Requirement("echo", "", versions.LT(1))
+ assert_is_not(obj1, obj2)
+
+
+def test_requirement__same_obj_if_name_differ():
+ obj1 = versions.Requirement("echo", "", versions.GE(1))
+ assert_equal(obj1.name, "echo")
+ obj2 = versions.Requirement("echo", "", versions.GE(1), name="foo")
+ assert_equal(obj2.name, "foo")
+ assert_is(obj1, obj2)
+
+ obj3 = versions.Requirement("echo", "", versions.GE(1), name="bar")
+ assert_equal(obj3.name, "bar")
+ assert_is(obj2, obj3)
+
+ obj4 = versions.Requirement("echo", "", versions.GE(1))
+ assert_equal(obj3.name, "bar")
+ assert_is(obj3, obj4)
+
+
+def test_requirement_highest_priority_overrides():
+ obj1 = versions.Requirement("echo", "", versions.LT(1), priority=0)
+ assert_equal(obj1.priority, 0)
+ obj2 = versions.Requirement("echo", "", versions.LT(1), priority=5)
+ assert_is(obj1, obj2)
+ assert_equal(obj2.priority, 5)
+
+
+def test_requirement_highest_priority_retained():
+ obj1 = versions.Requirement("echo", "", versions.LT(1), priority=5)
+ assert_equal(obj1.priority, 5)
+ obj2 = versions.Requirement("echo", "", versions.LT(1), priority=0)
+ assert_is(obj1, obj2)
+ assert_equal(obj2.priority, 5)
diff --git a/tests/data/alignments/library_1.bam b/tests/data/alignments/library_1.bam
new file mode 100644
index 0000000..38b98c6
Binary files /dev/null and b/tests/data/alignments/library_1.bam differ
diff --git a/tests/data/alignments/library_2.bam b/tests/data/alignments/library_2.bam
new file mode 100644
index 0000000..699ad77
Binary files /dev/null and b/tests/data/alignments/library_2.bam differ
diff --git a/tests/data/empty_file_1 b/tests/data/empty_file_1
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/empty_file_2 b/tests/data/empty_file_2
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/empty_file_3 b/tests/data/empty_file_3
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/fasta_file.fasta b/tests/data/fasta_file.fasta
new file mode 100644
index 0000000..802a134
--- /dev/null
+++ b/tests/data/fasta_file.fasta
@@ -0,0 +1,4 @@
+>This_is_FASTA!
+ACGTN
+>This_is_ALSO_FASTA!
+CGTNA
diff --git a/tests/data/fasta_file.fasta.bz2 b/tests/data/fasta_file.fasta.bz2
new file mode 100644
index 0000000..bdc1d37
Binary files /dev/null and b/tests/data/fasta_file.fasta.bz2 differ
diff --git a/tests/data/fasta_file.fasta.gz b/tests/data/fasta_file.fasta.gz
new file mode 100644
index 0000000..1b95419
Binary files /dev/null and b/tests/data/fasta_file.fasta.gz differ
diff --git a/tests/data/non_empty_file_1 b/tests/data/non_empty_file_1
new file mode 100644
index 0000000..d00491f
--- /dev/null
+++ b/tests/data/non_empty_file_1
@@ -0,0 +1 @@
+1
diff --git a/tests/data/non_empty_file_2 b/tests/data/non_empty_file_2
new file mode 100644
index 0000000..0cfbf08
--- /dev/null
+++ b/tests/data/non_empty_file_2
@@ -0,0 +1 @@
+2
diff --git a/tests/data/non_empty_file_3 b/tests/data/non_empty_file_3
new file mode 100644
index 0000000..00750ed
--- /dev/null
+++ b/tests/data/non_empty_file_3
@@ -0,0 +1 @@
+3
diff --git a/tests/data/rCRS.fasta b/tests/data/rCRS.fasta
new file mode 100644
index 0000000..d7209f1
--- /dev/null
+++ b/tests/data/rCRS.fasta
@@ -0,0 +1,239 @@
+>gi|251831106|ref|NC_012920.1| Homo sapiens mitochondrion, complete genome
+GATCACAGGTCTATCACCCTATTAACCACTCACGGGAGCTCTCCATGCATTTGGTATTTTCGTCTGGGGG
+GTATGCACGCGATAGCATTGCGAGACGCTGGAGCCGGAGCACCCTATGTCGCAGTATCTGTCTTTGATTC
+CTGCCTCATCCTATTATTTATCGCACCTACGTTCAATATTACAGGCGAACATACTTACTAAAGTGTGTTA
+ATTAATTAATGCTTGTAGGACATAATAATAACAATTGAATGTCTGCACAGCCACTTTCCACACAGACATC
+ATAACAAAAAATTTCCACCAAACCCCCCCTCCCCCGCTTCTGGCCACAGCACTTAAACACATCTCTGCCA
+AACCCCAAAAACAAAGAACCCTAACACCAGCCTAACCAGATTTCAAATTTTATCTTTTGGCGGTATGCAC
+TTTTAACAGTCACCCCCCAACTAACACATTATTTTCCCCTCCCACTCCCATACTACTAATCTCATCAATA
+CAACCCCCGCCCATCCTACCCAGCACACACACACCGCTGCTAACCCCATACCCCGAACCAACCAAACCCC
+AAAGACACCCCCCACAGTTTATGTAGCTTACCTCCTCAAAGCAATACACTGAAAATGTTTAGACGGGCTC
+ACATCACCCCATAAACAAATAGGTTTGGTCCTAGCCTTTCTATTAGCTCTTAGTAAGATTACACATGCAA
+GCATCCCCGTTCCAGTGAGTTCACCCTCTAAATCACCACGATCAAAAGGAACAAGCATCAAGCACGCAGC
+AATGCAGCTCAAAACGCTTAGCCTAGCCACACCCCCACGGGAAACAGCAGTGATTAACCTTTAGCAATAA
+ACGAAAGTTTAACTAAGCTATACTAACCCCAGGGTTGGTCAATTTCGTGCCAGCCACCGCGGTCACACGA
+TTAACCCAAGTCAATAGAAGCCGGCGTAAAGAGTGTTTTAGATCACCCCCTCCCCAATAAAGCTAAAACT
+CACCTGAGTTGTAAAAAACTCCAGTTGACACAAAATAGACTACGAAAGTGGCTTTAACATATCTGAACAC
+ACAATAGCTAAGACCCAAACTGGGATTAGATACCCCACTATGCTTAGCCCTAAACCTCAACAGTTAAATC
+AACAAAACTGCTCGCCAGAACACTACGAGCCACAGCTTAAAACTCAAAGGACCTGGCGGTGCTTCATATC
+CCTCTAGAGGAGCCTGTTCTGTAATCGATAAACCCCGATCAACCTCACCACCTCTTGCTCAGCCTATATA
+CCGCCATCTTCAGCAAACCCTGATGAAGGCTACAAAGTAAGCGCAAGTACCCACGTAAAGACGTTAGGTC
+AAGGTGTAGCCCATGAGGTGGCAAGAAATGGGCTACATTTTCTACCCCAGAAAACTACGATAGCCCTTAT
+GAAACTTAAGGGTCGAAGGTGGATTTAGCAGTAAACTAAGAGTAGAGTGCTTAGTTGAACAGGGCCCTGA
+AGCGCGTACACACCGCCCGTCACCCTCCTCAAGTATACTTCAAAGGACATTTAACTAAAACCCCTACGCA
+TTTATATAGAGGAGACAAGTCGTAACATGGTAAGTGTACTGGAAAGTGCACTTGGACGAACCAGAGTGTA
+GCTTAACACAAAGCACCCAACTTACACTTAGGAGATTTCAACTTAACTTGACCGCTCTGAGCTAAACCTA
+GCCCCAAACCCACTCCACCTTACTACCAGACAACCTTAGCCAAACCATTTACCCAAATAAAGTATAGGCG
+ATAGAAATTGAAACCTGGCGCAATAGATATAGTACCGCAAGGGAAAGATGAAAAATTATAACCAAGCATA
+ATATAGCAAGGACTAACCCCTATACCTTCTGCATAATGAATTAACTAGAAATAACTTTGCAAGGAGAGCC
+AAAGCTAAGACCCCCGAAACCAGACGAGCTACCTAAGAACAGCTAAAAGAGCACACCCGTCTATGTAGCA
+AAATAGTGGGAAGATTTATAGGTAGAGGCGACAAACCTACCGAGCCTGGTGATAGCTGGTTGTCCAAGAT
+AGAATCTTAGTTCAACTTTAAATTTGCCCACAGAACCCTCTAAATCCCCTTGTAAATTTAACTGTTAGTC
+CAAAGAGGAACAGCTCTTTGGACACTAGGAAAAAACCTTGTAGAGAGAGTAAAAAATTTAACACCCATAG
+TAGGCCTAAAAGCAGCCACCAATTAAGAAAGCGTTCAAGCTCAACACCCACTACCTAAAAAATCCCAAAC
+ATATAACTGAACTCCTCACACCCAATTGGACCAATCTATCACCCTATAGAAGAACTAATGTTAGTATAAG
+TAACATGAAAACATTCTCCTCCGCATAAGCCTGCGTCAGATTAAAACACTGAACTGACAATTAACAGCCC
+AATATCTACAATCAACCAACAAGTCATTATTACCCTCACTGTCAACCCAACACAGGCATGCTCATAAGGA
+AAGGTTAAAAAAAGTAAAAGGAACTCGGCAAATCTTACCCCGCCTGTTTACCAAAAACATCACCTCTAGC
+ATCACCAGTATTAGAGGCACCGCCTGCCCAGTGACACATGTTTAACGGCCGCGGTACCCTAACCGTGCAA
+AGGTAGCATAATCACTTGTTCCTTAAATAGGGACCTGTATGAATGGCTCCACGAGGGTTCAGCTGTCTCT
+TACTTTTAACCAGTGAAATTGACCTGCCCGTGAAGAGGCGGGCATAACACAGCAAGACGAGAAGACCCTA
+TGGAGCTTTAATTTATTAATGCAAACAGTACCTAACAAACCCACAGGTCCTAAACTACCAAACCTGCATT
+AAAAATTTCGGTTGGGGCGACCTCGGAGCAGAACCCAACCTCCGAGCAGTACATGCTAAGACTTCACCAG
+TCAAAGCGAACTACTATACTCAATTGATCCAATAACTTGACCAACGGAACAAGTTACCCTAGGGATAACA
+GCGCAATCCTATTCTAGAGTCCATATCAACAATAGGGTTTACGACCTCGATGTTGGATCAGGACATCCCG
+ATGGTGCAGCCGCTATTAAAGGTTCGTTTGTTCAACGATTAAAGTCCTACGTGATCTGAGTTCAGACCGG
+AGTAATCCAGGTCGGTTTCTATCTACNTTCAAATTCCTCCCTGTACGAAAGGACAAGAGAAATAAGGCCT
+ACTTCACAAAGCGCCTTCCCCCGTAAATGATATCATCTCAACTTAGTATTATACCCACACCCACCCAAGA
+ACAGGGTTTGTTAAGATGGCAGAGCCCGGTAATCGCATAAAACTTAAAACTTTACAGTCAGAGGTTCAAT
+TCCTCTTCTTAACAACATACCCATGGCCAACCTCCTACTCCTCATTGTACCCATTCTAATCGCAATGGCA
+TTCCTAATGCTTACCGAACGAAAAATTCTAGGCTATATACAACTACGCAAAGGCCCCAACGTTGTAGGCC
+CCTACGGGCTACTACAACCCTTCGCTGACGCCATAAAACTCTTCACCAAAGAGCCCCTAAAACCCGCCAC
+ATCTACCATCACCCTCTACATCACCGCCCCGACCTTAGCTCTCACCATCGCTCTTCTACTATGAACCCCC
+CTCCCCATACCCAACCCCCTGGTCAACCTCAACCTAGGCCTCCTATTTATTCTAGCCACCTCTAGCCTAG
+CCGTTTACTCAATCCTCTGATCAGGGTGAGCATCAAACTCAAACTACGCCCTGATCGGCGCACTGCGAGC
+AGTAGCCCAAACAATCTCATATGAAGTCACCCTAGCCATCATTCTACTATCAACATTACTAATAAGTGGC
+TCCTTTAACCTCTCCACCCTTATCACAACACAAGAACACCTCTGATTACTCCTGCCATCATGACCCTTGG
+CCATAATATGATTTATCTCCACACTAGCAGAGACCAACCGAACCCCCTTCGACCTTGCCGAAGGGGAGTC
+CGAACTAGTCTCAGGCTTCAACATCGAATACGCCGCAGGCCCCTTCGCCCTATTCTTCATAGCCGAATAC
+ACAAACATTATTATAATAAACACCCTCACCACTACAATCTTCCTAGGAACAACATATGACGCACTCTCCC
+CTGAACTCTACACAACATATTTTGTCACCAAGACCCTACTTCTAACCTCCCTGTTCTTATGAATTCGAAC
+AGCATACCCCCGATTCCGCTACGACCAACTCATACACCTCCTATGAAAAAACTTCCTACCACTCACCCTA
+GCATTACTTATATGATATGTCTCCATACCCATTACAATCTCCAGCATTCCCCCTCAAACCTAAGAAATAT
+GTCTGATAAAAGAGTTACTTTGATAGAGTAAATAATAGGAGCTTAAACCCCCTTATTTCTAGGACTATGA
+GAATCGAACCCATCCCTGAGAATCCAAAATTCTCCGTGCCACCTATCACACCCCATCCTAAAGTAAGGTC
+AGCTAAATAAGCTATCGGGCCCATACCCCGAAAATGTTGGTTATACCCTTCCCGTACTAATTAATCCCCT
+GGCCCAACCCGTCATCTACTCTACCATCTTTGCAGGCACACTCATCACAGCGCTAAGCTCGCACTGATTT
+TTTACCTGAGTAGGCCTAGAAATAAACATGCTAGCTTTTATTCCAGTTCTAACCAAAAAAATAAACCCTC
+GTTCCACAGAAGCTGCCATCAAGTATTTCCTCACGCAAGCAACCGCATCCATAATCCTTCTAATAGCTAT
+CCTCTTCAACAATATACTCTCCGGACAATGAACCATAACCAATACTACCAATCAATACTCATCATTAATA
+ATCATAATAGCTATAGCAATAAAACTAGGAATAGCCCCCTTTCACTTCTGAGTCCCAGAGGTTACCCAAG
+GCACCCCTCTGACATCCGGCCTGCTTCTTCTCACATGACAAAAACTAGCCCCCATCTCAATCATATACCA
+AATCTCTCCCTCACTAAACGTAAGCCTTCTCCTCACTCTCTCAATCTTATCCATCATAGCAGGCAGTTGA
+GGTGGATTAAACCAAACCCAGCTACGCAAAATCTTAGCATACTCCTCAATTACCCACATAGGATGAATAA
+TAGCAGTTCTACCGTACAACCCTAACATAACCATTCTTAATTTAACTATTTATATTATCCTAACTACTAC
+CGCATTCCTACTACTCAACTTAAACTCCAGCACCACGACCCTACTACTATCTCGCACCTGAAACAAGCTA
+ACATGACTAACACCCTTAATTCCATCCACCCTCCTCTCCCTAGGAGGCCTGCCCCCGCTAACCGGCTTTT
+TGCCCAAATGGGCCATTATCGAAGAATTCACAAAAAACAATAGCCTCATCATCCCCACCATCATAGCCAC
+CATCACCCTCCTTAACCTCTACTTCTACCTACGCCTAATCTACTCCACCTCAATCACACTACTCCCCATA
+TCTAACAACGTAAAAATAAAATGACAGTTTGAACATACAAAACCCACCCCATTCCTCCCCACACTCATCG
+CCCTTACCACGCTACTCCTACCTATCTCCCCTTTTATACTAATAATCTTATAGAAATTTAGGTTAAATAC
+AGACCAAGAGCCTTCAAAGCCCTCAGTAAGTTGCAATACTTAATTTCTGTAACAGCTAAGGACTGCAAAA
+CCCCACTCTGCATCAACTGAACGCAAATCAGCCACTTTAATTAAGCTAAGCCCTTACTAGACCAATGGGA
+CTTAAACCCACAAACACTTAGTTAACAGCTAAGCACCCTAATCAACTGGCTTCAATCTACTTCTCCCGCC
+GCCGGGAAAAAAGGCGGGAGAAGCCCCGGCAGGTTTGAAGCTGCTTCTTCGAATTTGCAATTCAATATGA
+AAATCACCTCGGAGCTGGTAAAAAGAGGCCTAACCCCTGTCTTTAGATTTACAGTCCAATGCTTCACTCA
+GCCATTTTACCTCACCCCCACTGATGTTCGCCGACCGTTGACTATTCTCTACAAACCACAAAGACATTGG
+AACACTATACCTATTATTCGGCGCATGAGCTGGAGTCCTAGGCACAGCTCTAAGCCTCCTTATTCGAGCC
+GAGCTGGGCCAGCCAGGCAACCTTCTAGGTAACGACCACATCTACAACGTTATCGTCACAGCCCATGCAT
+TTGTAATAATCTTCTTCATAGTAATACCCATCATAATCGGAGGCTTTGGCAACTGACTAGTTCCCCTAAT
+AATCGGTGCCCCCGATATGGCGTTTCCCCGCATAAACAACATAAGCTTCTGACTCTTACCTCCCTCTCTC
+CTACTCCTGCTCGCATCTGCTATAGTGGAGGCCGGAGCAGGAACAGGTTGAACAGTCTACCCTCCCTTAG
+CAGGGAACTACTCCCACCCTGGAGCCTCCGTAGACCTAACCATCTTCTCCTTACACCTAGCAGGTGTCTC
+CTCTATCTTAGGGGCCATCAATTTCATCACAACAATTATCAATATAAAACCCCCTGCCATAACCCAATAC
+CAAACGCCCCTCTTCGTCTGATCCGTCCTAATCACAGCAGTCCTACTTCTCCTATCTCTCCCAGTCCTAG
+CTGCTGGCATCACTATACTACTAACAGACCGCAACCTCAACACCACCTTCTTCGACCCCGCCGGAGGAGG
+AGACCCCATTCTATACCAACACCTATTCTGATTTTTCGGTCACCCTGAAGTTTATATTCTTATCCTACCA
+GGCTTCGGAATAATCTCCCATATTGTAACTTACTACTCCGGAAAAAAAGAACCATTTGGATACATAGGTA
+TGGTCTGAGCTATGATATCAATTGGCTTCCTAGGGTTTATCGTGTGAGCACACCATATATTTACAGTAGG
+AATAGACGTAGACACACGAGCATATTTCACCTCCGCTACCATAATCATCGCTATCCCCACCGGCGTCAAA
+GTATTTAGCTGACTCGCCACACTCCACGGAAGCAATATGAAATGATCTGCTGCAGTGCTCTGAGCCCTAG
+GATTCATCTTTCTTTTCACCGTAGGTGGCCTGACTGGCATTGTATTAGCAAACTCATCACTAGACATCGT
+ACTACACGACACGTACTACGTTGTAGCCCACTTCCACTATGTCCTATCAATAGGAGCTGTATTTGCCATC
+ATAGGAGGCTTCATTCACTGATTTCCCCTATTCTCAGGCTACACCCTAGACCAAACCTACGCCAAAATCC
+ATTTCACTATCATATTCATCGGCGTAAATCTAACTTTCTTCCCACAACACTTTCTCGGCCTATCCGGAAT
+GCCCCGACGTTACTCGGACTACCCCGATGCATACACCACATGAAACATCCTATCATCTGTAGGCTCATTC
+ATTTCTCTAACAGCAGTAATATTAATAATTTTCATGATTTGAGAAGCCTTCGCTTCGAAGCGAAAAGTCC
+TAATAGTAGAAGAACCCTCCATAAACCTGGAGTGACTATATGGATGCCCCCCACCCTACCACACATTCGA
+AGAACCCGTATACATAAAATCTAGACAAAAAAGGAAGGAATCGAACCCCCCAAAGCTGGTTTCAAGCCAA
+CCCCATGGCCTCCATGACTTTTTCAAAAAGGTATTAGAAAAACCATTTCATAACTTTGTCAAAGTTAAAT
+TATAGGCTAAATCCTATATATCTTAATGGCACATGCAGCGCAAGTAGGTCTACAAGACGCTACTTCCCCT
+ATCATAGAAGAGCTTATCACCTTTCATGATCACGCCCTCATAATCATTTTCCTTATCTGCTTCCTAGTCC
+TGTATGCCCTTTTCCTAACACTCACAACAAAACTAACTAATACTAACATCTCAGACGCTCAGGAAATAGA
+AACCGTCTGAACTATCCTGCCCGCCATCATCCTAGTCCTCATCGCCCTCCCATCCCTACGCATCCTTTAC
+ATAACAGACGAGGTCAACGATCCCTCCCTTACCATCAAATCAATTGGCCACCAATGGTACTGAACCTACG
+AGTACACCGACTACGGCGGACTAATCTTCAACTCCTACATACTTCCCCCATTATTCCTAGAACCAGGCGA
+CCTGCGACTCCTTGACGTTGACAATCGAGTAGTACTCCCGATTGAAGCCCCCATTCGTATAATAATTACA
+TCACAAGACGTCTTGCACTCATGAGCTGTCCCCACATTAGGCTTAAAAACAGATGCAATTCCCGGACGTC
+TAAACCAAACCACTTTCACCGCTACACGACCGGGGGTATACTACGGTCAATGCTCTGAAATCTGTGGAGC
+AAACCACAGTTTCATGCCCATCGTCCTAGAATTAATTCCCCTAAAAATCTTTGAAATAGGGCCCGTATTT
+ACCCTATAGCACCCCCTCTACCCCCTCTAGAGCCCACTGTAAAGCTAACTTAGCATTAACCTTTTAAGTT
+AAAGATTAAGAGAACCAACACCTCTTTACAGTGAAATGCCCCAACTAAATACTACCGTATGGCCCACCAT
+AATTACCCCCATACTCCTTACACTATTCCTCATCACCCAACTAAAAATATTAAACACAAACTACCACCTA
+CCTCCCTCACCAAAGCCCATAAAAATAAAAAATTATAACAAACCCTGAGAACCAAAATGAACGAAAATCT
+GTTCGCTTCATTCATTGCCCCCACAATCCTAGGCCTACCCGCCGCAGTACTGATCATTCTATTTCCCCCT
+CTATTGATCCCCACCTCCAAATATCTCATCAACAACCGACTAATCACCACCCAACAATGACTAATCAAAC
+TAACCTCAAAACAAATGATAACCATACACAACACTAAAGGACGAACCTGATCTCTTATACTAGTATCCTT
+AATCATTTTTATTGCCACAACTAACCTCCTCGGACTCCTGCCTCACTCATTTACACCAACCACCCAACTA
+TCTATAAACCTAGCCATGGCCATCCCCTTATGAGCGGGCACAGTGATTATAGGCTTTCGCTCTAAGATTA
+AAAATGCCCTAGCCCACTTCTTACCACAAGGCACACCTACACCCCTTATCCCCATACTAGTTATTATCGA
+AACCATCAGCCTACTCATTCAACCAATAGCCCTGGCCGTACGCCTAACCGCTAACATTACTGCAGGCCAC
+CTACTCATGCACCTAATTGGAAGCGCCACCCTAGCAATATCAACCATTAACCTTCCCTCTACACTTATCA
+TCTTCACAATTCTAATTCTACTGACTATCCTAGAAATCGCTGTCGCCTTAATCCAAGCCTACGTTTTCAC
+ACTTCTAGTAAGCCTCTACCTGCACGACAACACATAATGACCCACCAATCACATGCCTATCATATAGTAA
+AACCCAGCCCATGACCCCTAACAGGGGCCCTCTCAGCCCTCCTAATGACCTCCGGCCTAGCCATGTGATT
+TCACTTCCACTCCATAACGCTCCTCATACTAGGCCTACTAACCAACACACTAACCATATACCAATGATGG
+CGCGATGTAACACGAGAAAGCACATACCAAGGCCACCACACACCACCTGTCCAAAAAGGCCTTCGATACG
+GGATAATCCTATTTATTACCTCAGAAGTTTTTTTCTTCGCAGGATTTTTCTGAGCCTTTTACCACTCCAG
+CCTAGCCCCTACCCCCCAATTAGGAGGGCACTGGCCCCCAACAGGCATCACCCCGCTAAATCCCCTAGAA
+GTCCCACTCCTAAACACATCCGTATTACTCGCATCAGGAGTATCAATCACCTGAGCTCACCATAGTCTAA
+TAGAAAACAACCGAAACCAAATAATTCAAGCACTGCTTATTACAATTTTACTGGGTCTCTATTTTACCCT
+CCTACAAGCCTCAGAGTACTTCGAGTCTCCCTTCACCATTTCCGACGGCATCTACGGCTCAACATTTTTT
+GTAGCCACAGGCTTCCACGGACTTCACGTCATTATTGGCTCAACTTTCCTCACTATCTGCTTCATCCGCC
+AACTAATATTTCACTTTACATCCAAACATCACTTTGGCTTCGAAGCCGCCGCCTGATACTGGCATTTTGT
+AGATGTGGTTTGACTATTTCTGTATGTCTCCATCTATTGATGAGGGTCTTACTCTTTTAGTATAAATAGT
+ACCGTTAACTTCCAATTAACTAGTTTTGACAACATTCAAAAAAGAGTAATAAACTTCGCCTTAATTTTAA
+TAATCAACACCCTCCTAGCCTTACTACTAATAATTATTACATTTTGACTACCACAACTCAACGGCTACAT
+AGAAAAATCCACCCCTTACGAGTGCGGCTTCGACCCTATATCCCCCGCCCGCGTCCCTTTCTCCATAAAA
+TTCTTCTTAGTAGCTATTACCTTCTTATTATTTGATCTAGAAATTGCCCTCCTTTTACCCCTACCATGAG
+CCCTACAAACAACTAACCTGCCACTAATAGTTATGTCATCCCTCTTATTAATCATCATCCTAGCCCTAAG
+TCTGGCCTATGAGTGACTACAAAAAGGATTAGACTGAACCGAATTGGTATATAGTTTAAACAAAACGAAT
+GATTTCGACTCATTAAATTATGATAATCATATTTACCAAATGCCCCTCATTTACATAAATATTATACTAG
+CATTTACCATCTCACTTCTAGGAATACTAGTATATCGCTCACACCTCATATCCTCCCTACTATGCCTAGA
+AGGAATAATACTATCGCTGTTCATTATAGCTACTCTCATAACCCTCAACACCCACTCCCTCTTAGCCAAT
+ATTGTGCCTATTGCCATACTAGTCTTTGCCGCCTGCGAAGCAGCGGTGGGCCTAGCCCTACTAGTCTCAA
+TCTCCAACACATATGGCCTAGACTACGTACATAACCTAAACCTACTCCAATGCTAAAACTAATCGTCCCA
+ACAATTATATTACTACCACTGACATGACTTTCCAAAAAACACATAATTTGAATCAACACAACCACCCACA
+GCCTAATTATTAGCATCATCCCTCTACTATTTTTTAACCAAATCAACAACAACCTATTTAGCTGTTCCCC
+AACCTTTTCCTCCGACCCCCTAACAACCCCCCTCCTAATACTAACTACCTGACTCCTACCCCTCACAATC
+ATGGCAAGCCAACGCCACTTATCCAGTGAACCACTATCACGAAAAAAACTCTACCTCTCTATACTAATCT
+CCCTACAAATCTCCTTAATTATAACATTCACAGCCACAGAACTAATCATATTTTATATCTTCTTCGAAAC
+CACACTTATCCCCACCTTGGCTATCATCACCCGATGAGGCAACCAGCCAGAACGCCTGAACGCAGGCACA
+TACTTCCTATTCTACACCCTAGTAGGCTCCCTTCCCCTACTCATCGCACTAATTTACACTCACAACACCC
+TAGGCTCACTAAACATTCTACTACTCACTCTCACTGCCCAAGAACTATCAAACTCCTGAGCCAACAACTT
+AATATGACTAGCTTACACAATAGCTTTTATAGTAAAGATACCTCTTTACGGACTCCACTTATGACTCCCT
+AAAGCCCATGTCGAAGCCCCCATCGCTGGGTCAATAGTACTTGCCGCAGTACTCTTAAAACTAGGCGGCT
+ATGGTATAATACGCCTCACACTCATTCTCAACCCCCTGACAAAACACATAGCCTACCCCTTCCTTGTACT
+ATCCCTATGAGGCATAATTATAACAAGCTCCATCTGCCTACGACAAACAGACCTAAAATCGCTCATTGCA
+TACTCTTCAATCAGCCACATAGCCCTCGTAGTAACAGCCATTCTCATCCAAACCCCCTGAAGCTTCACCG
+GCGCAGTCATTCTCATAATCGCCCACGGGCTTACATCCTCATTACTATTCTGCCTAGCAAACTCAAACTA
+CGAACGCACTCACAGTCGCATCATAATCCTCTCTCAAGGACTTCAAACTCTACTCCCACTAATAGCTTTT
+TGATGACTTCTAGCAAGCCTCGCTAACCTCGCCTTACCCCCCACTATTAACCTACTGGGAGAACTCTCTG
+TGCTAGTAACCACGTTCTCCTGATCAAATATCACTCTCCTACTTACAGGACTCAACATACTAGTCACAGC
+CCTATACTCCCTCTACATATTTACCACAACACAATGGGGCTCACTCACCCACCACATTAACAACATAAAA
+CCCTCATTCACACGAGAAAACACCCTCATGTTCATACACCTATCCCCCATTCTCCTCCTATCCCTCAACC
+CCGACATCATTACCGGGTTTTCCTCTTGTAAATATAGTTTAACCAAAACATCAGATTGTGAATCTGACAA
+CAGAGGCTTACGACCCCTTATTTACCGAGAAAGCTCACAAGAACTGCTAACTCATGCCCCCATGTCTAAC
+AACATGGCTTTCTCAACTTTTAAAGGATAACAGCTATCCATTGGTCTTAGGCCCCAAAAATTTTGGTGCA
+ACTCCAAATAAAAGTAATAACCATGCACACTACTATAACCACCCTAACCCTGACTTCCCTAATTCCCCCC
+ATCCTTACCACCCTCGTTAACCCTAACAAAAAAAACTCATACCCCCATTATGTAAAATCCATTGTCGCAT
+CCACCTTTATTATCAGTCTCTTCCCCACAACAATATTCATGTGCCTAGACCAAGAAGTTATTATCTCGAA
+CTGACACTGAGCCACAACCCAAACAACCCAGCTCTCCCTAAGCTTCAAACTAGACTACTTCTCCATAATA
+TTCATCCCTGTAGCATTGTTCGTTACATGGTCCATCATAGAATTCTCACTGTGATATATAAACTCAGACC
+CAAACATTAATCAGTTCTTCAAATATCTACTCATCTTCCTAATTACCATACTAATCTTAGTTACCGCTAA
+CAACCTATTCCAACTGTTCATCGGCTGAGAGGGCGTAGGAATTATATCCTTCTTGCTCATCAGTTGATGA
+TACGCCCGAGCAGATGCCAACACAGCAGCCATTCAAGCAATCCTATACAACCGTATCGGCGATATCGGTT
+TCATCCTCGCCTTAGCATGATTTATCCTACACTCCAACTCATGAGACCCACAACAAATAGCCCTTCTAAA
+CGCTAATCCAAGCCTCACCCCACTACTAGGCCTCCTCCTAGCAGCAGCAGGCAAATCAGCCCAATTAGGT
+CTCCACCCCTGACTCCCCTCAGCCATAGAAGGCCCCACCCCAGTCTCAGCCCTACTCCACTCAAGCACTA
+TAGTTGTAGCAGGAATCTTCTTACTCATCCGCTTCCACCCCCTAGCAGAAAATAGCCCACTAATCCAAAC
+TCTAACACTATGCTTAGGCGCTATCACCACTCTGTTCGCAGCAGTCTGCGCCCTTACACAAAATGACATC
+AAAAAAATCGTAGCCTTCTCCACTTCAAGTCAACTAGGACTCATAATAGTTACAATCGGCATCAACCAAC
+CACACCTAGCATTCCTGCACATCTGTACCCACGCCTTCTTCAAAGCCATACTATTTATGTGCTCCGGGTC
+CATCATCCACAACCTTAACAATGAACAAGATATTCGAAAAATAGGAGGACTACTCAAAACCATACCTCTC
+ACTTCAACCTCCCTCACCATTGGCAGCCTAGCATTAGCAGGAATACCTTTCCTCACAGGTTTCTACTCCA
+AAGACCACATCATCGAAACCGCAAACATATCATACACAAACGCCTGAGCCCTATCTATTACTCTCATCGC
+TACCTCCCTGACAAGCGCCTATAGCACTCGAATAATTCTTCTCACCCTAACAGGTCAACCTCGCTTCCCC
+ACCCTTACTAACATTAACGAAAATAACCCCACCCTACTAAACCCCATTAAACGCCTGGCAGCCGGAAGCC
+TATTCGCAGGATTTCTCATTACTAACAACATTTCCCCCGCATCCCCCTTCCAAACAACAATCCCCCTCTA
+CCTAAAACTCACAGCCCTCGCTGTCACTTTCCTAGGACTTCTAACAGCCCTAGACCTCAACTACCTAACC
+AACAAACTTAAAATAAAATCCCCACTATGCACATTTTATTTCTCCAACATACTCGGATTCTACCCTAGCA
+TCACACACCGCACAATCCCCTATCTAGGCCTTCTTACGAGCCAAAACCTGCCCCTACTCCTCCTAGACCT
+AACCTGACTAGAAAAGCTATTACCTAAAACAATTTCACAGCACCAAATCTCCACCTCCATCATCACCTCA
+ACCCAAAAAGGCATAATTAAACTTTACTTCCTCTCTTTCTTCTTCCCACTCATCCTAACCCTACTCCTAA
+TCACATAACCTATTCCCCCGAGCAATCTCAATTACAATATATACACCAACAAACAATGTTCAACCAGTAA
+CTACTACTAATCAACGCCCATAATCATACAAAGCCCCCGCACCAATAGGATCCTCCCGAATCAACCCTGA
+CCCCTCTCCTTCATAAATTATTCAGCTTCCTACACTATTAAAGTTTACCACAACCACCACCCCATCATAC
+TCTTTCACCCACAGCACCAATCCTACCTCCATCGCTAACCCCACTAAAACACTCACCAAGACCTCAACCC
+CTGACCCCCATGCCTCAGGATACTCCTCAATAGCCATCGCTGTAGTATATCCAAAGACAACCATCATTCC
+CCCTAAATAAATTAAAAAAACTATTAAACCCATATAACCTCCCCCAAAATTCAGAATAATAACACACCCG
+ACCACACCGCTAACAATCAATACTAAACCCCCATAAATAGGAGAAGGCTTAGAAGAAAACCCCACAAACC
+CCATTACTAAACCCACACTCAACAGAAACAAAGCATACATCATTATTCTCGCACGGACTACAACCACGAC
+CAATGATATGAAAAACCATCGTTGTATTTCAACTACAAGAACACCAATGACCCCAATACGCAAAACTAAC
+CCCCTAATAAAATTAATTAACCACTCATTCATCGACCTCCCCACCCCATCCAACATCTCCGCATGATGAA
+ACTTCGGCTCACTCCTTGGCGCCTGCCTGATCCTCCAAATCACCACAGGACTATTCCTAGCCATGCACTA
+CTCACCAGACGCCTCAACCGCCTTTTCATCAATCGCCCACATCACTCGAGACGTAAATTATGGCTGAATC
+ATCCGCTACCTTCACGCCAATGGCGCCTCAATATTCTTTATCTGCCTCTTCCTACACATCGGGCGAGGCC
+TATATTACGGATCATTTCTCTACTCAGAAACCTGAAACATCGGCATTATCCTCCTGCTTGCAACTATAGC
+AACAGCCTTCATAGGCTATGTCCTCCCGTGAGGCCAAATATCATTCTGAGGGGCCACAGTAATTACAAAC
+TTACTATCCGCCATCCCATACATTGGGACAGACCTAGTTCAATGAATCTGAGGAGGCTACTCAGTAGACA
+GTCCCACCCTCACACGATTCTTTACCTTTCACTTCATCTTGCCCTTCATTATTGCAGCCCTAGCAACACT
+CCACCTCCTATTCTTGCACGAAACGGGATCAAACAACCCCCTAGGAATCACCTCCCATTCCGATAAAATC
+ACCTTCCACCCTTACTACACAATCAAAGACGCCCTCGGCTTACTTCTCTTCCTTCTCTCCTTAATGACAT
+TAACACTATTCTCACCAGACCTCCTAGGCGACCCAGACAATTATACCCTAGCCAACCCCTTAAACACCCC
+TCCCCACATCAAGCCCGAATGATATTTCCTATTCGCCTACACAATTCTCCGATCCGTCCCTAACAAACTA
+GGAGGCGTCCTTGCCCTATTACTATCCATCCTCATCCTAGCAATAATCCCCATCCTCCATATATCCAAAC
+AACAAAGCATAATATTTCGCCCACTAAGCCAATCACTTTATTGACTCCTAGCCGCAGACCTCCTCATTCT
+AACCTGAATCGGAGGACAACCAGTAAGCTACCCTTTTACCATCATTGGACAAGTAGCATCCGTACTATAC
+TTCACAACAATCCTAATCCTAATACCAACTATCTCCCTAATTGAAAACAAAATACTCAAATGGGCCTGTC
+CTTGTAGTATAAACTAATACACCAGTCTTGTAAACCGGAGATGAAAACCTTTTTCCAAGGACAAATCAGA
+GAAAAAGTCTTTAACTCCACCATTAGCACCCAAAGCTAAGATTCTAATTTAAACTATTCTCTGTTCTTTC
+ATGGGGAAGCAGATTTGGGTACCACCCAAGTATTGACTCACCCATCAACAACCGCTATGTATTTCGTACA
+TTACTGCCAGCCACCATGAATATTGTACGGTACCATAAATACTTGACCACCTGTAGTACATAAAAACCCA
+ATCCACATCAAAACCCCCTCCCCATGCTTACAAGCAAGTACAGCAATCAACCCTCAACTATCACACATCA
+ACTGCAACTCCAAAGCCACCCCTCACCCACTAGGATACCAACAAACCTACCCACCCTTAACAGTACATAG
+TACATAAAGCCATTTACCGTACATAGCACATTACAGTCAAATCCCTTCTCGTCCCCATGGATGACCCCCC
+TCAGATAGGGGTCCCTTGACCACCATCCTCCGTGAAATCAATATCCCGCACAAGAGTGCTACTCTCCTCG
+CTCCGGGCCCATAACACTTGGGGGTAGCTAAAGTGAACTGTATCCGACATCTGGTTCCTACTTCAGGGTC
+ATAAAGCCTAAATAGCCCACACGTTCCCCTTAAATAAGACATCACGATG
+
diff --git a/tests/data/rCRS.fasta.fai b/tests/data/rCRS.fasta.fai
new file mode 100644
index 0000000..ac6eae5
--- /dev/null
+++ b/tests/data/rCRS.fasta.fai
@@ -0,0 +1 @@
+gi|251831106|ref|NC_012920.1| 16569 75 70 71
diff --git a/tests/data/raw_reads/pe_reads_R1_001.fastq.gz b/tests/data/raw_reads/pe_reads_R1_001.fastq.gz
new file mode 100644
index 0000000..bdcfeba
Binary files /dev/null and b/tests/data/raw_reads/pe_reads_R1_001.fastq.gz differ
diff --git a/tests/data/raw_reads/pe_reads_R1_002.fastq.gz b/tests/data/raw_reads/pe_reads_R1_002.fastq.gz
new file mode 100644
index 0000000..a1c4dd6
Binary files /dev/null and b/tests/data/raw_reads/pe_reads_R1_002.fastq.gz differ
diff --git a/tests/data/raw_reads/pe_reads_R2_001.fastq.gz b/tests/data/raw_reads/pe_reads_R2_001.fastq.gz
new file mode 100644
index 0000000..76c1baf
Binary files /dev/null and b/tests/data/raw_reads/pe_reads_R2_001.fastq.gz differ
diff --git a/tests/data/raw_reads/pe_reads_R2_002.fastq.gz b/tests/data/raw_reads/pe_reads_R2_002.fastq.gz
new file mode 100644
index 0000000..85a1b29
Binary files /dev/null and b/tests/data/raw_reads/pe_reads_R2_002.fastq.gz differ
diff --git a/tests/data/raw_reads/se_reads_R1_001.fastq.gz b/tests/data/raw_reads/se_reads_R1_001.fastq.gz
new file mode 100644
index 0000000..2f90199
Binary files /dev/null and b/tests/data/raw_reads/se_reads_R1_001.fastq.gz differ
diff --git a/tests/data/raw_reads/se_reads_R1_002.fastq.gz b/tests/data/raw_reads/se_reads_R1_002.fastq.gz
new file mode 100644
index 0000000..29280ff
Binary files /dev/null and b/tests/data/raw_reads/se_reads_R1_002.fastq.gz differ
diff --git a/tests/data/sim_reads/mate_1.fastq.gz b/tests/data/sim_reads/mate_1.fastq.gz
new file mode 100644
index 0000000..fe07ab3
Binary files /dev/null and b/tests/data/sim_reads/mate_1.fastq.gz differ
diff --git a/tests/data/sim_reads/mate_2.fastq.gz b/tests/data/sim_reads/mate_2.fastq.gz
new file mode 100644
index 0000000..67e1229
Binary files /dev/null and b/tests/data/sim_reads/mate_2.fastq.gz differ
diff --git a/tests/data/simple.yaml b/tests/data/simple.yaml
new file mode 100644
index 0000000..569ff04
--- /dev/null
+++ b/tests/data/simple.yaml
@@ -0,0 +1,3 @@
+Defaults:
+ "First": 1e-4
+ "Second": "a string"
\ No newline at end of file
diff --git a/tests/data/timestamp_a_older b/tests/data/timestamp_a_older
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/timestamp_a_younger b/tests/data/timestamp_a_younger
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/timestamp_b_older b/tests/data/timestamp_b_older
new file mode 100644
index 0000000..e69de29
diff --git a/tests/data/timestamp_b_younger b/tests/data/timestamp_b_younger
new file mode 100644
index 0000000..e69de29
diff --git a/tests/node_test.py b/tests/node_test.py
new file mode 100644
index 0000000..11cfecf
--- /dev/null
+++ b/tests/node_test.py
@@ -0,0 +1,579 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Disable warning for missing docstrings
+# pylint: disable=C0111
+# Disable warning caused by "invalid" function names
+# pylint: disable=C0103
+# Disable warning caused by touching private member variables/functions
+# pylint: disable=W0212
+# Disable warnings caused by flexmock setups ("X is assigned to nothing")
+# pylint: disable=W0106
+import os
+import random
+
+from nose.tools import \
+ assert_in, \
+ assert_equal, \
+ assert_raises
+from flexmock import flexmock
+
+from paleomix.common.testing import \
+ with_temp_folder, \
+ set_file_contents, \
+ get_file_contents
+
+from paleomix.atomiccmd.command import \
+ AtomicCmd
+from paleomix.node import \
+ Node, \
+ CommandNode, \
+ NodeError, \
+ NodeUnhandledException, \
+ CmdNodeError
+from paleomix.common.utilities import \
+ safe_coerce_to_frozenset
+
+
+def test_dir():
+ return os.path.dirname(__file__)
+
+
+def test_file(*args):
+ return os.path.join(test_dir(), "data", *args)
+
+
+def _CommandNodeWrap(**kwargs):
+ return CommandNode(command=AtomicCmd("true"), **kwargs)
+_NODE_TYPES = (Node, _CommandNodeWrap)
+
+
+_DESCRIPTION = "My description of a node"
+_IN_FILES = frozenset((test_file("empty_file_1"),
+ test_file("empty_file_2")))
+_OUT_FILES = frozenset((test_file("missing_out_file_1"),
+ test_file("missing_out_file_2")))
+_EXEC_FILES = frozenset(("ls", "sh"))
+_AUX_FILES = frozenset((test_file("rCRS.fasta"),
+ test_file("rCRS.fasta.fai")))
+_REQUIREMENTS = frozenset((id, str))
+_EMPTY_FILE = test_file("empty_file_1")
+
+
+def _build_cmd_mock(input_files = _IN_FILES,
+ output_files = (),
+ executables = (),
+ auxiliary_files = (),
+ requirements = (),
+ optional_temp_files = ()):
+ return flexmock(input_files = frozenset(input_files),
+ output_files = frozenset(output_files),
+ executables = frozenset(executables),
+ auxiliary_files = frozenset(auxiliary_files),
+ requirements = frozenset(requirements),
+ expected_temp_files = frozenset(map(os.path.basename, output_files)),
+ optional_temp_files = frozenset(optional_temp_files))
+
+
+###############################################################################
+###############################################################################
+# Node: Constructor: File sets
+
+def test_constructor():
+ def first(values):
+ return random.choice(tuple(values))
+
+ def _do_test_constructor__single_value(key, value):
+ defaults = {"input_files": _EMPTY_FILE}
+ defaults[key] = value
+ node = Node(**defaults)
+ expected = safe_coerce_to_frozenset(value)
+ assert_equal(getattr(node, key), expected)
+
+ # Single values
+ yield _do_test_constructor__single_value, "input_files", first(_IN_FILES)
+ yield _do_test_constructor__single_value, "output_files", first(_OUT_FILES)
+ yield _do_test_constructor__single_value, "executables", first(_EXEC_FILES)
+ yield _do_test_constructor__single_value, "auxiliary_files", first(_AUX_FILES)
+
+ # Single value in list
+ yield _do_test_constructor__single_value, "input_files", [first(_IN_FILES)]
+ yield _do_test_constructor__single_value, "output_files", [first(_OUT_FILES)]
+ yield _do_test_constructor__single_value, "executables", [first(_EXEC_FILES)]
+ yield _do_test_constructor__single_value, "auxiliary_files", [first(_AUX_FILES)]
+
+ # Multiple values in list
+ yield _do_test_constructor__single_value, "input_files", _IN_FILES
+ yield _do_test_constructor__single_value, "output_files", _OUT_FILES
+ yield _do_test_constructor__single_value, "executables", _EXEC_FILES
+ yield _do_test_constructor__single_value, "auxiliary_files", _AUX_FILES
+
+
+def test_constructor__invalid_values():
+ def _do_test_constructor__invalid_values(key, value):
+ assert_raises(TypeError, Node, **{key: value})
+
+ yield _do_test_constructor__invalid_values, "input_files", [id]
+ yield _do_test_constructor__invalid_values, "output_files", [-1]
+ yield _do_test_constructor__invalid_values, "executables", [{}]
+ yield _do_test_constructor__invalid_values, "auxiliary_files", [1.3]
+
+
+###############################################################################
+###############################################################################
+# Node: Constructor: Requirements
+
+def test_constructor__requirements():
+ node = Node(requirements=id)
+ assert_equal(node.requirements, frozenset([id]))
+ node = Node(requirements=[id])
+ assert_equal(node.requirements, frozenset([id]))
+ node = Node(requirements=[id, str])
+ assert_equal(node.requirements, frozenset([id, str]))
+
+
+def test_constructor__requirements__wrong_type():
+ def _do_test_constructor__requirements__wrong_type(value):
+ assert_raises(TypeError, Node, requirements=value)
+
+ yield _do_test_constructor__requirements__wrong_type, 17
+ yield _do_test_constructor__requirements__wrong_type, {}
+ yield _do_test_constructor__requirements__wrong_type, "867-5309"
+
+
+###############################################################################
+###############################################################################
+# Node: Constructor: Dependencies
+
+def test_constructor__nodes_is_none():
+ my_node = Node(dependencies=None)
+ assert_equal(my_node.dependencies, frozenset())
+
+
+def test_constructor__single_node():
+ sub_node = Node()
+ my_node = Node(dependencies=sub_node)
+ assert_equal(my_node.dependencies, frozenset([sub_node]))
+
+
+def test_constructor__iterable():
+ sub_nodes = [Node(), Node()]
+ my_node = Node(dependencies=iter(sub_nodes))
+ assert_equal(my_node.dependencies, frozenset(sub_nodes))
+
+
+def test_constructor__not_a_node():
+ assert_raises(TypeError, Node, dependencies=(1,))
+
+
+###############################################################################
+###############################################################################
+# *Node: Description
+
+def test_constructor__description():
+ def _do_test_constructor__description(cls):
+ my_node = cls(description=_DESCRIPTION)
+ assert_equal(str(my_node), _DESCRIPTION)
+ for cls in _NODE_TYPES:
+ yield _do_test_constructor__description, cls
+
+
+def test_constructor__description__default():
+ def _do_test_constructor__description__default(cls):
+ my_node = cls()
+ assert_equal(str(my_node), repr(my_node))
+ for cls in _NODE_TYPES:
+ yield _do_test_constructor__description__default, cls
+
+
+def test_constructor__description__non_string():
+ def _do_test_constructor__description__non_string(cls, value):
+ assert_raises(TypeError, cls, description=value)
+ for cls in _NODE_TYPES:
+ yield _do_test_constructor__description__non_string, cls, 1
+ yield _do_test_constructor__description__non_string, cls, {}
+
+
+###############################################################################
+###############################################################################
+# *Node: Constructor tests: #threads
+
+def test_constructor__threads():
+ def _do_test_constructor__threads(cls, nthreads):
+ node = cls(threads=nthreads)
+ assert_equal(node.threads, nthreads)
+ for cls in (Node, _CommandNodeWrap):
+ yield _do_test_constructor__threads, cls, 1L
+ yield _do_test_constructor__threads, cls, 3
+
+
+def test_constructor__threads_invalid_range():
+ def _do_test_constructor__threads_invalid_range(cls, nthreads):
+ assert_raises(ValueError, cls, threads=nthreads)
+ for cls in (Node, _CommandNodeWrap):
+ yield _do_test_constructor__threads_invalid_range, cls, -1
+ yield _do_test_constructor__threads_invalid_range, cls, 0
+
+
+def test_constructor__threads_invalid_type():
+ def _do_test_constructor__threads_invalid_type(cls, nthreads):
+ assert_raises(TypeError, cls, threads=nthreads)
+ for cls in (Node, _CommandNodeWrap):
+ yield _do_test_constructor__threads_invalid_type, cls, "1"
+ yield _do_test_constructor__threads_invalid_type, cls, {}
+ yield _do_test_constructor__threads_invalid_type, cls, 2.7
+
+
+###############################################################################
+###############################################################################
+# Node: Run
+
+_DUMMY_TEMP_ROOT = "/xyz/tmp"
+_DUMMY_TEMP = os.path.join(_DUMMY_TEMP_ROOT, "xTMPx")
+
+
+def test_run__order():
+ cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)
+ node_mock = flexmock(Node())
+ node_mock.should_receive("_create_temp_dir").with_args(cfg_mock).and_return(_DUMMY_TEMP).ordered.once
+ node_mock.should_receive("_setup").with_args(cfg_mock, _DUMMY_TEMP).ordered.once
+ node_mock.should_receive("_run").with_args(cfg_mock, _DUMMY_TEMP).ordered.once
+ node_mock.should_receive("_teardown").with_args(cfg_mock, _DUMMY_TEMP).ordered.once
+ node_mock.should_receive("_remove_temp_dir").with_args(_DUMMY_TEMP).ordered.once
+ node_mock.run(cfg_mock)
+
+
+def test_run__exceptions():
+ cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)
+
+ def build_tests(key, exception, expectation):
+ def test_function():
+ node_mock = flexmock(Node())
+ node_mock.should_receive('_create_temp_dir').with_args(cfg_mock) \
+ .and_return(_DUMMY_TEMP).ordered.once
+ node_mock.should_receive(key).and_raise(exception).ordered.once
+ node_mock.should_receive('_remove_temp_dir').never
+
+ assert_raises(expectation, node_mock.run, cfg_mock)
+
+ return test_function
+
+ print "foo"
+ for key in ('_setup', '_run', '_teardown'):
+ yield build_tests(key, TypeError("The castle AAARGH!"), NodeUnhandledException)
+ yield build_tests(key, NodeError("He's a very naughty boy!"), NodeError)
+
+
+def test_run__exception__create_temp_dir():
+ cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)
+ node_mock = flexmock(Node())
+ node_mock.should_receive('_create_temp_dir').with_args(cfg_mock) \
+ .and_raise(OSError()).ordered.once
+
+ assert_raises(NodeUnhandledException, node_mock.run, cfg_mock)
+
+
+def test_run__exception__remove_temp_dir():
+ cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)
+ node_mock = flexmock(Node())
+ node_mock.should_receive('_create_temp_dir').with_args(cfg_mock) \
+ .and_return(_DUMMY_TEMP).ordered.once
+ node_mock.should_receive('_remove_temp_dir').with_args(_DUMMY_TEMP) \
+ .and_raise(OSError()).ordered.once
+
+ assert_raises(NodeUnhandledException, node_mock.run, cfg_mock)
+
+
+def test_run__error_log__node_error():
+ @with_temp_folder
+ def _do_test_run__error_log__node_error(temp_folder, exception):
+ temp = os.path.join(temp_folder, "xTMPx")
+ cfg_mock = flexmock(temp_root=temp_folder)
+ node_mock = flexmock(Node())
+ node_mock.should_receive("_create_temp_dir").with_args(cfg_mock) \
+ .and_return(temp).ordered.once
+ node_mock.should_receive("_run").and_raise(exception).ordered.once
+
+ os.mkdir(temp)
+ assert_raises(NodeError, node_mock.run, cfg_mock)
+ log_file = os.path.join(temp_folder, "xTMPx", "pipe.errors")
+ assert os.path.exists(log_file)
+ assert_in("Errors =", get_file_contents(log_file))
+
+ yield _do_test_run__error_log__node_error, NodeError("ARGH!")
+ yield _do_test_run__error_log__node_error, OSError("ARGH!")
+
+
+###############################################################################
+###############################################################################
+# Node: _setup / _teardown
+
+def test__setup__input_files():
+ def _do_test__setup__input_files_exist(kwargs):
+ Node(**kwargs)._setup(None, None)
+
+ yield _do_test__setup__input_files_exist, {"executables": ("ls", "sh")}
+ yield _do_test__setup__input_files_exist, {"input_files": _IN_FILES}
+ yield _do_test__setup__input_files_exist, {"auxiliary_files": _IN_FILES}
+
+
+def test__setup__input_files_missing():
+ def _do_test__setup__input_files_exist(kwargs):
+ assert_raises(NodeError, Node(**kwargs)._setup, None, None)
+
+ yield _do_test__setup__input_files_exist, {"executables": ("ls", "shxxxx")}
+ yield _do_test__setup__input_files_exist, {"input_files": _OUT_FILES}
+ yield _do_test__setup__input_files_exist, {"auxiliary_files": _OUT_FILES}
+
+
+def test__teardown__output_files():
+ Node(input_files=_EMPTY_FILE,
+ output_files=_IN_FILES)._teardown(None, None)
+
+
+def test__teardown__output_files_missing():
+ node = Node(input_files=_EMPTY_FILE,
+ output_files=_OUT_FILES)
+ assert_raises(NodeError, node._teardown, None, None)
+
+
+###############################################################################
+###############################################################################
+# CommandNode: Constructor
+
+_SIMPLE_DEPS = Node()
+_SIMPLE_SUBS = Node()
+_SIMPLE_CMD_MOCK = flexmock(input_files=_IN_FILES,
+ output_files=_OUT_FILES,
+ executables=_EXEC_FILES,
+ auxiliary_files=_AUX_FILES,
+ requirements=_REQUIREMENTS)
+_SIMPLE_CMD_NODE = CommandNode(command=_SIMPLE_CMD_MOCK,
+ dependencies=_SIMPLE_DEPS)
+
+
+def test_commandnode_constructor__input_files():
+ assert_equal(_SIMPLE_CMD_NODE.input_files, _IN_FILES)
+
+
+def test_commandnode_constructor__output_files():
+ assert_equal(_SIMPLE_CMD_NODE.output_files, _OUT_FILES)
+
+
+def test_commandnode_constructor__auxiliary_files():
+ assert_equal(_SIMPLE_CMD_NODE.auxiliary_files, _AUX_FILES)
+
+
+def test_commandnode_constructor__executables():
+ assert_equal(_SIMPLE_CMD_NODE.executables, _EXEC_FILES)
+
+
+def test_commandnode_constructor__requirements():
+ assert_equal(_SIMPLE_CMD_NODE.requirements, _REQUIREMENTS)
+
+
+def test_commandnode_constructor__dependencies():
+ assert_equal(_SIMPLE_CMD_NODE.dependencies, frozenset([_SIMPLE_DEPS]))
+
+
+def test_commandnode_constructor__dependencies__default():
+ cmd_mock = CommandNode(command=_SIMPLE_CMD_MOCK)
+ assert_equal(cmd_mock.dependencies, frozenset())
+
+
+###############################################################################
+###############################################################################
+# CommandNode: run
+
+def test_command_node__run():
+ cfg_mock = flexmock(temp_root=_DUMMY_TEMP_ROOT)
+ cmd_mock = _build_cmd_mock()
+ node_mock = flexmock(CommandNode(cmd_mock))
+ node_mock.should_receive("_create_temp_dir").with_args(cfg_mock) \
+ .and_return(_DUMMY_TEMP).ordered.once
+ node_mock.should_receive("_setup").with_args(cfg_mock, _DUMMY_TEMP).ordered.once
+ cmd_mock.should_receive("run").with_args(_DUMMY_TEMP).ordered.once
+ cmd_mock.should_receive("join").and_return([0]).ordered.once
+ node_mock.should_receive("_teardown").with_args(cfg_mock, _DUMMY_TEMP).ordered.once
+ node_mock.should_receive("_remove_temp_dir").with_args(_DUMMY_TEMP).ordered.once
+ node_mock.run(cfg_mock)
+
+
+###############################################################################
+###############################################################################
+# CommandNode: _setup()
+
+def test_commandnode_setup__files_exist():
+ def _do_test_commandnode_setup(kwargs):
+ cmd_mock = _build_cmd_mock(**kwargs)
+ node = CommandNode(cmd_mock)
+ node._setup(None, None)
+ yield _do_test_commandnode_setup, {"executables": ("ls", "sh")}
+ yield _do_test_commandnode_setup, {"input_files": _IN_FILES}
+ yield _do_test_commandnode_setup, {"auxiliary_files": _IN_FILES}
+
+
+def test_commandnode_setup__files_missing():
+ def _do_test_commandnode_setup(kwargs):
+ cmd_mock = _build_cmd_mock(**kwargs)
+ node = CommandNode(cmd_mock)
+ assert_raises(NodeError, node._setup, None, None)
+
+ yield _do_test_commandnode_setup, {"executables": ("ls", "shxxxxxxxxxxx")}
+ yield _do_test_commandnode_setup, {"input_files": _OUT_FILES}
+ yield _do_test_commandnode_setup, {"auxiliary_files": _OUT_FILES}
+
+
+###############################################################################
+###############################################################################
+# CommandNode: _run()
+
+def test_commandnode_run__call_order():
+ cmd_mock = _build_cmd_mock()
+ cmd_mock.should_receive("run").with_args("xTMPx").ordered.once
+ cmd_mock.should_receive("join").with_args().and_return((0,)).ordered.once
+ node = CommandNode(cmd_mock)
+ node._run(None, "xTMPx")
+
+
+def test_commandnode_run__exception_on_error():
+ cmd_mock = _build_cmd_mock()
+ cmd_mock.should_receive("run").ordered.once
+ cmd_mock.should_receive("join").and_return((1,)).ordered.once
+ node = CommandNode(cmd_mock)
+ assert_raises(CmdNodeError, node._run, None, None)
+
+
+###############################################################################
+###############################################################################
+# CommandNode: _teardown
+
+def _setup_temp_folders(temp_folder):
+ destination = os.path.join(temp_folder, "dst")
+ temp_folder = os.path.join(temp_folder, "tmp")
+ os.makedirs(temp_folder)
+ os.makedirs(destination)
+ return destination, temp_folder
+
+
+# Commit is called on the command obj
+@with_temp_folder
+def test_commandnode_teardown__commit(temp_folder):
+ cmd_mock = _build_cmd_mock()
+ cmd_mock.should_receive("commit").with_args(temp_folder).once
+ node = CommandNode(cmd_mock)
+ node._teardown(None, temp_folder)
+
+
+# Files exist in temp folder, and in destination after commit
+@with_temp_folder
+def test_commandnode_teardown(temp_folder):
+ destination, temp_folder = _setup_temp_folders(temp_folder)
+
+ cmd = AtomicCmd(("echo", "-n", "1 2 3"),
+ IN_DUMMY=_EMPTY_FILE,
+ OUT_STDOUT=os.path.join(destination, "foo.txt"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ node = CommandNode(cmd)
+ assert os.path.exists(os.path.join(temp_folder, "foo.txt"))
+ assert not os.path.exists(os.path.join(destination, "foo.txt"))
+ node._teardown(None, temp_folder)
+ assert not os.path.exists(os.path.join(temp_folder, "foo.txt"))
+ assert os.path.exists(os.path.join(destination, "foo.txt"))
+
+
+# Not all required files have been generated (atomic)
+@with_temp_folder
+def test_commandnode_teardown__missing_files_in_temp(temp_folder):
+ destination, temp_folder = _setup_temp_folders(temp_folder)
+
+ cmd = AtomicCmd(("echo", "-n", "1 2 3"),
+ IN_DUMMY=_EMPTY_FILE,
+ OUT_BAR=os.path.join(destination, "bar.txt"),
+ OUT_STDOUT=os.path.join(destination, "foo.txt"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ node = CommandNode(cmd)
+ temp_files_before = set(os.listdir(temp_folder))
+ dest_files_before = set(os.listdir(destination))
+
+ assert_raises(CmdNodeError, node._teardown, None, temp_folder)
+ assert_equal(temp_files_before, set(os.listdir(temp_folder)))
+ assert_equal(dest_files_before, set(os.listdir(destination)))
+
+
+# Not all specified TEMP_ files exist at _teardown (allowed)
+@with_temp_folder
+def test_commandnode_teardown__missing_optional_files(temp_folder):
+ destination, temp_folder = _setup_temp_folders(temp_folder)
+
+ cmd = AtomicCmd(("echo", "-n", "1 2 3"),
+ IN_DUMMY=_EMPTY_FILE,
+ TEMP_OUT_BAR="bar.txt",
+ OUT_STDOUT=os.path.join(destination, "foo.txt"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ node = CommandNode(cmd)
+ node._teardown(None, temp_folder)
+ assert_equal(os.listdir(temp_folder), [])
+ assert_equal(os.listdir(destination), ["foo.txt"])
+
+
+# Not all required files were in place after commit
+@with_temp_folder
+def _test_commandnode_teardown__missing_files_in_dest(temp_folder):
+ destination, temp_folder = _setup_temp_folders(temp_folder)
+
+ class _CmdMock(AtomicCmd):
+ def commit(self, temp):
+ AtomicCmd.commit(self, temp)
+ os.remove(os.path.join(destination, "foo.txt"))
+
+ cmd = _CmdMock(("touch", "%(OUT_FOO)s", "%(OUT_BAR)s"),
+ IN_DUMMY=_EMPTY_FILE,
+ OUT_FOO=os.path.join(destination, "foo.txt"),
+ OUT_BAR=os.path.join(destination, "bar.txt"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ node = CommandNode(cmd)
+ assert_raises(NodeError, node._teardown, None, temp_folder)
+
+
+# Unexpected files were found in the temporary directory
+@with_temp_folder
+def test_commandnode_teardown__extra_files_in_temp(temp_folder):
+ destination, temp_folder = _setup_temp_folders(temp_folder)
+
+ cmd = AtomicCmd(("echo", "-n", "1 2 3"),
+ IN_DUMMY=_EMPTY_FILE,
+ OUT_STDOUT=os.path.join(destination, "foo.txt"))
+ cmd.run(temp_folder)
+ assert_equal(cmd.join(), [0])
+ node = CommandNode(cmd)
+ set_file_contents(os.path.join(temp_folder, "bar.txt"), "1 2 3")
+ temp_files_before = set(os.listdir(temp_folder))
+ dest_files_before = set(os.listdir(destination))
+
+ assert_raises(CmdNodeError, node._teardown, None, temp_folder)
+ assert_equal(temp_files_before, set(os.listdir(temp_folder)))
+ assert_equal(dest_files_before, set(os.listdir(destination)))
diff --git a/tests/nodegraph_test.py b/tests/nodegraph_test.py
new file mode 100644
index 0000000..c125d19
--- /dev/null
+++ b/tests/nodegraph_test.py
@@ -0,0 +1,146 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in
+# all copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+# Disable warning for missing docstring
+# pylint: disable=C0111
+# Disable warning caused by "invalid" function names
+# pylint: disable=C0103
+# Disable warning caused by touching private member variables/functions
+# TODO: Remove this / fix places touching privates
+# pylint: disable=W0212
+import os
+
+from flexmock import \
+ flexmock
+
+from paleomix.common.testing import \
+ with_temp_folder, \
+ set_file_contents
+
+from paleomix.nodegraph import \
+ NodeGraph, \
+ FileStatusCache
+
+
+def test_dir():
+ return os.path.dirname(__file__)
+
+
+def test_file(*args):
+ return os.path.join(test_dir(), "data", *args)
+
+
+_DESCRIPTION = "My description of a node"
+_IN_FILES = frozenset((test_file("empty_file_1"),
+ test_file("empty_file_2")))
+_OUT_FILES = frozenset((test_file("missing_out_file_1"),
+ test_file("missing_out_file_2")))
+_EXEC_FILES = frozenset(("ls", "sh"))
+_AUX_FILES = frozenset((test_file("rCRS.fasta"),
+ test_file("rCRS.fasta.fai")))
+_REQUIREMENTS = frozenset((id, str))
+
+
+###############################################################################
+###############################################################################
+# Setup timestamps for test files
+
+def setup_module():
+ timestamps = {test_file("timestamp_a_older"): 1000190760,
+ test_file("timestamp_b_older"): 1000190760,
+ test_file("timestamp_a_younger"): 1120719000,
+ test_file("timestamp_b_younger"): 1120719000}
+
+ for filename, timestamp in timestamps.iteritems():
+ # Set atime and mtime
+ os.utime(filename, (timestamp, timestamp))
+
+
+###############################################################################
+###############################################################################
+# NodeGraph: _is_done
+# TODO: Avoid testing private function, mock cache
+
+def test_nodegraph_is_done__no_output():
+ cache = FileStatusCache()
+ node = flexmock(output_files=())
+ assert NodeGraph.is_done(node, cache)
+
+
+@with_temp_folder
+def test_nodegraph_is_done__output_changes(temp_folder):
+ temp_file_1 = os.path.join(temp_folder, "file_1.txt")
+ temp_file_2 = os.path.join(temp_folder, "file_2.txt")
+ my_node = flexmock(output_files=(temp_file_1, temp_file_2))
+ assert not NodeGraph.is_done(my_node, FileStatusCache())
+ set_file_contents(temp_file_1, "foo")
+ assert not NodeGraph.is_done(my_node, FileStatusCache())
+ set_file_contents(temp_file_2, "bar")
+ assert NodeGraph.is_done(my_node, FileStatusCache())
+
+
+@with_temp_folder
+def test_nodegraph_is_done__subnode_not_considered(temp_folder):
+ temp_file = os.path.join(temp_folder, "file.txt")
+ subnode = flexmock(output_files=(temp_file,))
+ my_node = flexmock(output_files=(),
+ subnodes=(subnode,))
+ assert NodeGraph.is_done(my_node, FileStatusCache())
+
+
+def test_nodegraph_is_outdated__no_output():
+ my_node = flexmock(input_files=(),
+ output_files=())
+ assert not NodeGraph.is_outdated(my_node, FileStatusCache())
+
+
+def test_nodegraph_is_outdated__input_but_no_output():
+ my_node = flexmock(input_files=_IN_FILES,
+ output_files=())
+ assert not NodeGraph.is_outdated(my_node, FileStatusCache())
+
+
+def test_nodegraph_is_outdated__output_but_no_input():
+ my_node = flexmock(input_files=(),
+ output_files=_OUT_FILES)
+ assert not NodeGraph.is_outdated(my_node, FileStatusCache())
+
+
+def test_nodegraph_is_outdated__not_outdated():
+ my_node = flexmock(input_files=(test_file("timestamp_a_older"),),
+ output_files=(test_file("timestamp_a_younger"),))
+ assert not NodeGraph.is_outdated(my_node, FileStatusCache())
+
+
+def test_nodegraph_is_outdated__outdated():
+ my_node = flexmock(input_files=(test_file("timestamp_a_younger"),),
+ output_files=(test_file("timestamp_a_older"),))
+ assert NodeGraph.is_outdated(my_node, FileStatusCache())
+
+
+def test_nodegraph_is_outdated__updates():
+ my_node = flexmock(input_files=(test_file("timestamp_a_older"),),
+ output_files=(test_file("timestamp_a_younger"),))
+ assert not NodeGraph.is_outdated(my_node, FileStatusCache())
+ my_node = flexmock(input_files=(test_file("timestamp_a_younger"),),
+ output_files=(test_file("timestamp_a_older"),))
+ assert NodeGraph.is_outdated(my_node, FileStatusCache())
diff --git a/tests/run b/tests/run
new file mode 100755
index 0000000..2397383
--- /dev/null
+++ b/tests/run
@@ -0,0 +1,27 @@
+#!/bin/bash
+
+if ! nosetests --version &> /dev/null;
+then
+ echo "Could not run 'nosetests'; please ensure that nose is installed:" > /dev/stderr
+ echo " $ pip install nose" > /dev/stderr
+ exit 1
+fi
+
+py_cmd=$(head -n1 $(which nosetests) | sed -e's/^#!//')
+
+for module in flexmock coverage;
+do
+ if ! ${py_cmd} -c "import ${module}" &> /dev/null;
+ then
+ echo "Could import Python module '${module}'; please ensure that this module is installed:" > /dev/stderr
+ echo " $ pip install ${module}" > /dev/stderr
+ exit 1
+ fi
+done
+
+MODULES=$(find paleomix -mindepth 1 -maxdepth 1 -name '*.py' -or -type d | sed -e 's#\.py##g' -e's#/#.#g' | grep -v "paleomix.yaml" | grep -v __init__)
+nosetests -I ".*_flymake.py" tests/ --with-coverage $@ \
+ --cover-tests --cover-branches --cover-inclusive --cover-erase \
+ $(for module in unit $MODULES;do echo --cover-package=$module;done) \
+ 2>&1 | grep -v "[0-9]\+ \+0 \+[0-9]\+ \+0 \+100%"
+# --cover-html --cover-html-dir=tests/runs/coverage
diff --git a/tests/tools_test/factory_test.py b/tests/tools_test/factory_test.py
new file mode 100644
index 0000000..b772857
--- /dev/null
+++ b/tests/tools_test/factory_test.py
@@ -0,0 +1,121 @@
+#!/usr/bin/python
+#
+# Copyright (c) 2012 Mikkel Schubert <MSchubert at snm.ku.dk>
+#
+# Permission is hereby granted, free of charge, to any person obtaining a copy
+# of this software and associated documentation files (the "Software"), to deal
+# in the Software without restriction, including without limitation the rights
+# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+# copies of the Software, and to permit persons to whom the Software is
+# furnished to do so, subject to the following conditions:
+#
+# The above copyright notice and this permission notice shall be included in all
+# copies or substantial portions of the Software.
+#
+# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+# SOFTWARE.
+#
+import os
+import subprocess
+
+from nose.tools import \
+ assert_in, \
+ assert_equal
+
+import paleomix.tools.factory as factory
+
+
+###############################################################################
+###############################################################################
+
+class ProcError(RuntimeError):
+ pass
+
+
+def check_run(call, *args, **kwargs):
+ devnull = os.open(os.devnull, os.O_RDONLY)
+ kwargs.setdefault("stdin", devnull)
+ kwargs.setdefault("close_fds", True)
+ kwargs["stdout"] = subprocess.PIPE
+ kwargs["stderr"] = subprocess.PIPE
+
+ returncode = kwargs.pop("expected_returncode", 0)
+
+ proc = subprocess.Popen(call, *args, **kwargs)
+ os.close(devnull)
+
+ stdout, stderr = proc.communicate()
+ if proc.returncode != returncode:
+ raise ProcError("Command returned %i: %r:\nSTDOUT: %r\nSTDERR: %r"
+ % (proc.returncode, call, stdout, stderr))
+
+ return stdout, stderr
+
+
+# Simple test of the paleomix command
+def test_paleomix_command():
+ stdout, stderr = check_run(["paleomix"])
+
+ assert_equal("", stdout)
+ assert_in("PALEOMIX - pipelines and tools for NGS data analyses.", stderr)
+
+
+# Simple test that all commands can be executed
+def test_factory__commands():
+ def _do_test_factory__commands(command, expected):
+ cmd = factory.new(command)
+ call = cmd.finalized_call
+ if command in ("bam_pipeline", "trim_pipeline"):
+ call.append("run")
+
+ stdout, stderr = check_run(call + ["--help"])
+
+ assert_equal(expected, stdout.split("\n")[0])
+ assert_equal("", stderr)
+
+ commands = (("bam_pipeline", "Usage: paleomix bam_pipeline <command> [options] [makefiles]"),
+ ("trim_pipeline", "Usage: paleomix trim_pipeline <command> [options] [makefiles]"),
+ ("phylo_pipeline", "Usage: paleomix phylo_pipeline <command> [options] [makefiles]"),
+ ("cleanup", "usage: paleomix cleanup --temp-prefix prefix --fasta reference.fasta < in.sam"),
+ ("coverage", "usage: paleomix coverage [options] sorted.bam [out.coverage]"),
+ ("depths", "usage: paleomix depths [options] sorted.bam [out.depths]"),
+ ("duphist", "usage: paleomix duphist sorted.bam > out.histogram"),
+ ("rmdup_collapsed", "usage: paleomix rmdup_collapsed [options] < sorted.bam > out.bam"),
+ ("genotype", "usage: paleomix genotype [options] sorted.bam out.vcf.bgz"),
+ ("gtf_to_bed", "usage: paleomix gtf_to_bed [options] in.gtf out_prefix [in.scaffolds]"),
+ ("sample_pileup", "usage: paleomix sample_pileup [options] --genotype in.vcf --intervals in.bed > out.fasta"),
+ ("vcf_filter", "Usage: paleomix vcf_filter [options] [in1.vcf, ...]"),
+ ("vcf_to_fasta", "usage: paleomix vcf_to_fasta [options] --genotype in.vcf --intervals in.bed"),
+ ("cat", "usage: paleomix cat [-h] [--output OUTPUT] file [file ...]"))
+
+ for command, expected in commands:
+ yield _do_test_factory__commands, command, expected
+
+
+# Simple test of aliased commands
+def test_factory__command_alias():
+ def _do_test_factory__command_alias(alias, command):
+ alias, command = [alias], [command]
+ if alias == ["bam_pipeline"] or alias == ["trim_pipeline"]:
+ alias.append("run")
+ command.append("run")
+
+ stdout_1, stderr_1 = check_run(alias + ["--help"])
+ stdout_2, stderr_2 = check_run(["paleomix"] + command + ["--help"])
+
+    assert_equal(stdout_1, stdout_2)
+    assert_equal(stderr_1, stderr_2)
+
+ commands = (("bam_pipeline", "bam_pipeline"),
+ ("bam_rmdup_collapsed", "rmdup_collapsed"),
+ ("conv_gtf_to_bed", "gtf_to_bed"),
+ ("phylo_pipeline", "phylo_pipeline"),
+ ("trim_pipeline", "trim_pipeline"))
+
+ for alias, command in commands:
+ yield _do_test_factory__command_alias, alias, command
diff --git a/tox.ini b/tox.ini
new file mode 100644
index 0000000..0aff906
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,23 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py27
+
+[testenv]
+changedir = tests
+
+commands =
+ nosetests .
+
+deps =
+ nose
+ coverage
+ flexmock
+
+# Workaround for tox not installing paleomix itself on first run
+# https://bitbucket.org/hpk42/tox/issues/176/tox-doesnt-install-the-package-being
+install_command =
+ pip install -U {opts} {packages}
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/paleomix.git
More information about the debian-med-commit
mailing list