[med-svn] [grinder] 01/07: Imported Upstream version 0.5.4
Andreas Tille
tille at debian.org
Sun Jan 24 09:24:24 UTC 2016
This is an automated email from the git hooks/post-receive script.
tille pushed a commit to branch master
in repository grinder.
commit b0a5b3a5dac9d62cb1b178a01229ed5a0d0e148d
Author: Andreas Tille <tille at debian.org>
Date: Sun Jan 24 09:52:51 2016 +0100
Imported Upstream version 0.5.4
---
CHANGES | 6 +
LICENSE | 2 +-
MANIFEST | 20 +-
META.yml | 16 +-
MYMETA.json | 18 +-
MYMETA.yml | 39 +-
Makefile.PL | 21 +-
README | 13 +-
README.htm | 1180 +++++++++++++++-------------------
{script => bin}/grinder | 0
{script => bin}/grinder.pod | 16 +-
inc/Module/AutoInstall.pm | 44 +-
inc/Module/Install.pm | 22 +-
inc/Module/Install/AutoInstall.pm | 2 +-
inc/Module/Install/Base.pm | 2 +-
inc/Module/Install/Can.pm | 2 +-
inc/Module/Install/Fetch.pm | 2 +-
inc/Module/Install/Include.pm | 2 +-
inc/Module/Install/Makefile.pm | 4 +-
inc/Module/Install/Metadata.pm | 6 +-
inc/Module/Install/ReadmeFromPod.pm | 2 +-
inc/Module/Install/Scripts.pm | 2 +-
inc/Module/Install/Win32.pm | 2 +-
inc/Module/Install/WriteAll.pm | 2 +-
lib/Bio/DB/Fasta.pm | 455 -------------
lib/Bio/DB/IndexedBase.pm | 1104 -------------------------------
lib/Bio/PrimarySeq.pm | 951 ---------------------------
lib/Bio/PrimarySeqI.pm | 944 ---------------------------
lib/Bio/Seq/SeqFastaSpeedFactory.pm | 149 -----
lib/Bio/Seq/SimulatedRead.pm | 653 -------------------
lib/Bio/SeqFeature/Amplicon.pm | 168 -----
lib/Bio/SeqFeature/Primer.pm | 335 ----------
lib/Bio/SeqFeature/SubSeq.pm | 208 ------
lib/Bio/Tools/AmpliconSearch.pm | 564 ----------------
lib/Bio/Tools/IUPAC.pm | 560 ----------------
lib/Grinder.pm | 137 ++--
lib/Grinder/Database.pm | 3 +-
man/average_genome_size.1 | 33 +-
man/change_paired_read_orientation.1 | 33 +-
man/grinder.1 | 69 +-
t/01-shotgun.t | 20 +-
t/02-mates.t | 26 +-
t/05-forbidden.t | 2 +-
t/15-multiplex.t | 14 +-
t/17-libraries.t | 8 +-
t/20-community-structure.t | 6 +
t/21-errors.t | 32 +-
t/29-kmer-collection.t | 30 +-
t/32-database.t | 13 +-
t/TestUtils.pm | 7 +-
t/data/database_dna.fa.index | Bin 12288 -> 0 bytes
t/data/database_mixed.fa.index | Bin 12288 -> 0 bytes
t/data/database_protein.fa.index | Bin 12288 -> 0 bytes
t/data/database_rna.fa.index | Bin 12288 -> 0 bytes
t/data/shotgun_database.fa.index | Bin 12288 -> 0 bytes
55 files changed, 940 insertions(+), 7009 deletions(-)
diff --git a/CHANGES b/CHANGES
index a91c6ee..073a961 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,5 +1,11 @@
Revision history for Grinder
+0.5.4 18-Jan-2016
+ Fixed bug causing the last mate pair to sometimes miss its second read
+ (bug #13)
+ Improved Grinder's test suite with respect to Perl's hash randomization
+ (contributions from Francisco J. Ossandón)
+
0.5.3 30-May-2013
Completed fix for bug #6, multiplexed read close to length of reference
(reported by Ali May).
diff --git a/LICENSE b/LICENSE
index 5037ed5..2f039a0 100644
--- a/LICENSE
+++ b/LICENSE
@@ -1,4 +1,4 @@
-This software is Copyright (c) 2013 by Florent Angly <florent.angly at gmail.com>.
+This software is Copyright (c) 2016 by Florent Angly <florent.angly at gmail.com>.
This is free software, licensed under:
diff --git a/MANIFEST b/MANIFEST
index 7286b4f..d3c31bd 100644
--- a/MANIFEST
+++ b/MANIFEST
@@ -1,3 +1,5 @@
+bin/grinder
+bin/grinder.pod
CHANGES
galaxy/all_fasta.loc.sample
galaxy/Galaxy_readme.txt
@@ -21,17 +23,6 @@ inc/Module/Install/ReadmeFromPod.pm
inc/Module/Install/Scripts.pm
inc/Module/Install/Win32.pm
inc/Module/Install/WriteAll.pm
-lib/Bio/DB/Fasta.pm
-lib/Bio/DB/IndexedBase.pm
-lib/Bio/PrimarySeq.pm
-lib/Bio/PrimarySeqI.pm
-lib/Bio/Seq/SeqFastaSpeedFactory.pm
-lib/Bio/Seq/SimulatedRead.pm
-lib/Bio/SeqFeature/Amplicon.pm
-lib/Bio/SeqFeature/Primer.pm
-lib/Bio/SeqFeature/SubSeq.pm
-lib/Bio/Tools/AmpliconSearch.pm
-lib/Bio/Tools/IUPAC.pm
lib/Grinder.pm
lib/Grinder/Database.pm
lib/Grinder/KmerCollection.pm
@@ -46,8 +37,6 @@ MYMETA.json
MYMETA.yml
README
README.htm
-script/grinder
-script/grinder.pod
t/00-load.t
t/01-shotgun.t
t/02-mates.t
@@ -87,13 +76,9 @@ t/data/abundances2.txt
t/data/abundances_multiple.txt
t/data/amplicon_database.fa
t/data/database_dna.fa
-t/data/database_dna.fa.index
t/data/database_mixed.fa
-t/data/database_mixed.fa.index
t/data/database_protein.fa
-t/data/database_protein.fa.index
t/data/database_rna.fa
-t/data/database_rna.fa.index
t/data/dirty_database.fa
t/data/forward_primer.fa
t/data/forward_reverse_primers.fa
@@ -109,7 +94,6 @@ t/data/revcom_amplicon_database.fa
t/data/reverse_forward_primers.fa
t/data/reverse_primer.fa
t/data/shotgun_database.fa
-t/data/shotgun_database.fa.index
t/data/shotgun_database_extended.fa
t/data/shotgun_database_shared_kmers.fa
t/data/single_amplicon_database.fa
diff --git a/META.yml b/META.yml
index d74e16a..6336593 100644
--- a/META.yml
+++ b/META.yml
@@ -5,11 +5,12 @@ author:
build_requires:
ExtUtils::MakeMaker: 6.59
Test::More: 0
+ Test::Warn: 0
configure_requires:
ExtUtils::MakeMaker: 6.59
distribution_type: module
dynamic_config: 1
-generated_by: 'Module::Install version 1.06'
+generated_by: 'Module::Install version 1.16'
license: gpl3
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
@@ -24,16 +25,19 @@ requires:
Bio::Location::Split: 0
Bio::PrimarySeq: 0
Bio::Root::Root: 0
- Bio::Root::Version: 1.006901
+ Bio::Root::Version: '1.006923'
+ Bio::Seq::SimulatedRead: 0
+ Bio::SeqFeature::SubSeq: 0
Bio::SeqIO: 0
- Getopt::Euclid: 0.3.4
+ Bio::Tools::AmpliconSearch: 0
+ Getopt::Euclid: 0.4.4
List::Util: 0
- Math::Random::MT: 1.16
+ Math::Random::MT: '1.16'
perl: 5.6.0
- version: 0.77
+ version: '0.77'
resources:
bugtracker: http://sourceforge.net/tracker/?group_id=244196&atid=1124737
homepage: http://sourceforge.net/projects/biogrinder/
license: http://opensource.org/licenses/gpl-3.0.html
repository: git://biogrinder.git.sourceforge.net/gitroot/biogrinder/biogrinder
-version: 0.005003
+version: '0.005004'
diff --git a/MYMETA.json b/MYMETA.json
index bae85bf..961a309 100644
--- a/MYMETA.json
+++ b/MYMETA.json
@@ -4,7 +4,7 @@
"Florent Angly <florent.angly at gmail.com>"
],
"dynamic_config" : 0,
- "generated_by" : "Module::Install version 1.06, CPAN::Meta::Converter version 2.120921",
+ "generated_by" : "Module::Install version 1.16, CPAN::Meta::Converter version 2.150005",
"license" : [
"unknown"
],
@@ -23,12 +23,13 @@
"build" : {
"requires" : {
"ExtUtils::MakeMaker" : "6.59",
- "Test::More" : "0"
+ "Test::More" : "0",
+ "Test::Warn" : "0"
}
},
"configure" : {
"requires" : {
- "ExtUtils::MakeMaker" : "6.59"
+ "ExtUtils::MakeMaker" : "0"
}
},
"runtime" : {
@@ -37,9 +38,12 @@
"Bio::Location::Split" : "0",
"Bio::PrimarySeq" : "0",
"Bio::Root::Root" : "0",
- "Bio::Root::Version" : "1.006901",
+ "Bio::Root::Version" : "1.006923",
+ "Bio::Seq::SimulatedRead" : "0",
+ "Bio::SeqFeature::SubSeq" : "0",
"Bio::SeqIO" : "0",
- "Getopt::Euclid" : "v0.3.4",
+ "Bio::Tools::AmpliconSearch" : "0",
+ "Getopt::Euclid" : "v0.4.4",
"List::Util" : "0",
"Math::Random::MT" : "1.16",
"perl" : "5.006",
@@ -57,8 +61,10 @@
"http://opensource.org/licenses/gpl-3.0.html"
],
"repository" : {
+ "type" : "git",
"url" : "git://biogrinder.git.sourceforge.net/gitroot/biogrinder/biogrinder"
}
},
- "version" : "0.005002"
+ "version" : "0.005004",
+ "x_serialization_backend" : "JSON::PP version 2.27300"
}
diff --git a/MYMETA.yml b/MYMETA.yml
index fa97057..d269fc3 100644
--- a/MYMETA.yml
+++ b/MYMETA.yml
@@ -3,36 +3,41 @@ abstract: 'A versatile omics shotgun and amplicon sequencing read simulator'
author:
- 'Florent Angly <florent.angly at gmail.com>'
build_requires:
- ExtUtils::MakeMaker: 6.59
- Test::More: 0
+ ExtUtils::MakeMaker: '6.59'
+ Test::More: '0'
+ Test::Warn: '0'
configure_requires:
- ExtUtils::MakeMaker: 6.59
+ ExtUtils::MakeMaker: '0'
dynamic_config: 0
-generated_by: 'Module::Install version 1.06, CPAN::Meta::Converter version 2.120921'
+generated_by: 'Module::Install version 1.16, CPAN::Meta::Converter version 2.150005'
license: unknown
meta-spec:
url: http://module-build.sourceforge.net/META-spec-v1.4.html
- version: 1.4
+ version: '1.4'
name: Grinder
no_index:
directory:
- inc
- t
requires:
- Bio::DB::Fasta: 0
- Bio::Location::Split: 0
- Bio::PrimarySeq: 0
- Bio::Root::Root: 0
- Bio::Root::Version: 1.006901
- Bio::SeqIO: 0
- Getopt::Euclid: v0.3.4
- List::Util: 0
- Math::Random::MT: 1.16
- perl: 5.006
- version: 0.77
+ Bio::DB::Fasta: '0'
+ Bio::Location::Split: '0'
+ Bio::PrimarySeq: '0'
+ Bio::Root::Root: '0'
+ Bio::Root::Version: '1.006923'
+ Bio::Seq::SimulatedRead: '0'
+ Bio::SeqFeature::SubSeq: '0'
+ Bio::SeqIO: '0'
+ Bio::Tools::AmpliconSearch: '0'
+ Getopt::Euclid: v0.4.4
+ List::Util: '0'
+ Math::Random::MT: '1.16'
+ perl: '5.006'
+ version: '0.77'
resources:
bugtracker: http://sourceforge.net/tracker/?group_id=244196&atid=1124737
homepage: http://sourceforge.net/projects/biogrinder/
license: http://opensource.org/licenses/gpl-3.0.html
repository: git://biogrinder.git.sourceforge.net/gitroot/biogrinder/biogrinder
-version: 0.005002
+version: '0.005004'
+x_serialization_backend: 'CPAN::Meta::YAML version 0.012'
diff --git a/Makefile.PL b/Makefile.PL
index 9470e4d..69746a1 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -16,17 +16,17 @@ resources
# Dependencies for everyone
build_requires 'Test::More' => 0; # first released with Perl v5.6.2
-requires 'Bio::Root::Version' => '1.006901'; # Bioperl
+build_requires 'Test::Warn' => 0;
+requires 'Bio::Root::Version' => '1.006923'; # Bioperl v1.6.923
requires 'Bio::DB::Fasta' => 0;
requires 'Bio::Location::Split' => 0;
requires 'Bio::PrimarySeq' => 0;
requires 'Bio::Root::Root' => 0;
requires 'Bio::SeqIO' => 0;
-# Bioperl modules required, but packaged with Grinder since they are too recent to have had a release yet
-#requires 'Bio::SeqFeature::SubSeq' => 0;
-#requires 'Bio::Seq::SimulatedRead' => 0;
-#requires 'Bio::Tools::AmpliconSearch' => 0;
-requires 'Getopt::Euclid' => '0.3.4';
+requires 'Bio::SeqFeature::SubSeq' => 0;
+requires 'Bio::Seq::SimulatedRead' => 0;
+requires 'Bio::Tools::AmpliconSearch' => 0;
+requires 'Getopt::Euclid' => '0.4.4';
requires 'List::Util' => 0; # first released with Perl v5.7.3
requires 'Math::Random::MT' => '1.16';
requires 'version' => '0.77'; # first released with Perl v5.9.0
@@ -40,7 +40,8 @@ author_requires 'Module::Install::AutoLicense';
author_requires 'Module::Install::PodFromEuclid';
author_requires 'Module::Install::ReadmeFromPod' => '0.14';
author_requires 'Module::Install::AutoManifest';
-author_requires 'Statistics::R' => '0.21';
+author_requires 'Statistics::R' => '0.32';
+# Also install R and the fitdistrplus R library
# Bundle dependencies
@@ -57,7 +58,7 @@ auto_install;
# Extra scripts to install
-install_script 'script/grinder';
+install_script 'bin/grinder';
install_script 'utils/average_genome_size';
install_script 'utils/change_paired_read_orientation';
@@ -88,8 +89,8 @@ auto_doc();
sub auto_doc {
print "*** Building doc...\n";
- pod_from 'script/grinder';
- my $grinder = 'script/grinder.pod';
+ pod_from 'bin/grinder';
+ my $grinder = 'bin/grinder.pod';
my $script1 = 'utils/average_genome_size';
my $script2 = 'utils/change_paired_read_orientation';
my $clean = 1;
diff --git a/README b/README
index a944e5c..6ea51fd 100644
--- a/README
+++ b/README
@@ -84,7 +84,7 @@ CITATION
Available from <http://dx.doi.org/10.1093/nar/gks251>.
VERSION
- This document refers to grinder version 0.5.2
+ This document refers to grinder version 0.5.3
AUTHOR
Florent Angly <florent.angly at gmail.com>
@@ -517,9 +517,10 @@ CLI OPTIONAL ARGUMENTS
-mi <multiplex_ids> | -multiplex_ids <multiplex_ids>
Specify an optional FASTA file that contains multiplex sequence
identifiers (a.k.a MIDs or barcodes) to add to the sequences (one
- sequence per library). The MIDs are included in the length specified
- with the -read_dist option and can be altered by sequencing errors.
- See the MIDesigner or BarCrawl programs to generate MID sequences.
+ sequence per library, in the order given). The MIDs are included in
+ the length specified with the -read_dist option and can be altered
+ by sequencing errors. See the MIDesigner or BarCrawl programs to
+ generate MID sequences.
-di <diversity>... | -diversity <diversity>...
This option specifies alpha diversity, specifically the richness,
@@ -539,7 +540,7 @@ CLI OPTIONAL ARGUMENTS
This option controls another aspect of beta-diversity. For multiple
libraries, choose the percent of the most-abundant reference
sequences to permute (randomly shuffle) the rank-abundance of.
- Default: 0 %
+ Default: 100 %
-rs <random_seed> | -random_seed <random_seed>
Seed number to use for the pseudo-random number generator.
@@ -710,7 +711,7 @@ API METHODS
Returns : seed number
COPYRIGHT
- Copyright 2009-2012 Florent ANGLY <florent.angly at gmail.com>
+ Copyright 2009-2013 Florent ANGLY <florent.angly at gmail.com>
Grinder is free software: you can redistribute it and/or modify it under
the terms of the GNU General Public License (GPL) as published by the
diff --git a/README.htm b/README.htm
index ed036f9..979ac80 100644
--- a/README.htm
+++ b/README.htm
@@ -2,756 +2,626 @@
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<head>
-<title>grinder - A versatile omics shotgun and amplicon sequencing read simulator</title>
+<title></title>
<meta http-equiv="content-type" content="text/html; charset=utf-8" />
<link rev="made" href="mailto:root at localhost" />
</head>
-<body style="background-color: white">
+<body>
+
+
+
+<ul id="index">
+ <li><a href="#NAME">NAME</a></li>
+ <li><a href="#DESCRIPTION">DESCRIPTION</a></li>
+ <li><a href="#CITATION">CITATION</a></li>
+ <li><a href="#VERSION">VERSION</a></li>
+ <li><a href="#AUTHOR">AUTHOR</a></li>
+ <li><a href="#INSTALLATION">INSTALLATION</a>
+ <ul>
+ <li><a href="#Dependencies">Dependencies</a></li>
+ <li><a href="#Procedure">Procedure</a></li>
+ <li><a href="#No-administrator-privileges">No administrator privileges?</a></li>
+ </ul>
+ </li>
+ <li><a href="#RUNNING-GRINDER">RUNNING GRINDER</a></li>
+ <li><a href="#REFERENCE-SEQUENCE-DATABASE">REFERENCE SEQUENCE DATABASE</a></li>
+ <li><a href="#CLI-EXAMPLES">CLI EXAMPLES</a></li>
+ <li><a href="#CLI-REQUIRED-ARGUMENTS">CLI REQUIRED ARGUMENTS</a></li>
+ <li><a href="#CLI-OPTIONAL-ARGUMENTS">CLI OPTIONAL ARGUMENTS</a></li>
+ <li><a href="#CLI-OUTPUT">CLI OUTPUT</a></li>
+ <li><a href="#API-EXAMPLES">API EXAMPLES</a></li>
+ <li><a href="#API-METHODS">API METHODS</a>
+ <ul>
+ <li><a href="#new">new</a></li>
+ <li><a href="#next_lib">next_lib</a></li>
+ <li><a href="#next_read">next_read</a></li>
+ <li><a href="#get_random_seed">get_random_seed</a></li>
+ </ul>
+ </li>
+ <li><a href="#COPYRIGHT">COPYRIGHT</a></li>
+ <li><a href="#BUGS">BUGS</a></li>
+</ul>
+<h1 id="NAME">NAME</h1>
-<!-- INDEX BEGIN -->
-<div name="index">
-<p><a name="__index__"></a></p>
+<p>grinder - A versatile omics shotgun and amplicon sequencing read simulator</p>
-<ul>
+<h1 id="DESCRIPTION">DESCRIPTION</h1>
- <li><a href="#name">NAME</a></li>
- <li><a href="#description">DESCRIPTION</a></li>
- <li><a href="#citation">CITATION</a></li>
- <li><a href="#version">VERSION</a></li>
- <li><a href="#author">AUTHOR</a></li>
- <li><a href="#installation">INSTALLATION</a></li>
- <ul>
-
- <li><a href="#dependencies">Dependencies</a></li>
- <li><a href="#procedure">Procedure</a></li>
- <li><a href="#no_administrator_privileges">No administrator privileges?</a></li>
- </ul>
-
- <li><a href="#running_grinder">RUNNING GRINDER</a></li>
- <li><a href="#reference_sequence_database">REFERENCE SEQUENCE DATABASE</a></li>
- <li><a href="#cli_examples">CLI EXAMPLES</a></li>
- <li><a href="#cli_required_arguments">CLI REQUIRED ARGUMENTS</a></li>
- <li><a href="#cli_optional_arguments">CLI OPTIONAL ARGUMENTS</a></li>
- <li><a href="#cli_output">CLI OUTPUT</a></li>
- <li><a href="#api_examples">API EXAMPLES</a></li>
- <li><a href="#api_methods">API METHODS</a></li>
- <ul>
-
- <li><a href="#new">new</a></li>
- <li><a href="#next_lib">next_lib</a></li>
- <li><a href="#next_read">next_read</a></li>
- <li><a href="#get_random_seed">get_random_seed</a></li>
- </ul>
-
- <li><a href="#copyright">COPYRIGHT</a></li>
- <li><a href="#bugs">BUGS</a></li>
-</ul>
+<p>Grinder is a versatile program to create random shotgun and amplicon sequence libraries based on DNA, RNA or proteic reference sequences provided in a FASTA file.</p>
-<hr name="index" />
-</div>
-<!-- INDEX END -->
+<p>Grinder can produce genomic, metagenomic, transcriptomic, metatranscriptomic, proteomic, metaproteomic shotgun and amplicon datasets from current sequencing technologies such as Sanger, 454, Illumina. These simulated datasets can be used to test the accuracy of bioinformatic tools under specific hypothesis, e.g. with or without sequencing errors, or with low or high community diversity. Grinder may also be used to help decide between alternative sequencing methods for a sequence-based project, e.g. should the library be paired-end or not, how many reads should be sequenced.</p>
-<p>
-</p>
-<h1><a name="name">NAME</a></h1>
-<p>grinder - A versatile omics shotgun and amplicon sequencing read simulator</p>
-<p>
-</p>
-<hr />
-<h1><a name="description">DESCRIPTION</a></h1>
-<p>Grinder is a versatile program to create random shotgun and amplicon sequence
-libraries based on DNA, RNA or proteic reference sequences provided in a FASTA
-file.</p>
-<p>Grinder can produce genomic, metagenomic, transcriptomic, metatranscriptomic,
-proteomic, metaproteomic shotgun and amplicon datasets from current sequencing
-technologies such as Sanger, 454, Illumina. These simulated datasets can be used
-to test the accuracy of bioinformatic tools under specific hypothesis, e.g. with
-or without sequencing errors, or with low or high community diversity. Grinder
-may also be used to help decide between alternative sequencing methods for a
-sequence-based project, e.g. should the library be paired-end or not, how many
-reads should be sequenced.</p>
<p>Grinder features include:</p>
+
<ul>
-<li>
-<p>shotgun or amplicon read libraries</p>
+
+<li><p>shotgun or amplicon read libraries</p>
+
</li>
-<li>
-<p>omics support to generate genomic, transcriptomic, proteomic,
-metagenomic, metatranscriptomic or metaproteomic datasets</p>
+<li><p>omics support to generate genomic, transcriptomic, proteomic, metagenomic, metatranscriptomic or metaproteomic datasets</p>
+
</li>
-<li>
-<p>arbitrary read length distribution and number of reads</p>
+<li><p>arbitrary read length distribution and number of reads</p>
+
</li>
-<li>
-<p>simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)</p>
+<li><p>simulation of PCR and sequencing errors (chimeras, point mutations, homopolymers)</p>
+
</li>
-<li>
-<p>support for paired-end (mate pair) datasets</p>
+<li><p>support for paired-end (mate pair) datasets</p>
+
</li>
-<li>
-<p>specific rank-abundance settings or manually given abundance for each genome, gene or protein</p>
+<li><p>specific rank-abundance settings or manually given abundance for each genome, gene or protein</p>
+
</li>
-<li>
-<p>creation of datasets with a given richness (alpha diversity)</p>
+<li><p>creation of datasets with a given richness (alpha diversity)</p>
+
</li>
-<li>
-<p>independent datasets can share a variable number of genomes (beta diversity)</p>
+<li><p>independent datasets can share a variable number of genomes (beta diversity)</p>
+
</li>
-<li>
-<p>modeling of the bias created by varying genome lengths or gene copy number</p>
+<li><p>modeling of the bias created by varying genome lengths or gene copy number</p>
+
</li>
-<li>
-<p>profile mechanism to store preferred options</p>
+<li><p>profile mechanism to store preferred options</p>
+
</li>
-<li>
-<p>available to biologists or power users through multiple interfaces: GUI, CLI and API</p>
+<li><p>available to biologists or power users through multiple interfaces: GUI, CLI and API</p>
+
</li>
</ul>
-<p>Briefly, given a FASTA file containing reference sequence (genomes, genes,
-transcripts or proteins), Grinder performs the following steps:</p>
+
+<p>Briefly, given a FASTA file containing reference sequence (genomes, genes, transcripts or proteins), Grinder performs the following steps:</p>
+
<ol>
-<li>
-<p>Read the reference sequences, and for amplicon datasets, extracts full-length
-reference PCR amplicons using the provided degenerate PCR primers.</p>
-</li>
-<li>
-<p>Determine the community structure based on the provided alpha diversity (number
-of reference sequences in the library), beta diversity (number of reference
-sequences in common between several independent libraries) and specified rank-
-abundance model.</p>
-</li>
-<li>
-<p>Take shotgun reads from the reference sequences or amplicon reads from the full-
-length reference PCR amplicons. The reads may be paired-end reads when an insert
-size distribution is specified. The length of the reads depends on the provided
-read length distribution and their abundance depends on the relative abundance
-in the community structure. Genome length may also biases the number of reads to
-take for shotgun datasets at this step. Similarly, for amplicon datasets, the
-number of copies of the target gene in the reference genomes may bias the number
-of reads to take.</p>
-</li>
-<li>
-<p>Alter reads by inserting sequencing errors (indels, substitutions and homopolymer
-errors) following a position-specific model to simulate reads created by current
-sequencing technologies (Sanger, 454, Illumina). Write the reads and their
-quality scores in FASTA, QUAL and FASTQ files.</p>
+
+<li><p>Read the reference sequences, and for amplicon datasets, extracts full-length reference PCR amplicons using the provided degenerate PCR primers.</p>
+
+</li>
+<li><p>Determine the community structure based on the provided alpha diversity (number of reference sequences in the library), beta diversity (number of reference sequences in common between several independent libraries) and specified rank- abundance model.</p>
+
+</li>
+<li><p>Take shotgun reads from the reference sequences or amplicon reads from the full- length reference PCR amplicons. The reads may be paired-end reads when an insert size distribution is specified. The length of the reads depends on the provided read length distribution and their abundance depends on the relative abundance in the community structure. Genome length may also biases the number of reads to take for shotgun datasets at this step. Similarly, for amplicon datasets, the number of copies of the target gene in the reference genomes may bias the number of reads to take.</p>
+
+</li>
+<li><p>Alter reads by inserting sequencing errors (indels, substitutions and homopolymer errors) following a position-specific model to simulate reads created by current sequencing technologies (Sanger, 454, Illumina). Write the reads and their quality scores in FASTA, QUAL and FASTQ files.</p>
+
</li>
</ol>
-<p>
-</p>
-<hr />
-<h1><a name="citation">CITATION</a></h1>
+
+<h1 id="CITATION">CITATION</h1>
+
<p>If you use Grinder in your research, please cite:</p>
-<pre>
- Angly FE, Willner D, Rohwer F, Hugenholtz P, Tyson GW (2012), Grinder: a
- versatile amplicon and shotgun sequence simulator, Nucleic Acids Reseach</pre>
+
+<pre><code> Angly FE, Willner D, Rohwer F, Hugenholtz P, Tyson GW (2012), Grinder: a
+ versatile amplicon and shotgun sequence simulator, Nucleic Acids Reseach</code></pre>
+
<p>Available from <a href="http://dx.doi.org/10.1093/nar/gks251">http://dx.doi.org/10.1093/nar/gks251</a>.</p>
-<p>
-</p>
-<hr />
-<h1><a name="version">VERSION</a></h1>
-<p>This document refers to grinder version 0.5.2</p>
-<p>
-</p>
-<hr />
-<h1><a name="author">AUTHOR</a></h1>
-<p>Florent Angly <<a href="mailto:florent.angly at gmail.com">florent.angly at gmail.com</a>></p>
-<p>
-</p>
-<hr />
-<h1><a name="installation">INSTALLATION</a></h1>
-<p>
-</p>
-<h2><a name="dependencies">Dependencies</a></h2>
+
+<h1 id="VERSION">VERSION</h1>
+
+<p>This document refers to grinder version 0.5.3</p>
+
+<h1 id="AUTHOR">AUTHOR</h1>
+
+<p>Florent Angly <florent.angly at gmail.com></p>
+
+<h1 id="INSTALLATION">INSTALLATION</h1>
+
+<h2 id="Dependencies">Dependencies</h2>
+
<p>You need to install these dependencies first:</p>
+
<ul>
-<li>
-<p>Perl (>= 5.6)</p>
+
+<li><p>Perl (>= 5.6)</p>
+
<p><a href="http://www.perl.com/download.csp">http://www.perl.com/download.csp</a></p>
+
</li>
-<li>
-<p>make</p>
-<p>Many systems have make installed by default. If your system does not, you should
-install the implementation of make of your choice, e.g. GNU make: <a href="http://www.gnu.org/s/make/">http://www.gnu.org/s/make/</a></p>
+<li><p>make</p>
+
+<p>Many systems have make installed by default. If your system does not, you should install the implementation of make of your choice, e.g. GNU make: <a href="http://www.gnu.org/s/make/">http://www.gnu.org/s/make/</a></p>
+
</li>
</ul>
-<p>The following CPAN Perl modules are dependencies that will be installed automatically
-for you:</p>
+
+<p>The following CPAN Perl modules are dependencies that will be installed automatically for you:</p>
+
<ul>
-<li>
-<p>Bioperl modules (>=1.6.901).</p>
+
+<li><p>Bioperl modules (>=1.6.901).</p>
+
<p>Note that some unreleased Bioperl modules have been included in Grinder.</p>
+
</li>
-<li>
-<p>Getopt::Euclid (>= 0.3.4)</p>
+<li><p>Getopt::Euclid (>= 0.3.4)</p>
+
</li>
-<li>
-<p>List::Util</p>
+<li><p>List::Util</p>
+
<p>First released with Perl v5.7.3</p>
+
</li>
-<li>
-<p>Math::Random::MT (>= 1.13)</p>
+<li><p>Math::Random::MT (>= 1.13)</p>
+
</li>
-<li>
-<p>version (>= 0.77)</p>
+<li><p>version (>= 0.77)</p>
+
<p>First released with Perl v5.9.0</p>
+
</li>
</ul>
-<p>
-</p>
-<h2><a name="procedure">Procedure</a></h2>
-<p>To install Grinder globally on your system, run the following commands in a
-terminal or command prompt:</p>
+
+<h2 id="Procedure">Procedure</h2>
+
+<p>To install Grinder globally on your system, run the following commands in a terminal or command prompt:</p>
+
<p>On Linux, Unix, MacOS:</p>
-<pre>
- perl Makefile.PL
- make</pre>
+
+<pre><code> perl Makefile.PL
+ make</code></pre>
+
<p>And finally, with administrator privileges:</p>
-<pre>
- make install</pre>
+
+<pre><code> make install</code></pre>
+
<p>On Windows, run the same commands but with nmake instead of make.</p>
-<p>
-</p>
-<h2><a name="no_administrator_privileges">No administrator privileges?</a></h2>
-<p>If you do not have administrator privileges, Grinder needs to be installed in
-your home directory.</p>
-<p>First, follow the instructions to install local::lib
-at <a href="http://search.cpan.org/~apeiron/local-lib-1.008004/lib/local/lib.pm#The_bootstrapping_technique">http://search.cpan.org/~apeiron/local-lib-1.008004/lib/local/lib.pm#The_bootstrapping_technique</a>. After local::lib is installed, every Perl
-module that you install manually or through the CPAN command-line application
-will be installed in your home directory.</p>
-<p>Then, install Grinder by following the instructions detailed in the "Procedure"
-section.</p>
-<p>
-</p>
-<hr />
-<h1><a name="running_grinder">RUNNING GRINDER</a></h1>
-<p>After installation, you can run Grinder using a command-line interface (CLI),
-an application programming interface (API) or a graphical user interface (GUI)
-in Galaxy.</p>
+
+<h2 id="No-administrator-privileges">No administrator privileges?</h2>
+
+<p>If you do not have administrator privileges, Grinder needs to be installed in your home directory.</p>
+
+<p>First, follow the instructions to install local::lib at <a href="http://search.cpan.org/~apeiron/local-lib-1.008004/lib/local/lib.pm#The_bootstrapping_technique">http://search.cpan.org/~apeiron/local-lib-1.008004/lib/local/lib.pm#The_bootstrapping_technique</a>. After local::lib is installed, every Perl module that you install manually or through the CPAN command-line application will be installed in your home directory.</p>
+
+<p>Then, install Grinder by following the instructions detailed in the "Procedure" section.</p>
+
+<h1 id="RUNNING-GRINDER">RUNNING GRINDER</h1>
+
+<p>After installation, you can run Grinder using a command-line interface (CLI), an application programming interface (API) or a graphical user interface (GUI) in Galaxy.</p>
+
<p>To get the usage of the CLI, type:</p>
-<pre>
- grinder --help</pre>
-<p>More information, including the documentation of the Grinder API, which allows
-you to run Grinder from within other Perl programs, is available by typing:</p>
-<pre>
- perldoc Grinder</pre>
+
+<pre><code> grinder --help</code></pre>
+
+<p>More information, including the documentation of the Grinder API, which allows you to run Grinder from within other Perl programs, is available by typing:</p>
+
+<pre><code> perldoc Grinder</code></pre>
+
<p>To run the GUI, refer to the Galaxy documentation at <a href="http://wiki.g2.bx.psu.edu/FrontPage">http://wiki.g2.bx.psu.edu/FrontPage</a>.</p>
-<p>The 'utils' folder included in the Grinder package contains some utilities:</p>
+
+<p>The 'utils' folder included in the Grinder package contains some utilities:</p>
+
<dl>
-<dt><strong><a name="average_genome_size" class="item">average genome size:</a></strong></dt>
+<dt id="average-genome-size">average genome size:</dt>
<dd>
-<p>This calculates the average genome size (in bp) of a simulated random library
-produced by Grinder.</p>
-</dd>
-<dt><strong><a name="change_paired_read_orientation" class="item">change_paired_read_orientation:</a></strong></dt>
+<p>This calculates the average genome size (in bp) of a simulated random library produced by Grinder.</p>
+
+</dd>
+<dt id="change_paired_read_orientation">change_paired_read_orientation:</dt>
<dd>
-<p>This reverses the orientation of each second mate-pair read (ID ending in /2)
-in a FASTA file.</p>
+
+<p>This reverses the orientation of each second mate-pair read (ID ending in /2) in a FASTA file.</p>
+
</dd>
</dl>
-<p>
-</p>
-<hr />
-<h1><a name="reference_sequence_database">REFERENCE SEQUENCE DATABASE</a></h1>
-<p>A variety of FASTA databases can be used as input for Grinder. For example, the
-GreenGenes database (<a href="http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta">http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta</a>)
-contains over 180,000 16S rRNA clone sequences from various species which would
-be appropriate to produce a 16S rRNA amplicon dataset. A set of over 41,000 OTU
-representative sequences and their affiliation in seven different taxonomic
-sytems can also be used for the same purpose (<a href="http://greengenes.lbl.gov/Download/OTUs/gg_otus_6oct2010/rep_set/gg_97_otus_6oct2010.fasta">http://greengenes.lbl.gov/Download/OTUs/gg_otus_6oct2010/rep_set/gg_97_otus_6oct2010.fasta</a>
-and <a href="http://greengenes.lbl.gov/Download/OTUs/gg_otus_6oct2010/taxonomies/">http://greengenes.lbl.gov/Download/OTUs/gg_otus_6oct2010/taxonomies/</a>). The
-RDP (<a href="http://rdp.cme.msu.edu/download/release10_27_unaligned.fa.gz">http://rdp.cme.msu.edu/download/release10_27_unaligned.fa.gz</a>) and Silva
-(<a href="http://www.arb-silva.de/no_cache/download/archive/release_108/Exports/">http://www.arb-silva.de/no_cache/download/archive/release_108/Exports/</a>)
-databases also provide many 16S rRNA sequences and Silva includes eukaryotic
-sequences. While 16S rRNA is a popular gene, datasets containing any type of gene
-could be used in the same fashion to generate simulated amplicon datasets, provided
-appropriate primers are used.</p>
-<p>The >2,400 curated microbial genome sequences in the NCBI RefSeq collection
-(<a href="ftp://ftp.ncbi.nih.gov/refseq/release/microbial/">ftp://ftp.ncbi.nih.gov/refseq/release/microbial/</a>) would also be suitable for
-producing 16S rRNA simulated datasets (using the adequate primers). However, the
-lower diversity of this database compared to the previous two makes it more
-appropriate for producing artificial microbial metagenomes. Individual genomes
-from this database are also very suitable for the simulation of single or
-double-barreled shotgun libraries. Similarly, the RefSeq database contains
-over 3,100 curated viral sequences (<a href="ftp://ftp.ncbi.nih.gov/refseq/release/viral/">ftp://ftp.ncbi.nih.gov/refseq/release/viral/</a>)
-which can be used to produce artificial viral metagenomes.</p>
-<p>Quite a few eukaryotic organisms have been sequenced and their genome or genes
-can be the basis for simulating genomic, transcriptomic (RNA-seq) or proteomic
-datasets. For example, you can use the human genome available at
-<a href="ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/">ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/</a>, the human transcripts
-downloadable from <a href="ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz">ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz</a>
-or the human proteome at <a href="ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.protein.faa.gz">ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.protein.faa.gz</a>.</p>
-<p>
-</p>
-<hr />
-<h1><a name="cli_examples">CLI EXAMPLES</a></h1>
+
+<h1 id="REFERENCE-SEQUENCE-DATABASE">REFERENCE SEQUENCE DATABASE</h1>
+
+<p>A variety of FASTA databases can be used as input for Grinder. For example, the GreenGenes database (<a href="http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta">http://greengenes.lbl.gov/Download/Sequence_Data/Fasta_data_files/Isolated_named_strains_16S_aligned.fasta</a>) contains over 180,000 16S rRNA clone sequences from various species which would be appropriate to produce a 16S rRNA amplicon dataset. A set of over 41,000 OTU [...]
+
+<p>The >2,400 curated microbial genome sequences in the NCBI RefSeq collection (<a href="ftp://ftp.ncbi.nih.gov/refseq/release/microbial/">ftp://ftp.ncbi.nih.gov/refseq/release/microbial/</a>) would also be suitable for producing 16S rRNA simulated datasets (using the adequate primers). However, the lower diversity of this database compared to the previous two makes it more appropriate for producing artificial microbial metagenomes. Individual genomes from this database are also very [...]
+
+<p>Quite a few eukaryotic organisms have been sequenced and their genome or genes can be the basis for simulating genomic, transcriptomic (RNA-seq) or proteomic datasets. For example, you can use the human genome available at <a href="ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/">ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/RefSeqGene/</a>, the human transcripts downloadable from <a href="ftp://ftp.ncbi.nih.gov/refseq/H_sapiens/mRNA_Prot/human.rna.fna.gz">ftp://ftp.ncbi.nih.gov/refseq/ [...]
+
+<h1 id="CLI-EXAMPLES">CLI EXAMPLES</h1>
+
<p>Here are a few examples that illustrate the use of Grinder in a terminal:</p>
+
<ol>
-<li>
-<p>A shotgun DNA library with a coverage of 0.1X</p>
-<pre>
- grinder -reference_file genomes.fna -coverage_fold 0.1</pre>
-</li>
-<li>
-<p>Same thing but save the result files in a specific folder and with a specific name</p>
-<pre>
- grinder -reference_file genomes.fna -coverage_fold 0.1 -base_name my_name -output_dir my_dir</pre>
-</li>
-<li>
-<p>A DNA shotgun library with 1000 reads</p>
-<pre>
- grinder -reference_file genomes.fna -total_reads 1000</pre>
-</li>
-<li>
-<p>A DNA shotgun library where species are distributed according to a power law</p>
-<pre>
- grinder -reference_file genomes.fna -abundance_model powerlaw 0.1</pre>
-</li>
-<li>
-<p>A DNA shotgun library with 123 genomes taken random from the given genomes</p>
-<pre>
- grinder -reference_file genomes.fna -diversity 123</pre>
-</li>
-<li>
-<p>Two DNA shotgun libraries that have 50% of the species in common</p>
-<pre>
- grinder -reference_file genomes.fna -num_libraries 2 -shared_perc 50</pre>
-</li>
-<li>
-<p>Two DNA shotgun library with no species in common and distributed according to a
-exponential rank-abundance model. Note that because the parameter value for the
-exponential model is omitted, each library uses a different randomly chosen value:</p>
-<pre>
- grinder -reference_file genomes.fna -num_libraries 2 -abundance_model exponential</pre>
-</li>
-<li>
-<p>A DNA shotgun library where species relative abundances are manually specified</p>
-<pre>
- grinder -reference_file genomes.fna -abundance_file my_abundances.txt</pre>
-</li>
-<li>
-<p>A DNA shotgun library with Sanger reads</p>
-<pre>
- grinder -reference_file genomes.fna -read_dist 800 -mutation_dist linear 1 2 -mutation_ratio 80 20</pre>
-</li>
-<li>
-<p>A DNA shotgun library with first-generation 454 reads</p>
-<pre>
- grinder -reference_file genomes.fna -read_dist 100 normal 10 -homopolymer_dist balzer</pre>
-</li>
-<li>
-<p>A paired-end DNA shotgun library, where the insert size is normally distributed
-around 2.5 kbp and has 0.2 kbp standard deviation</p>
-<pre>
- grinder -reference_file genomes.fna -insert_dist 2500 normal 200</pre>
-</li>
-<li>
-<p>A transcriptomic dataset</p>
-<pre>
- grinder -reference_file transcripts.fna</pre>
-</li>
-<li>
-<p>A unidirectional transcriptomic dataset</p>
-<pre>
- grinder -reference_file transcripts.fna -unidirectional 1</pre>
-<p>Note the use of -unidirectional 1 to prevent reads to be taken from the reverse-
-complement of the reference sequences.</p>
-</li>
-<li>
-<p>A proteomic dataset</p>
-<pre>
- grinder -reference_file proteins.faa -unidirectional 1</pre>
-</li>
-<li>
-<p>A 16S rRNA amplicon library</p>
-<pre>
- grinder -reference_file 16Sgenes.fna -forward_reverse 16Sprimers.fna -length_bias 0 -unidirectional 1</pre>
-<p>Note the use of -length_bias 0 because reference sequence length should not affect
-the relative abundance of amplicons.</p>
-</li>
-<li>
-<p>The same amplicon library with 20% of chimeric reads (90% bimera, 10% trimera)</p>
-<pre>
- grinder -reference_file 16Sgenes.fna -forward_reverse 16Sprimers.fna -length_bias 0 -unidirectional 1 -chimera_perc 20 -chimera_dist 90 10</pre>
-</li>
-<li>
-<p>Three 16S rRNA amplicon libraries with specified MIDs and no reference sequences
-in common</p>
-<pre>
- grinder -reference_file 16Sgenes.fna -forward_reverse 16Sprimers.fna -length_bias 0 -unidirectional 1 -num_libraries 3 -multiplex_ids MIDs.fna</pre>
-</li>
-<li>
-<p>Reading reference sequences from the standard input, which allows you to
-decompress FASTA files on the fly:</p>
-<pre>
- zcat microbial_db.fna.gz | grinder -reference_file - -total_reads 100</pre>
+
+<li><p>A shotgun DNA library with a coverage of 0.1X</p>
+
+<pre><code> grinder -reference_file genomes.fna -coverage_fold 0.1</code></pre>
+
+</li>
+<li><p>Same thing but save the result files in a specific folder and with a specific name</p>
+
+<pre><code> grinder -reference_file genomes.fna -coverage_fold 0.1 -base_name my_name -output_dir my_dir</code></pre>
+
+</li>
+<li><p>A DNA shotgun library with 1000 reads</p>
+
+<pre><code> grinder -reference_file genomes.fna -total_reads 1000</code></pre>
+
+</li>
+<li><p>A DNA shotgun library where species are distributed according to a power law</p>
+
+<pre><code> grinder -reference_file genomes.fna -abundance_model powerlaw 0.1</code></pre>
+
+</li>
+<li><p>A DNA shotgun library with 123 genomes taken at random from the given genomes</p>
+
+<pre><code> grinder -reference_file genomes.fna -diversity 123</code></pre>
+
+</li>
+<li><p>Two DNA shotgun libraries that have 50% of the species in common</p>
+
+<pre><code> grinder -reference_file genomes.fna -num_libraries 2 -shared_perc 50</code></pre>
+
+</li>
+<li><p>Two DNA shotgun libraries with no species in common and distributed according to an exponential rank-abundance model. Note that because the parameter value for the exponential model is omitted, each library uses a different randomly chosen value:</p>
+
+<pre><code> grinder -reference_file genomes.fna -num_libraries 2 -abundance_model exponential</code></pre>
+
+</li>
+<li><p>A DNA shotgun library where species relative abundances are manually specified</p>
+
+<pre><code> grinder -reference_file genomes.fna -abundance_file my_abundances.txt</code></pre>
+
+</li>
+<li><p>A DNA shotgun library with Sanger reads</p>
+
+<pre><code> grinder -reference_file genomes.fna -read_dist 800 -mutation_dist linear 1 2 -mutation_ratio 80 20</code></pre>
+
+</li>
+<li><p>A DNA shotgun library with first-generation 454 reads</p>
+
+<pre><code> grinder -reference_file genomes.fna -read_dist 100 normal 10 -homopolymer_dist balzer</code></pre>
+
+</li>
+<li><p>A paired-end DNA shotgun library, where the insert size is normally distributed around 2.5 kbp and has 0.2 kbp standard deviation</p>
+
+<pre><code> grinder -reference_file genomes.fna -insert_dist 2500 normal 200</code></pre>
+
+</li>
+<li><p>A transcriptomic dataset</p>
+
+<pre><code> grinder -reference_file transcripts.fna</code></pre>
+
+</li>
+<li><p>A unidirectional transcriptomic dataset</p>
+
+<pre><code> grinder -reference_file transcripts.fna -unidirectional 1</code></pre>
+
+<p>Note the use of -unidirectional 1 to prevent reads from being taken from the reverse-complement of the reference sequences.</p>
+
+</li>
+<li><p>A proteomic dataset</p>
+
+<pre><code> grinder -reference_file proteins.faa -unidirectional 1</code></pre>
+
+</li>
+<li><p>A 16S rRNA amplicon library</p>
+
+<pre><code> grinder -reference_file 16Sgenes.fna -forward_reverse 16Sprimers.fna -length_bias 0 -unidirectional 1</code></pre>
+
+<p>Note the use of -length_bias 0 because reference sequence length should not affect the relative abundance of amplicons.</p>
+
+</li>
+<li><p>The same amplicon library with 20% of chimeric reads (90% bimera, 10% trimera)</p>
+
+<pre><code> grinder -reference_file 16Sgenes.fna -forward_reverse 16Sprimers.fna -length_bias 0 -unidirectional 1 -chimera_perc 20 -chimera_dist 90 10</code></pre>
+
+</li>
+<li><p>Three 16S rRNA amplicon libraries with specified MIDs and no reference sequences in common</p>
+
+<pre><code> grinder -reference_file 16Sgenes.fna -forward_reverse 16Sprimers.fna -length_bias 0 -unidirectional 1 -num_libraries 3 -multiplex_ids MIDs.fna</code></pre>
+
+</li>
+<li><p>Reading reference sequences from the standard input, which allows you to decompress FASTA files on the fly:</p>
+
+<pre><code> zcat microbial_db.fna.gz | grinder -reference_file - -total_reads 100</code></pre>
+
</li>
</ol>
-<p>
-</p>
-<hr />
-<h1><a name="cli_required_arguments">CLI REQUIRED ARGUMENTS</a></h1>
+
+<h1 id="CLI-REQUIRED-ARGUMENTS">CLI REQUIRED ARGUMENTS</h1>
+
<dl>
-<dt><strong><a name="rf_reference_file_reference_file_reference_file_gf_reference_file_genome_file_reference_file" class="item">-rf <reference_file> | -reference_file <reference_file> | -gf <reference_file> | -genome_file <reference_file></a></strong></dt>
+<dt id="rf-reference_file--reference_file-reference_file--gf-reference_file--genome_file-reference_file">-rf <reference_file> | -reference_file <reference_file> | -gf <reference_file> | -genome_file <reference_file></dt>
<dd>
-<p>FASTA file that contains the input reference sequences (full genomes, 16S rRNA
-genes, transcripts, proteins...) or '-' to read them from the standard input. See the
-README file for examples of databases you can use and where to get them from.
-Default: -</p>
+
+<p>FASTA file that contains the input reference sequences (full genomes, 16S rRNA genes, transcripts, proteins...) or '-' to read them from the standard input. See the README file for examples of databases you can use and where to get them from. Default: -</p>
+
</dd>
</dl>
-<p>
-</p>
-<hr />
-<h1><a name="cli_optional_arguments">CLI OPTIONAL ARGUMENTS</a></h1>
+
+<h1 id="CLI-OPTIONAL-ARGUMENTS">CLI OPTIONAL ARGUMENTS</h1>
+
<dl>
-<dt><strong><a name="tr_total_reads_total_reads_total_reads" class="item">-tr <total_reads> | -total_reads <total_reads></a></strong></dt>
+<dt id="tr-total_reads--total_reads-total_reads">-tr <total_reads> | -total_reads <total_reads></dt>
<dd>
-<p>Number of shotgun or amplicon reads to generate for each library. Do not specify
-this if you specify the fold coverage. Default: 100</p>
-</dd>
-<dt><strong><a name="cf_coverage_fold_coverage_fold_coverage_fold" class="item">-cf <coverage_fold> | -coverage_fold <coverage_fold></a></strong></dt>
-<dd>
-<p>Desired fold coverage of the input reference sequences (the output FASTA length
-divided by the input FASTA length). Do not specify this if you specify the number
-of reads directly.</p>
+<p>Number of shotgun or amplicon reads to generate for each library. Do not specify this if you specify the fold coverage. Default: 100</p>
+
</dd>
-<dt><strong><a name="rd_read_dist_read_dist_read_dist" class="item">-rd <read_dist>... | -read_dist <read_dist>...</a></strong></dt>
+<dt id="cf-coverage_fold--coverage_fold-coverage_fold">-cf <coverage_fold> | -coverage_fold <coverage_fold></dt>
+<dd>
+
+<p>Desired fold coverage of the input reference sequences (the output FASTA length divided by the input FASTA length). Do not specify this if you specify the number of reads directly.</p>
+</dd>
+<dt id="rd-read_dist...--read_dist-read_dist">-rd <read_dist>... | -read_dist <read_dist>...</dt>
<dd>
-<p>Desired shotgun or amplicon read length distribution specified as:
- average length, distribution ('uniform' or 'normal') and standard deviation.</p>
+
+<p>Desired shotgun or amplicon read length distribution specified as: average length, distribution ('uniform' or 'normal') and standard deviation.</p>
+
<p>Only the first element is required. Examples:</p>
-<pre>
- All reads exactly 101 bp long (Illumina GA 2x): 101
+
+<pre><code> All reads exactly 101 bp long (Illumina GA 2x): 101
Uniform read distribution around 100+-10 bp: 100 uniform 10
Reads normally distributed with an average of 800 and a standard deviation of 100
bp (Sanger reads): 800 normal 100
Reads normally distributed with an average of 450 and a standard deviation of 50
- bp (454 GS-FLX Ti): 450 normal 50</pre>
-<p>Reference sequences smaller than the specified read length are not used. Default:
-100</p>
-</dd>
-<dt><strong><a name="id_insert_dist_insert_dist_insert_dist" class="item">-id <insert_dist>... | -insert_dist <insert_dist>...</a></strong></dt>
+ bp (454 GS-FLX Ti): 450 normal 50</code></pre>
+
+<p>Reference sequences smaller than the specified read length are not used. Default: 100</p>
-<dd>
-<p>Create paired-end or mate-pair reads spanning the given insert length.
-Important: the insert is defined in the biological sense, i.e. its length includes
-the length of both reads and of the stretch of DNA between them:
- 0 : off,
- or: insert size distribution in bp, in the same format as the read length
- distribution (a typical value is 2,500 bp for mate pairs)
-Two distinct reads are generated whether or not the mate pair overlaps. Default:
-0</p>
</dd>
-<dt><strong><a name="mo_mate_orientation_mate_orientation_mate_orientation" class="item">-mo <mate_orientation> | -mate_orientation <mate_orientation></a></strong></dt>
+<dt id="id-insert_dist...--insert_dist-insert_dist">-id <insert_dist>... | -insert_dist <insert_dist>...</dt>
+<dd>
+<p>Create paired-end or mate-pair reads spanning the given insert length. Important: the insert is defined in the biological sense, i.e. its length includes the length of both reads and of the stretch of DNA between them: 0 : off, or: insert size distribution in bp, in the same format as the read length distribution (a typical value is 2,500 bp for mate pairs) Two distinct reads are generated whether or not the mate pair overlaps. Default: 0</p>
+
+</dd>
+<dt id="mo-mate_orientation--mate_orientation-mate_orientation">-mo <mate_orientation> | -mate_orientation <mate_orientation></dt>
<dd>
-<p>When generating paired-end or mate-pair reads (see <insert_dist>), specify the
-orientation of the reads (F: forward, R: reverse):</p>
-<pre>
- FR: ---> <--- e.g. Sanger, Illumina paired-end, IonTorrent mate-pair
+
+<p>When generating paired-end or mate-pair reads (see <insert_dist>), specify the orientation of the reads (F: forward, R: reverse):</p>
+
+<pre><code> FR: ---> <--- e.g. Sanger, Illumina paired-end, IonTorrent mate-pair
FF: ---> ---> e.g. 454
RF: <--- ---> e.g. Illumina mate-pair
- RR: <--- <---</pre>
+ RR: <--- <---</code></pre>
+
<p>Default: FR</p>
-</dd>
-<dt><strong><a name="ec_exclude_chars_exclude_chars_exclude_chars" class="item">-ec <exclude_chars> | -exclude_chars <exclude_chars></a></strong></dt>
-<dd>
-<p>Do not create reads containing any of the specified characters (case insensitive).
-For example, use 'NX' to prevent reads with ambiguities (N or X). Grinder will
-error if it fails to find a suitable read (or pair of reads) after 10 attempts.
-Consider using <delete_chars>, which may be more appropriate for your case.
-Default: ''</p>
</dd>
-<dt><strong><a name="dc_delete_chars_delete_chars_delete_chars" class="item">-dc <delete_chars> | -delete_chars <delete_chars></a></strong></dt>
-
+<dt id="ec-exclude_chars--exclude_chars-exclude_chars">-ec <exclude_chars> | -exclude_chars <exclude_chars></dt>
<dd>
-<p>Remove the specified characters from the reference sequences (case-insensitive),
-e.g. '-~*' to remove gaps (- or ~) or terminator (*). Removing these characters
-is done once, when reading the reference sequences, prior to taking reads. Hence
-it is more efficient than <exclude_chars>. Default:</p>
-</dd>
-<dt><strong><a name="fr_forward_reverse_forward_reverse_forward_reverse" class="item">-fr <forward_reverse> | -forward_reverse <forward_reverse></a></strong></dt>
-<dd>
-<p>Use DNA amplicon sequencing using a forward and reverse PCR primer sequence
-provided in a FASTA file. The reference sequences and their reverse complement
-will be searched for PCR primer matches. The primer sequences should use the
-IUPAC convention for degenerate residues and the reference sequences that that
-do not match the specified primers are excluded. If your reference sequences are
-full genomes, it is recommended to use <copy_bias> = 1 and <length_bias> = 0 to
-generate amplicon reads. To sequence from the forward strand, set <unidirectional>
-to 1 and put the forward primer first and reverse primer second in the FASTA
-file. To sequence from the reverse strand, invert the primers in the FASTA file
-and use <unidirectional> = -1. The second primer sequence in the FASTA file is
-always optional. Example: AAACTYAAAKGAATTGRCGG and ACGGGCGGTGTGTRC for the 926F
-and 1392R primers that target the V6 to V9 region of the 16S rRNA gene.</p>
-</dd>
-<dt><strong><a name="un_unidirectional_unidirectional_unidirectional" class="item">-un <unidirectional> | -unidirectional <unidirectional></a></strong></dt>
+<p>Do not create reads containing any of the specified characters (case insensitive). For example, use 'NX' to prevent reads with ambiguities (N or X). Grinder will error if it fails to find a suitable read (or pair of reads) after 10 attempts. Consider using <delete_chars>, which may be more appropriate for your case. Default: ''</p>
-<dd>
-<p>Instead of producing reads bidirectionally, from the reference strand and its
-reverse complement, proceed unidirectionally, from one strand only (forward or
-reverse). Values: 0 (off, i.e. bidirectional), 1 (forward), -1 (reverse). Use
-<unidirectional> = 1 for amplicon and strand-specific transcriptomic or
-proteomic datasets. Default: 0</p>
</dd>
-<dt><strong><a name="lb_length_bias_length_bias_length_bias" class="item">-lb <length_bias> | -length_bias <length_bias></a></strong></dt>
-
+<dt id="dc-delete_chars--delete_chars-delete_chars">-dc <delete_chars> | -delete_chars <delete_chars></dt>
<dd>
-<p>In shotgun libraries, sample reference sequences proportionally to their length.
-For example, in simulated microbial datasets, this means that at the same
-relative abundance, larger genomes contribute more reads than smaller genomes
-(and all genomes have the same fold coverage).
-0 = no, 1 = yes. Default: 1</p>
-</dd>
-<dt><strong><a name="cb_copy_bias_copy_bias_copy_bias" class="item">-cb <copy_bias> | -copy_bias <copy_bias></a></strong></dt>
+<p>Remove the specified characters from the reference sequences (case-insensitive), e.g. '-~*' to remove gaps (- or ~) or terminator (*). Removing these characters is done once, when reading the reference sequences, prior to taking reads. Hence it is more efficient than <exclude_chars>. Default:</p>
+
+</dd>
+<dt id="fr-forward_reverse--forward_reverse-forward_reverse">-fr <forward_reverse> | -forward_reverse <forward_reverse></dt>
<dd>
-<p>In amplicon libraries where full genomes are used as input, sample species
-proportionally to the number of copies of the target gene: at equal relative
-abundance, genomes that have multiple copies of the target gene contribute more
-amplicon reads than genomes that have a single copy. 0 = no, 1 = yes. Default:
-1</p>
+
+<p>Use DNA amplicon sequencing using a forward and reverse PCR primer sequence provided in a FASTA file. The reference sequences and their reverse complement will be searched for PCR primer matches. The primer sequences should use the IUPAC convention for degenerate residues and the reference sequences that do not match the specified primers are excluded. If your reference sequences are full genomes, it is recommended to use <copy_bias> = 1 and <length_bias> = 0 to gener [...]
+
</dd>
-<dt><strong><a name="md_mutation_dist_mutation_dist_mutation_dist" class="item">-md <mutation_dist>... | -mutation_dist <mutation_dist>...</a></strong></dt>
+<dt id="un-unidirectional--unidirectional-unidirectional">-un <unidirectional> | -unidirectional <unidirectional></dt>
+<dd>
+
+<p>Instead of producing reads bidirectionally, from the reference strand and its reverse complement, proceed unidirectionally, from one strand only (forward or reverse). Values: 0 (off, i.e. bidirectional), 1 (forward), -1 (reverse). Use <unidirectional> = 1 for amplicon and strand-specific transcriptomic or proteomic datasets. Default: 0</p>
+</dd>
+<dt id="lb-length_bias--length_bias-length_bias">-lb <length_bias> | -length_bias <length_bias></dt>
<dd>
-<p>Introduce sequencing errors in the reads, under the form of mutations
-(substitutions, insertions and deletions) at positions that follow a specified
-distribution (with replacement): model (uniform, linear, poly4), model parameters.
-For example, for a uniform 0.1% error rate, use: uniform 0.1. To simulate Sanger
-errors, use a linear model where the errror rate is 1% at the 5' end of reads and
-2% at the 3' end: linear 1 2. To model Illumina errors using the 4th degree
-polynome 3e-3 + 3.3e-8 * i^4 (Korbel et al 2009), use: poly4 3e-3 3.3e-8.
-Use the <mutation_ratio> option to alter how many of these mutations are
-substitutions or indels. Default: uniform 0 0</p>
+
+<p>In shotgun libraries, sample reference sequences proportionally to their length. For example, in simulated microbial datasets, this means that at the same relative abundance, larger genomes contribute more reads than smaller genomes (and all genomes have the same fold coverage). 0 = no, 1 = yes. Default: 1</p>
+
</dd>
-<dt><strong><a name="mr_mutation_ratio_mutation_ratio_mutation_ratio" class="item">-mr <mutation_ratio>... | -mutation_ratio <mutation_ratio>...</a></strong></dt>
+<dt id="cb-copy_bias--copy_bias-copy_bias">-cb <copy_bias> | -copy_bias <copy_bias></dt>
+<dd>
+
+<p>In amplicon libraries where full genomes are used as input, sample species proportionally to the number of copies of the target gene: at equal relative abundance, genomes that have multiple copies of the target gene contribute more amplicon reads than genomes that have a single copy. 0 = no, 1 = yes. Default: 1</p>
+</dd>
+<dt id="md-mutation_dist...--mutation_dist-mutation_dist">-md <mutation_dist>... | -mutation_dist <mutation_dist>...</dt>
<dd>
-<p>Indicate the percentage of substitutions and the number of indels (insertions
-and deletions). For example, use '80 20' (4 substitutions for each indel) for
-Sanger reads. Note that this parameter has no effect unless you specify the
-<mutation_dist> option. Default: 80 20</p>
+
+<p>Introduce sequencing errors in the reads, under the form of mutations (substitutions, insertions and deletions) at positions that follow a specified distribution (with replacement): model (uniform, linear, poly4), model parameters. For example, for a uniform 0.1% error rate, use: uniform 0.1. To simulate Sanger errors, use a linear model where the error rate is 1% at the 5' end of reads and 2% at the 3' end: linear 1 2. To model Illumina errors using the 4th degree polynome 3 [...]
+
</dd>
-<dt><strong><a name="hd_homopolymer_dist_homopolymer_dist_homopolymer_dist" class="item">-hd <homopolymer_dist> | -homopolymer_dist <homopolymer_dist></a></strong></dt>
+<dt id="mr-mutation_ratio...--mutation_ratio-mutation_ratio">-mr <mutation_ratio>... | -mutation_ratio <mutation_ratio>...</dt>
+<dd>
+<p>Indicate the percentage of substitutions and the number of indels (insertions and deletions). For example, use '80 20' (4 substitutions for each indel) for Sanger reads. Note that this parameter has no effect unless you specify the <mutation_dist> option. Default: 80 20</p>
+
+</dd>
+<dt id="hd-homopolymer_dist--homopolymer_dist-homopolymer_dist">-hd <homopolymer_dist> | -homopolymer_dist <homopolymer_dist></dt>
<dd>
-<p>Introduce sequencing errors in the reads under the form of homopolymeric
-stretches (e.g. AAA, CCCCC) using a specified model where the homopolymer length
-follows a normal distribution N(mean, standard deviation) that is function of
-the homopolymer length n:</p>
-<pre>
- Margulies: N(n, 0.15 * n) , Margulies et al. 2005.
+
+<p>Introduce sequencing errors in the reads under the form of homopolymeric stretches (e.g. AAA, CCCCC) using a specified model where the homopolymer length follows a normal distribution N(mean, standard deviation) that is function of the homopolymer length n:</p>
+
+<pre><code> Margulies: N(n, 0.15 * n) , Margulies et al. 2005.
Richter : N(n, 0.15 * sqrt(n)) , Richter et al. 2008.
- Balzer : N(n, 0.03494 + n * 0.06856) , Balzer et al. 2010.</pre>
+ Balzer : N(n, 0.03494 + n * 0.06856) , Balzer et al. 2010.</code></pre>
+
<p>Default: 0</p>
-</dd>
-<dt><strong><a name="cp_chimera_perc_chimera_perc_chimera_perc" class="item">-cp <chimera_perc> | -chimera_perc <chimera_perc></a></strong></dt>
-<dd>
-<p>Specify the percent of reads in amplicon libraries that should be chimeric
-sequences. The 'reference' field in the description of chimeric reads will
-contain the ID of all the reference sequences forming the chimeric template.
-A typical value is 10% for amplicons. This option can be used to generate
-chimeric shotgun reads as well. Default: 0 %</p>
</dd>
-<dt><strong><a name="cd_chimera_dist_chimera_dist_chimera_dist" class="item">-cd <chimera_dist>... | -chimera_dist <chimera_dist>...</a></strong></dt>
-
+<dt id="cp-chimera_perc--chimera_perc-chimera_perc">-cp <chimera_perc> | -chimera_perc <chimera_perc></dt>
<dd>
-<p>Specify the distribution of chimeras: bimeras, trimeras, quadrameras and
-multimeras of higher order. The default is the average values from Quince et al.
-2011: '314 38 1', which corresponds to 89% of bimeras, 11% of trimeras and 0.3%
-of quadrameras. Note that this option only takes effect when you request the
-generation of chimeras with the <chimera_perc> option. Default: 314 38 1</p>
-</dd>
-<dt><strong><a name="ck_chimera_kmer_chimera_kmer_chimera_kmer" class="item">-ck <chimera_kmer> | -chimera_kmer <chimera_kmer></a></strong></dt>
-<dd>
-<p>Activate a method to form chimeras by picking breakpoints at places where k-mers
-are shared between sequences. <chimera_kmer> represents k, the length of the
-k-mers (in bp). The longer the kmer, the more similar the sequences have to be
-to be eligible to form chimeras. The more frequent a k-mer is in the pool of
-reference sequences (taking into account their relative abundance), the more
-often this k-mer will be chosen. For example, CHSIM (Edgar et al. 2011) uses this
-method with a k-mer length of 10 bp. If you do not want to use k-mer information
-to form chimeras, use 0, which will result in the reference sequences and
-breakpoints to be taken randomly on the "aligned" reference sequences. Note that
-this option only takes effect when you request the generation of chimeras with
-the <chimera_perc> option. Also, this options is quite memory intensive, so you
-should probably limit yourself to a relatively small number of reference sequences
-if you want to use it. Default: 10 bp</p>
+<p>Specify the percent of reads in amplicon libraries that should be chimeric sequences. The 'reference' field in the description of chimeric reads will contain the ID of all the reference sequences forming the chimeric template. A typical value is 10% for amplicons. This option can be used to generate chimeric shotgun reads as well. Default: 0 %</p>
+
</dd>
-<dt><strong><a name="af_abundance_file_abundance_file_abundance_file" class="item">-af <abundance_file> | -abundance_file <abundance_file></a></strong></dt>
+<dt id="cd-chimera_dist...--chimera_dist-chimera_dist">-cd <chimera_dist>... | -chimera_dist <chimera_dist>...</dt>
+<dd>
+
+<p>Specify the distribution of chimeras: bimeras, trimeras, quadrameras and multimeras of higher order. The default is the average values from Quince et al. 2011: '314 38 1', which corresponds to 89% of bimeras, 11% of trimeras and 0.3% of quadrameras. Note that this option only takes effect when you request the generation of chimeras with the <chimera_perc> option. Default: 314 38 1</p>
+</dd>
+<dt id="ck-chimera_kmer--chimera_kmer-chimera_kmer">-ck <chimera_kmer> | -chimera_kmer <chimera_kmer></dt>
<dd>
-<p>Specify the relative abundance of the reference sequences manually in an input
-file. Each line of the file should contain a sequence name and its relative
-abundance (%), e.g. 'seqABC 82.1' or 'seqABC 82.1 10.2' if you are specifying two
-different libraries.</p>
+
+<p>Activate a method to form chimeras by picking breakpoints at places where k-mers are shared between sequences. <chimera_kmer> represents k, the length of the k-mers (in bp). The longer the kmer, the more similar the sequences have to be to be eligible to form chimeras. The more frequent a k-mer is in the pool of reference sequences (taking into account their relative abundance), the more often this k-mer will be chosen. For example, CHSIM (Edgar et al. 2011) uses this method wit [...]
+
</dd>
-<dt><strong><a name="am_abundance_model_abundance_model_abundance_model" class="item">-am <abundance_model>... | -abundance_model <abundance_model>...</a></strong></dt>
+<dt id="af-abundance_file--abundance_file-abundance_file">-af <abundance_file> | -abundance_file <abundance_file></dt>
+<dd>
+<p>Specify the relative abundance of the reference sequences manually in an input file. Each line of the file should contain a sequence name and its relative abundance (%), e.g. 'seqABC 82.1' or 'seqABC 82.1 10.2' if you are specifying two different libraries.</p>
+
+</dd>
+<dt id="am-abundance_model...--abundance_model-abundance_model">-am <abundance_model>... | -abundance_model <abundance_model>...</dt>
<dd>
-<p>Relative abundance model for the input reference sequences: uniform, linear, powerlaw,
-logarithmic or exponential. The uniform and linear models do not require a
-parameter, but the other models take a parameter in the range [0, infinity). If
-this parameter is not specified, then it is randomly chosen. Examples:</p>
-<pre>
- uniform distribution: uniform
+
+<p>Relative abundance model for the input reference sequences: uniform, linear, powerlaw, logarithmic or exponential. The uniform and linear models do not require a parameter, but the other models take a parameter in the range [0, infinity). If this parameter is not specified, then it is randomly chosen. Examples:</p>
+
+<pre><code> uniform distribution: uniform
powerlaw distribution with parameter 0.1: powerlaw 0.1
- exponential distribution with automatically chosen parameter: exponential</pre>
+ exponential distribution with automatically chosen parameter: exponential</code></pre>
+
<p>Default: uniform 1</p>
-</dd>
-<dt><strong><a name="nl_num_libraries_num_libraries_num_libraries" class="item">-nl <num_libraries> | -num_libraries <num_libraries></a></strong></dt>
-<dd>
-<p>Number of independent libraries to create. Specify how diverse and similar they
-should be with <diversity>, <shared_perc> and <permuted_perc>. Assign them
-different MID tags with <multiplex_mids>. Default: 1</p>
</dd>
-<dt><strong><a name="mi_multiplex_ids_multiplex_ids_multiplex_ids" class="item">-mi <multiplex_ids> | -multiplex_ids <multiplex_ids></a></strong></dt>
-
+<dt id="nl-num_libraries--num_libraries-num_libraries">-nl <num_libraries> | -num_libraries <num_libraries></dt>
<dd>
-<p>Specify an optional FASTA file that contains multiplex sequence identifiers
-(a.k.a MIDs or barcodes) to add to the sequences (one sequence per library). The
-MIDs are included in the length specified with the -read_dist option and can be
-altered by sequencing errors. See the MIDesigner or BarCrawl programs to
-generate MID sequences.</p>
-</dd>
-<dt><strong><a name="di_diversity_diversity_diversity" class="item">-di <diversity>... | -diversity <diversity>...</a></strong></dt>
-<dd>
-<p>This option specifies alpha diversity, specifically the richness, i.e. number of
-reference sequences to take randomly and include in each library. Use 0 for the
-maximum richness possible (based on the number of reference sequences available).
-Provide one value to make all libraries have the same diversity, or one richness
-value per library otherwise. Default: 0</p>
-</dd>
-<dt><strong><a name="sp_shared_perc_shared_perc_shared_perc" class="item">-sp <shared_perc> | -shared_perc <shared_perc></a></strong></dt>
+<p>Number of independent libraries to create. Specify how diverse and similar they should be with <diversity>, <shared_perc> and <permuted_perc>. Assign them different MID tags with <multiplex_ids>. Default: 1</p>
+</dd>
+<dt id="mi-multiplex_ids--multiplex_ids-multiplex_ids">-mi <multiplex_ids> | -multiplex_ids <multiplex_ids></dt>
<dd>
-<p>This option controls an aspect of beta-diversity. When creating multiple
-libraries, specify the percent of reference sequences they should have in common
-(relative to the diversity of the least diverse library). Default: 0 %</p>
+
+<p>Specify an optional FASTA file that contains multiplex sequence identifiers (a.k.a MIDs or barcodes) to add to the sequences (one sequence per library, in the order given). The MIDs are included in the length specified with the -read_dist option and can be altered by sequencing errors. See the MIDesigner or BarCrawl programs to generate MID sequences.</p>
+
</dd>
-<dt><strong><a name="pp_permuted_perc_permuted_perc_permuted_perc" class="item">-pp <permuted_perc> | -permuted_perc <permuted_perc></a></strong></dt>
+<dt id="di-diversity...--diversity-diversity">-di <diversity>... | -diversity <diversity>...</dt>
+<dd>
+<p>This option specifies alpha diversity, specifically the richness, i.e. number of reference sequences to take randomly and include in each library. Use 0 for the maximum richness possible (based on the number of reference sequences available). Provide one value to make all libraries have the same diversity, or one richness value per library otherwise. Default: 0</p>
+
+</dd>
+<dt id="sp-shared_perc--shared_perc-shared_perc">-sp <shared_perc> | -shared_perc <shared_perc></dt>
<dd>
-<p>This option controls another aspect of beta-diversity. For multiple libraries,
-choose the percent of the most-abundant reference sequences to permute (randomly
-shuffle) the rank-abundance of. Default: 0 %</p>
+
+<p>This option controls an aspect of beta-diversity. When creating multiple libraries, specify the percent of reference sequences they should have in common (relative to the diversity of the least diverse library). Default: 0 %</p>
+
</dd>
-<dt><strong><a name="rs_random_seed_random_seed_random_seed" class="item">-rs <random_seed> | -random_seed <random_seed></a></strong></dt>
+<dt id="pp-permuted_perc--permuted_perc-permuted_perc">-pp <permuted_perc> | -permuted_perc <permuted_perc></dt>
+<dd>
+
+<p>This option controls another aspect of beta-diversity. For multiple libraries, choose the percent of the most-abundant reference sequences to permute (randomly shuffle) the rank-abundance of. Default: 100 %</p>
+</dd>
+<dt id="rs-random_seed--random_seed-random_seed">-rs <random_seed> | -random_seed <random_seed></dt>
<dd>
+
<p>Seed number to use for the pseudo-random number generator.</p>
-</dd>
-<dt><strong><a name="dt_desc_track_desc_track_desc_track" class="item">-dt <desc_track> | -desc_track <desc_track></a></strong></dt>
-<dd>
-<p>Track read information (reference sequence, position, errors, ...) by writing
-it in the read description. Default: 1</p>
</dd>
-<dt><strong><a name="ql_qual_levels_qual_levels_qual_levels" class="item">-ql <qual_levels>... | -qual_levels <qual_levels>...</a></strong></dt>
-
+<dt id="dt-desc_track--desc_track-desc_track">-dt <desc_track> | -desc_track <desc_track></dt>
<dd>
-<p>Generate basic quality scores for the simulated reads. Good residues are given a
-specified good score (e.g. 30) and residues that are the result of an insertion
-or substitution are given a specified bad score (e.g. 10). Specify first the
-good score and then the bad score on the command-line, e.g.: 30 10. Default:</p>
-</dd>
-<dt><strong><a name="fq_fastq_output_fastq_output_fastq_output" class="item">-fq <fastq_output> | -fastq_output <fastq_output></a></strong></dt>
+<p>Track read information (reference sequence, position, errors, ...) by writing it in the read description. Default: 1</p>
+
+</dd>
+<dt id="ql-qual_levels...--qual_levels-qual_levels">-ql <qual_levels>... | -qual_levels <qual_levels>...</dt>
<dd>
-<p>Whether to write the generated reads in FASTQ format (with Sanger-encoded
-quality scores) instead of FASTA and QUAL or not (1: yes, 0: no).
-<qual_levels> need to be specified for this option to be effective. Default: 0</p>
+
+<p>Generate basic quality scores for the simulated reads. Good residues are given a specified good score (e.g. 30) and residues that are the result of an insertion or substitution are given a specified bad score (e.g. 10). Specify first the good score and then the bad score on the command-line, e.g.: 30 10. Default:</p>
+
</dd>
-<dt><strong><a name="bn_base_name_base_name_base_name" class="item">-bn <base_name> | -base_name <base_name></a></strong></dt>
+<dt id="fq-fastq_output--fastq_output-fastq_output">-fq <fastq_output> | -fastq_output <fastq_output></dt>
+<dd>
+
+<p>Whether to write the generated reads in FASTQ format (with Sanger-encoded quality scores) instead of FASTA and QUAL or not (1: yes, 0: no). <qual_levels> need to be specified for this option to be effective. Default: 0</p>
+</dd>
+<dt id="bn-base_name--base_name-base_name">-bn <base_name> | -base_name <base_name></dt>
<dd>
+
<p>Prefix of the output files. Default: grinder</p>
-</dd>
-<dt><strong><a name="od_output_dir_output_dir_output_dir" class="item">-od <output_dir> | -output_dir <output_dir></a></strong></dt>
-<dd>
-<p>Directory where the results should be written. This folder will be created if
-needed. Default: .</p>
</dd>
-<dt><strong><a name="pf_profile_file_profile_file_profile_file" class="item">-pf <profile_file> | -profile_file <profile_file></a></strong></dt>
+<dt id="od-output_dir--output_dir-output_dir">-od <output_dir> | -output_dir <output_dir></dt>
+<dd>
+<p>Directory where the results should be written. This folder will be created if needed. Default: .</p>
+
+</dd>
+<dt id="pf-profile_file--profile_file-profile_file">-pf <profile_file> | -profile_file <profile_file></dt>
<dd>
-<p>A file that contains Grinder arguments. This is useful if you use many options
-or often use the same options. Lines with comments (#) are ignored. Consider the
-profile file, 'simple_profile.txt':</p>
-<pre>
- # A simple Grinder profile
+
+<p>A file that contains Grinder arguments. This is useful if you use many options or often use the same options. Lines with comments (#) are ignored. Consider the profile file, 'simple_profile.txt':</p>
+
+<pre><code> # A simple Grinder profile
-read_dist 105 normal 12
- -total_reads 1000</pre>
+ -total_reads 1000</code></pre>
+
<p>Running: grinder -reference_file viral_genomes.fa -profile_file simple_profile.txt</p>
+
<p>Translates into: grinder -reference_file viral_genomes.fa -read_dist 105 normal 12 -total_reads 1000</p>
+
<p>Note that the arguments specified in the profile should not be specified again on the command line.</p>
+
</dd>
</dl>
-<p>
-</p>
-<hr />
-<h1><a name="cli_output">CLI OUTPUT</a></h1>
-<p>For each shotgun or amplicon read library requested, the following files are
-generated:</p>
+
+<h1 id="CLI-OUTPUT">CLI OUTPUT</h1>
+
+<p>For each shotgun or amplicon read library requested, the following files are generated:</p>
+
<ul>
-<li>
-<p>A rank-abundance file, tab-delimited, that shows the relative abundance of the
-different reference sequences</p>
+
+<li><p>A rank-abundance file, tab-delimited, that shows the relative abundance of the different reference sequences</p>
+
</li>
-<li>
-<p>A file containing the read sequences in FASTA format. The read headers
-contain information necessary to track from which reference sequence each read
-was taken and what errors it contains. This file is not generated if <fastq_output>
-option was provided.</p>
+<li><p>A file containing the read sequences in FASTA format. The read headers contain information necessary to track from which reference sequence each read was taken and what errors it contains. This file is not generated if <fastq_output> option was provided.</p>
+
</li>
-<li>
-<p>If the <qual_levels> option was specified, a file containing the quality scores
-of the reads (in QUAL format).</p>
+<li><p>If the <qual_levels> option was specified, a file containing the quality scores of the reads (in QUAL format).</p>
+
</li>
-<li>
-<p>If the <fastq_output> option was provided, a file containing the read sequences
-in FASTQ format.</p>
+<li><p>If the <fastq_output> option was provided, a file containing the read sequences in FASTQ format.</p>
+
</li>
</ul>
-<p>
-</p>
-<hr />
-<h1><a name="api_examples">API EXAMPLES</a></h1>
-<p>The Grinder API allows to conveniently use Grinder within Perl scripts. Here is
-a synopsis:</p>
-<pre>
- use Grinder;</pre>
-<pre>
+
+<h1 id="API-EXAMPLES">API EXAMPLES</h1>
+
+<p>The Grinder API allows you to conveniently use Grinder within Perl scripts. Here is a synopsis:</p>
+
+<pre><code> use Grinder;
+
# Set up a new factory (see the OPTIONS section for a complete list of parameters)
- my $factory = Grinder->new( -reference_file => 'genomes.fna' );</pre>
-<pre>
+ my $factory = Grinder->new( -reference_file => 'genomes.fna' );
+
# Process all shotgun libraries requested
- while ( my $struct = $factory->next_lib ) {</pre>
-<pre>
+ while ( my $struct = $factory->next_lib ) {
+
# The ID and abundance of the 3rd most abundant genome in this community
my $id = $struct->{ids}->[2];
- my $ab = $struct->{abs}->[2];</pre>
-<pre>
+ my $ab = $struct->{abs}->[2];
+
# Create shotgun reads
- while ( my $read = $factory->next_read) {</pre>
-<pre>
+ while ( my $read = $factory->next_read) {
+
# The read is a Bioperl sequence object with these properties:
my $read_id = $read->id; # read ID given by Grinder
my $read_seq = $read->seq; # nucleotide sequence
@@ -767,10 +637,10 @@ a synopsis:</p>
my $ref_strand = $read->strand; # strand of the reference
}
- }</pre>
-<pre>
+ }
+
# Similarly, for shotgun mate pairs
- my $factory = Grinder->new( -reference_file => 'genomes.fna',
+ my $factory = Grinder->new( -reference_file => 'genomes.fna',
-insert_dist => 250 );
while ( $factory->next_lib ) {
while ( my $read = $factory->next_read ) {
@@ -779,86 +649,80 @@ a synopsis:</p>
# The third read is the first mate of the next mate pair
# ...
}
- }</pre>
-<pre>
+ }
+
# To generate an amplicon library
- my $factory = Grinder->new( -reference_file => 'genomes.fna',
- -forward_reverse => '16Sgenes.fna',
+ my $factory = Grinder->new( -reference_file => 'genomes.fna',
+ -forward_reverse => '16Sgenes.fna',
-length_bias => 0,
-unidirectional => 1 );
while ( $factory->next_lib ) {
while ( my $read = $factory->next_read) {
# ...
}
- }</pre>
-<p>
-</p>
-<hr />
-<h1><a name="api_methods">API METHODS</a></h1>
+ }</code></pre>
+
+<h1 id="API-METHODS">API METHODS</h1>
+
<p>The rest of the documentation details the available Grinder API methods.</p>
-<p>
-</p>
-<h2><a name="new">new</a></h2>
-<p>Title : new</p>
-<p>Function: Create a new Grinder factory initialized with the passed arguments.
- Available parameters described in the OPTIONS section.</p>
-<p>Usage : my $factory = Grinder->new( -reference_file => 'genomes.fna' );</p>
+
+<h2 id="new">new</h2>
+
+<p>Title : new</p>
+
+<p>Function: Create a new Grinder factory initialized with the passed arguments. Available parameters are described in the OPTIONS section.</p>
+
+<p>Usage : my $factory = Grinder->new( -reference_file => 'genomes.fna' );</p>
+
<p>Returns : a new Grinder object</p>
-<p>
-</p>
-<h2><a name="next_lib">next_lib</a></h2>
-<p>Title : next_lib</p>
+
+<h2 id="next_lib">next_lib</h2>
+
+<p>Title : next_lib</p>
+
<p>Function: Go to the next shotgun library to process.</p>
-<p>Usage : my $struct = $factory->next_lib;</p>
-<p>Returns : Community structure to be used for this library, where $struct->{ids}
- is an array reference containing the IDs of the genome making up the
- community (sorted by decreasing relative abundance) and $struct->{abs}
- is an array reference of the genome abundances (in the same order as
- the IDs).</p>
-<p>
-</p>
-<h2><a name="next_read">next_read</a></h2>
-<p>Title : next_read</p>
+
+<p>Usage : my $struct = $factory->next_lib;</p>
+
+<p>Returns : Community structure to be used for this library, where $struct->{ids} is an array reference containing the IDs of the genomes making up the community (sorted by decreasing relative abundance) and $struct->{abs} is an array reference of the genome abundances (in the same order as the IDs).</p>
+
+<h2 id="next_read">next_read</h2>
+
+<p>Title : next_read</p>
+
<p>Function: Create an amplicon or shotgun read for the current library.</p>
-<p>Usage : my $read = $factory->next_read; # for single read
- my $mate1 = $factory->next_read; # for mate pairs
- my $mate2 = $factory->next_read;</p>
+
+<p>Usage : my $read = $factory->next_read; # for single read my $mate1 = $factory->next_read; # for mate pairs my $mate2 = $factory->next_read;</p>
+
<p>Returns : A sequence represented as a Bio::Seq::SimulatedRead object</p>
-<p>
-</p>
-<h2><a name="get_random_seed">get_random_seed</a></h2>
-<p>Title : get_random_seed</p>
+
+<h2 id="get_random_seed">get_random_seed</h2>
+
+<p>Title : get_random_seed</p>
+
<p>Function: Return the number used to seed the pseudo-random number generator</p>
-<p>Usage : my $seed = $factory->get_random_seed;</p>
+
+<p>Usage : my $seed = $factory->get_random_seed;</p>
+
<p>Returns : seed number</p>
-<p>
-</p>
-<hr />
-<h1><a name="copyright">COPYRIGHT</a></h1>
-<p>Copyright 2009-2012 Florent ANGLY <<a href="mailto:florent.angly at gmail.com">florent.angly at gmail.com</a>></p>
-<p>Grinder is free software: you can redistribute it and/or modify
-it under the terms of the GNU General Public License (GPL) as published by
-the Free Software Foundation, either version 3 of the License, or
-(at your option) any later version.
-Grinder is distributed in the hope that it will be useful,
-but WITHOUT ANY WARRANTY; without even the implied warranty of
-MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-GNU General Public License for more details.
-You should have received a copy of the GNU General Public License
-along with Grinder. If not, see <http://www.gnu.org/licenses/>.</p>
-<p>
-</p>
-<hr />
-<h1><a name="bugs">BUGS</a></h1>
-<p>All complex software has bugs lurking in it, and this program is no exception.
-If you find a bug, please report it on the SourceForge Tracker for Grinder:
-<a href="http://sourceforge.net/tracker/?group_id=244196&atid=1124737">http://sourceforge.net/tracker/</a></p>
-<p>Bug reports, suggestions and patches are welcome. Grinder's code is developed
-on Sourceforge (<a href="http://sourceforge.net/scm/?type=git&group_id=244196">http://sourceforge.net/scm/</a>) and is
-under Git revision control. To get started with a patch, do:</p>
-<pre>
- git clone git://biogrinder.git.sourceforge.net/gitroot/biogrinder/biogrinder</pre>
+
+<h1 id="COPYRIGHT">COPYRIGHT</h1>
+
+<p>Copyright 2009-2013 Florent ANGLY <florent.angly at gmail.com></p>
+
+<p>Grinder is free software: you can redistribute it and/or modify it under the terms of the GNU General Public License (GPL) as published by the Free Software Foundation, either version 3 of the License, or (at your option) any later version. Grinder is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. You should have rece [...]
+
+<h1 id="BUGS">BUGS</h1>
+
+<p>All complex software has bugs lurking in it, and this program is no exception. If you find a bug, please report it on the SourceForge Tracker for Grinder: <a href="http://sourceforge.net/tracker/?group_id=244196&atid=1124737">http://sourceforge.net/tracker/?group_id=244196&atid=1124737</a></p>
+
+<p>Bug reports, suggestions and patches are welcome. Grinder's code is developed on Sourceforge (<a href="http://sourceforge.net/scm/?type=git&group_id=244196">http://sourceforge.net/scm/?type=git&group_id=244196</a>) and is under Git revision control. To get started with a patch, do:</p>
+
+<pre><code> git clone git://biogrinder.git.sourceforge.net/gitroot/biogrinder/biogrinder</code></pre>
+
</body>
</html>
+
+
diff --git a/script/grinder b/bin/grinder
similarity index 100%
rename from script/grinder
rename to bin/grinder
diff --git a/script/grinder.pod b/bin/grinder.pod
similarity index 98%
rename from script/grinder.pod
rename to bin/grinder.pod
index 65b5748..78bbb7a 100644
--- a/script/grinder.pod
+++ b/bin/grinder.pod
@@ -1,4 +1,4 @@
-# This file was automatically generated by Getopt::Euclid. Do not edit it.
+# This file was generated dynamically by Getopt::Euclid. Do not edit it.
=head1 NAME
@@ -118,7 +118,7 @@ Available from L<http://dx.doi.org/10.1093/nar/gks251>.
=head1 VERSION
-This document refers to grinder version 0.5.2
+This document refers to grinder version 0.5.3
=head1 AUTHOR
@@ -619,10 +619,10 @@ different MID tags with <multiplex_mids>. Default: 1
=item -mi <multiplex_ids> | -multiplex_ids <multiplex_ids>
Specify an optional FASTA file that contains multiplex sequence identifiers
-(a.k.a MIDs or barcodes) to add to the sequences (one sequence per library). The
-MIDs are included in the length specified with the -read_dist option and can be
-altered by sequencing errors. See the MIDesigner or BarCrawl programs to
-generate MID sequences.
+(a.k.a MIDs or barcodes) to add to the sequences (one sequence per library, in
+the order given). The MIDs are included in the length specified with the
+-read_dist option and can be altered by sequencing errors. See the MIDesigner or
+BarCrawl programs to generate MID sequences.
=item -di <diversity>... | -diversity <diversity>...
@@ -642,7 +642,7 @@ libraries, specify the percent of reference sequences they should have in common
This option controls another aspect of beta-diversity. For multiple libraries,
choose the percent of the most-abundant reference sequences to permute (randomly
-shuffle) the rank-abundance of. Default: 0 %
+shuffle) the rank-abundance of. Default: 100 %
=item -rs <random_seed> | -random_seed <random_seed>
@@ -841,7 +841,7 @@ Returns : seed number
=head1 COPYRIGHT
-Copyright 2009-2012 Florent ANGLY <florent.angly at gmail.com>
+Copyright 2009-2013 Florent ANGLY <florent.angly at gmail.com>
Grinder is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License (GPL) as published by
diff --git a/inc/Module/AutoInstall.pm b/inc/Module/AutoInstall.pm
index aa7aa92..22dfa82 100644
--- a/inc/Module/AutoInstall.pm
+++ b/inc/Module/AutoInstall.pm
@@ -8,7 +8,7 @@ use ExtUtils::MakeMaker ();
use vars qw{$VERSION};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
}
# special map on pre-defined feature sets
@@ -115,7 +115,7 @@ sub import {
print "*** $class version " . $class->VERSION . "\n";
print "*** Checking for Perl dependencies...\n";
- my $cwd = Cwd::cwd();
+ my $cwd = Cwd::getcwd();
$Config = [];
@@ -166,7 +166,7 @@ sub import {
$modules = [ %{$modules} ] if UNIVERSAL::isa( $modules, 'HASH' );
unshift @$modules, -default => &{ shift(@$modules) }
- if ( ref( $modules->[0] ) eq 'CODE' ); # XXX: bugward combatability
+ if ( ref( $modules->[0] ) eq 'CODE' ); # XXX: bugward compatibility
while ( my ( $mod, $arg ) = splice( @$modules, 0, 2 ) ) {
if ( $mod =~ m/^-(\w+)$/ ) {
@@ -345,22 +345,26 @@ sub install {
my $i; # used below to strip leading '-' from config keys
my @config = ( map { s/^-// if ++$i; $_ } @{ +shift } );
- my ( @modules, @installed );
- while ( my ( $pkg, $ver ) = splice( @_, 0, 2 ) ) {
+ my ( @modules, @installed, @modules_to_upgrade );
+ while (my ($pkg, $ver) = splice(@_, 0, 2)) {
- # grep out those already installed
- if ( _version_cmp( _version_of($pkg), $ver ) >= 0 ) {
- push @installed, $pkg;
- }
- else {
- push @modules, $pkg, $ver;
- }
- }
+ # grep out those already installed
+ if (_version_cmp(_version_of($pkg), $ver) >= 0) {
+ push @installed, $pkg;
+ if ($UpgradeDeps) {
+ push @modules_to_upgrade, $pkg, $ver;
+ }
+ }
+ else {
+ push @modules, $pkg, $ver;
+ }
+ }
- if ($UpgradeDeps) {
- push @modules, @installed;
- @installed = ();
- }
+ if ($UpgradeDeps) {
+ push @modules, @modules_to_upgrade;
+ @installed = ();
+ @modules_to_upgrade = ();
+ }
return @installed unless @modules; # nothing to do
return @installed if _check_lock(); # defer to the CPAN shell
@@ -533,7 +537,7 @@ sub _install_cpan {
while ( my ( $opt, $arg ) = splice( @config, 0, 2 ) ) {
( $args{$opt} = $arg, next )
if $opt =~ /^(?:force|notest)$/; # pseudo-option
- $CPAN::Config->{$opt} = $arg;
+ $CPAN::Config->{$opt} = $opt eq 'urllist' ? [$arg] : $arg;
}
if ($args{notest} && (not CPAN::Shell->can('notest'))) {
@@ -611,7 +615,7 @@ sub _under_cpan {
require Cwd;
require File::Spec;
- my $cwd = File::Spec->canonpath( Cwd::cwd() );
+ my $cwd = File::Spec->canonpath( Cwd::getcwd() );
my $cpan = File::Spec->canonpath( $CPAN::Config->{cpan_home} );
return ( index( $cwd, $cpan ) > -1 );
@@ -927,4 +931,4 @@ END_MAKE
__END__
-#line 1193
+#line 1197
diff --git a/inc/Module/Install.pm b/inc/Module/Install.pm
index 4ecf46b..f44ab4d 100644
--- a/inc/Module/Install.pm
+++ b/inc/Module/Install.pm
@@ -17,7 +17,7 @@ package Module::Install;
# 3. The ./inc/ version of Module::Install loads
# }
-use 5.005;
+use 5.006;
use strict 'vars';
use Cwd ();
use File::Find ();
@@ -31,7 +31,7 @@ BEGIN {
# This is not enforced yet, but will be some time in the next few
# releases once we can make sure it won't clash with custom
# Module::Install extensions.
- $VERSION = '1.06';
+ $VERSION = '1.16';
# Storage for the pseudo-singleton
$MAIN = undef;
@@ -156,10 +156,10 @@ END_DIE
sub autoload {
my $self = shift;
my $who = $self->_caller;
- my $cwd = Cwd::cwd();
+ my $cwd = Cwd::getcwd();
my $sym = "${who}::AUTOLOAD";
$sym->{$cwd} = sub {
- my $pwd = Cwd::cwd();
+ my $pwd = Cwd::getcwd();
if ( my $code = $sym->{$pwd} ) {
# Delegate back to parent dirs
goto &$code unless $cwd eq $pwd;
@@ -239,7 +239,7 @@ sub new {
# ignore the prefix on extension modules built from top level.
my $base_path = Cwd::abs_path($FindBin::Bin);
- unless ( Cwd::abs_path(Cwd::cwd()) eq $base_path ) {
+ unless ( Cwd::abs_path(Cwd::getcwd()) eq $base_path ) {
delete $args{prefix};
}
return $args{_self} if $args{_self};
@@ -338,7 +338,7 @@ sub find_extensions {
if ( $subpath eq lc($subpath) || $subpath eq uc($subpath) ) {
my $content = Module::Install::_read($subpath . '.pm');
my $in_pod = 0;
- foreach ( split //, $content ) {
+ foreach ( split /\n/, $content ) {
$in_pod = 1 if /^=\w/;
$in_pod = 0 if /^=cut/;
next if ($in_pod || /^=cut/); # skip pod text
@@ -378,6 +378,7 @@ eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@;
sub _read {
local *FH;
open( FH, '<', $_[0] ) or die "open($_[0]): $!";
+ binmode FH;
my $string = do { local $/; <FH> };
close FH or die "close($_[0]): $!";
return $string;
@@ -386,6 +387,7 @@ END_NEW
sub _read {
local *FH;
open( FH, "< $_[0]" ) or die "open($_[0]): $!";
+ binmode FH;
my $string = do { local $/; <FH> };
close FH or die "close($_[0]): $!";
return $string;
@@ -416,6 +418,7 @@ eval( $] >= 5.006 ? <<'END_NEW' : <<'END_OLD' ); die $@ if $@;
sub _write {
local *FH;
open( FH, '>', $_[0] ) or die "open($_[0]): $!";
+ binmode FH;
foreach ( 1 .. $#_ ) {
print FH $_[$_] or die "print($_[0]): $!";
}
@@ -425,6 +428,7 @@ END_NEW
sub _write {
local *FH;
open( FH, "> $_[0]" ) or die "open($_[0]): $!";
+ binmode FH;
foreach ( 1 .. $#_ ) {
print FH $_[$_] or die "print($_[0]): $!";
}
@@ -434,7 +438,7 @@ END_OLD
# _version is for processing module versions (eg, 1.03_05) not
# Perl versions (eg, 5.8.1).
-sub _version ($) {
+sub _version {
my $s = shift || 0;
my $d =()= $s =~ /(\.)/g;
if ( $d >= 2 ) {
@@ -450,12 +454,12 @@ sub _version ($) {
return $l + 0;
}
-sub _cmp ($$) {
+sub _cmp {
_version($_[1]) <=> _version($_[2]);
}
# Cloned from Params::Util::_CLASS
-sub _CLASS ($) {
+sub _CLASS {
(
defined $_[0]
and
diff --git a/inc/Module/Install/AutoInstall.pm b/inc/Module/Install/AutoInstall.pm
index 6efe4fe..e19d259 100644
--- a/inc/Module/Install/AutoInstall.pm
+++ b/inc/Module/Install/AutoInstall.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
diff --git a/inc/Module/Install/Base.pm b/inc/Module/Install/Base.pm
index 802844a..5762a74 100644
--- a/inc/Module/Install/Base.pm
+++ b/inc/Module/Install/Base.pm
@@ -4,7 +4,7 @@ package Module::Install::Base;
use strict 'vars';
use vars qw{$VERSION};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
}
# Suspend handler for "redefined" warnings
diff --git a/inc/Module/Install/Can.pm b/inc/Module/Install/Can.pm
index 22167b8..d859276 100644
--- a/inc/Module/Install/Can.pm
+++ b/inc/Module/Install/Can.pm
@@ -8,7 +8,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
diff --git a/inc/Module/Install/Fetch.pm b/inc/Module/Install/Fetch.pm
index bee0c4f..41d3517 100644
--- a/inc/Module/Install/Fetch.pm
+++ b/inc/Module/Install/Fetch.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
diff --git a/inc/Module/Install/Include.pm b/inc/Module/Install/Include.pm
index 8310e4c..2eb1d1f 100644
--- a/inc/Module/Install/Include.pm
+++ b/inc/Module/Install/Include.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
diff --git a/inc/Module/Install/Makefile.pm b/inc/Module/Install/Makefile.pm
index 7052f36..e9918d2 100644
--- a/inc/Module/Install/Makefile.pm
+++ b/inc/Module/Install/Makefile.pm
@@ -8,7 +8,7 @@ use Fcntl qw/:flock :seek/;
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
@@ -133,7 +133,7 @@ sub makemaker_args {
return $args;
}
-# For mm args that take multiple space-seperated args,
+# For mm args that take multiple space-separated args,
# append an argument to the current list.
sub makemaker_append {
my $self = shift;
diff --git a/inc/Module/Install/Metadata.pm b/inc/Module/Install/Metadata.pm
index 58430f3..9792685 100644
--- a/inc/Module/Install/Metadata.pm
+++ b/inc/Module/Install/Metadata.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
@@ -347,7 +347,7 @@ sub name_from {
^ \s*
package \s*
([\w:]+)
- \s* ;
+ [\s|;]*
/ixms
) {
my ($name, $module_name) = ($1, $1);
@@ -705,7 +705,7 @@ sub _write_mymeta_data {
my @yaml = Parse::CPAN::Meta::LoadFile('META.yml');
my $meta = $yaml[0];
- # Overwrite the non-configure dependency hashs
+ # Overwrite the non-configure dependency hashes
delete $meta->{requires};
delete $meta->{build_requires};
delete $meta->{recommends};
diff --git a/inc/Module/Install/ReadmeFromPod.pm b/inc/Module/Install/ReadmeFromPod.pm
index fecda2b..b5e03c3 100644
--- a/inc/Module/Install/ReadmeFromPod.pm
+++ b/inc/Module/Install/ReadmeFromPod.pm
@@ -7,7 +7,7 @@ use warnings;
use base qw(Module::Install::Base);
use vars qw($VERSION);
-$VERSION = '0.16';
+$VERSION = '0.22';
sub readme_from {
my $self = shift;
diff --git a/inc/Module/Install/Scripts.pm b/inc/Module/Install/Scripts.pm
index 419286f..333d49b 100644
--- a/inc/Module/Install/Scripts.pm
+++ b/inc/Module/Install/Scripts.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
diff --git a/inc/Module/Install/Win32.pm b/inc/Module/Install/Win32.pm
index eeaa3fe..218a66b 100644
--- a/inc/Module/Install/Win32.pm
+++ b/inc/Module/Install/Win32.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = 'Module::Install::Base';
$ISCORE = 1;
}
diff --git a/inc/Module/Install/WriteAll.pm b/inc/Module/Install/WriteAll.pm
index 85d8018..530749b 100644
--- a/inc/Module/Install/WriteAll.pm
+++ b/inc/Module/Install/WriteAll.pm
@@ -6,7 +6,7 @@ use Module::Install::Base ();
use vars qw{$VERSION @ISA $ISCORE};
BEGIN {
- $VERSION = '1.06';
+ $VERSION = '1.16';
@ISA = qw{Module::Install::Base};
$ISCORE = 1;
}
diff --git a/lib/Bio/DB/Fasta.pm b/lib/Bio/DB/Fasta.pm
deleted file mode 100644
index eb153b2..0000000
--- a/lib/Bio/DB/Fasta.pm
+++ /dev/null
@@ -1,455 +0,0 @@
-#
-# BioPerl module for Bio::DB::Fasta
-#
-# You may distribute this module under the same terms as perl itself
-#
-
-
-=head1 NAME
-
-Bio::DB::Fasta - Fast indexed access to fasta files
-
-=head1 SYNOPSIS
-
- use Bio::DB::Fasta;
-
- # Create database from a directory of Fasta files
- my $db = Bio::DB::Fasta->new('/path/to/fasta/files/');
- my @ids = $db->get_all_primary_ids;
-
- # Simple access
- my $seqstr = $db->seq('CHROMOSOME_I', 4_000_000 => 4_100_000);
- my $revseq = $db->seq('CHROMOSOME_I', 4_100_000 => 4_000_000);
- my $length = $db->length('CHROMOSOME_I');
- my $header = $db->header('CHROMOSOME_I');
- my $alphabet = $db->alphabet('CHROMOSOME_I');
-
- # Access to sequence objects. See Bio::PrimarySeqI.
- my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
- my $seqstr = $seq->seq;
- my $subseq = $seq->subseq(4_000_000 => 4_100_000);
- my $trunc = $seq->trunc(4_000_000 => 4_100_000);
- my $length = $seq->length;
-
- # Loop through sequence objects
- my $stream = $db->get_PrimarySeq_stream;
- while (my $seq = $stream->next_seq) {
- # Bio::PrimarySeqI stuff
- }
-
- # Filehandle access
- my $fh = Bio::DB::Fasta->newFh('/path/to/fasta/files/');
- while (my $seq = <$fh>) {
- # Bio::PrimarySeqI stuff
- }
-
- # Tied hash access
- tie %sequences,'Bio::DB::Fasta','/path/to/fasta/files/';
- print $sequences{'CHROMOSOME_I:1,20000'};
-
-=head1 DESCRIPTION
-
-Bio::DB::Fasta provides indexed access to a single Fasta file, several files,
-or a directory of files. It provides persistent random access to each sequence
-entry (either as a Bio::PrimarySeqI-compliant object or a string), and to
-subsequences within each entry, allowing you to retrieve portions of very large
-sequences without bringing the entire sequence into memory. Bio::DB::Fasta is
-based on Bio::DB::IndexedBase. See this module's documentation for details.
-
-The Fasta files may contain any combination of nucleotide and protein sequences;
-during indexing the module guesses the molecular type. Entries may have any line
-length up to 65,536 characters, and different line lengths are allowed in the
-same file. However, within a sequence entry, all lines must be the same length
-except for the last. An error will be thrown if this is not the case.
-
-The module uses /^E<gt>(\S+)/ to extract the primary ID of each sequence
-from the Fasta header. See -makeid in Bio::DB::IndexedBase to pass a callback
-routine to reversibly modify this primary ID, e.g. if you wish to extract a
-specific portion of the gi|gb|abc|xyz GenBank IDs.
-
-=head1 DATABASE CREATION AND INDEXING
-
-The object-oriented constructor is new(), the filehandle constructor is newFh()
-and the tied hash constructor is tie(). They all allow indexing a single Fasta
-file, several files, or a directory of files. See Bio::DB::IndexedBase.
-
-=head1 SEE ALSO
-
-L<Bio::DB::IndexedBase>
-
-L<Bio::DB::Qual>
-
-L<Bio::PrimarySeqI>
-
-=head1 AUTHOR
-
-Lincoln Stein E<lt>lstein at cshl.orgE<gt>.
-
-Copyright (c) 2001 Cold Spring Harbor Laboratory.
-
-This library is free software; you can redistribute it and/or modify
-it under the same terms as Perl itself. See DISCLAIMER.txt for
-disclaimers of warranty.
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-For BioPerl-style access, the following methods are provided:
-
-=head2 get_Seq_by_id
-
- Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_primary_id
- Usage : my $seq = $db->get_Seq_by_id($id);
- Function: Given an ID, fetch the corresponding sequence from the database.
- Returns : A Bio::PrimarySeq::Fasta object (Bio::PrimarySeqI compliant)
- Note that to save resource, Bio::PrimarySeq::Fasta sequence objects
- only load the sequence string into memory when requested using seq().
- See L<Bio::PrimarySeqI> for methods provided by the sequence objects
- returned from get_Seq_by_id() and get_PrimarySeq_stream().
- Args : ID
-
-=head2 get_PrimarySeq_stream
-
- Title : get_PrimarySeq_stream
- Usage : my $stream = $db->get_PrimarySeq_stream();
- Function: Get a stream of Bio::PrimarySeq::Fasta objects. The stream supports a
- single method, next_seq(). Each call to next_seq() returns a new
- Bio::PrimarySeq::Fasta sequence object, until no more sequences remain.
- Returns : A Bio::DB::Indexed::Stream object
- Args : None
-
-=head1
-
-For simple access, the following methods are provided:
-
-=cut
-
-
-package Bio::DB::Fasta;
-
-use strict;
-use IO::File;
-use File::Spec;
-use Bio::PrimarySeqI;
-
-use base qw(Bio::DB::IndexedBase);
-
-our $obj_class = 'Bio::PrimarySeq::Fasta';
-our $file_glob = '*.{fa,FA,fasta,FASTA,fast,FAST,dna,DNA,fna,FNA,faa,FAA,fsa,FSA}';
-
-
-=head2 new
-
- Title : new
- Usage : my $db = Bio::DB::Fasta->new( $path, %options);
- Function: Initialize a new database object. When indexing a directory, files
- ending in .fa,fasta,fast,dna,fna,faa,fsa are indexed by default.
- Returns : A new Bio::DB::Fasta object.
- Args : A single file, or path to dir, or arrayref of files
- Optional arguments: see Bio::DB::IndexedBase
-
-=cut
-
-
-sub _calculate_offsets {
- # Bio::DB::IndexedBase calls this to calculate offsets
- my ($self, $fileno, $file, $offsets) = @_;
-
- my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
- binmode $fh;
- warn "Indexing $file\n" if $self->{debug};
- my ($offset, @ids, $linelen, $alphabet, $headerlen, $count, $seq_lines,
- $last_line, %offsets);
- my ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
-
- my $termination_length = $self->{termination_length};
- while (my $line = <$fh>) {
- # Account for crlf-terminated Windows files
- if (index($line, '>') == 0) {
- if ($line =~ /^>(\S+)/) {
- print STDERR "Indexed $count sequences...\n"
- if $self->{debug} && (++$count%1000) == 0;
-
- $self->_check_linelength($linelen);
- my $pos = tell($fh);
- if (@ids) {
- my $strlen = $pos - $offset - length($line);
- $strlen -= $termination_length * $seq_lines;
- my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen,
- $linelen, $headerlen, $alphabet, $fileno);
- $alphabet = Bio::DB::IndexedBase::NA;
- for my $id (@ids) {
- $offsets->{$id} = $ppos;
- }
- }
- @ids = $self->_makeid($line);
- ($offset, $headerlen, $linelen, $seq_lines) = ($pos, length $line, 0, 0);
- ($l3_len, $l2_len, $l_len, $blank_lines) = (0, 0, 0, 0);
-
- } else {
- # Catch bad header lines, bug 3172
- $self->throw("FASTA header doesn't match '>(\\S+)': $line");
- }
- } elsif ($line !~ /\S/) {
- # Skip blank line
- $blank_lines++;
- next;
- } else {
- # Need to check every line :(
- $l3_len = $l2_len;
- $l2_len = $l_len;
- $l_len = length $line;
- if (Bio::DB::IndexedBase::DIE_ON_MISSMATCHED_LINES) {
- if ( ($l3_len > 0) && ($l2_len > 0) && ($l3_len != $l2_len) ) {
- my $fap = substr($line, 0, 20)."..";
- $self->throw("Each line of the fasta entry must be the same ".
- "length except the last. Line above #$. '$fap' is $l2_len".
- " != $l3_len chars.");
- }
- if ($blank_lines) {
- # Blank lines not allowed in entry
- $self->throw("Blank lines can only precede header lines, ".
- "found preceding line #$.");
- }
- }
- $linelen ||= length $line;
- $alphabet ||= $self->_guess_alphabet($line);
- $seq_lines++;
- }
- $last_line = $line;
- }
-
- # Process last entry
- $self->_check_linelength($linelen);
- my $pos = tell $fh;
- if (@ids) {
- my $strlen = $pos - $offset;
- if ($linelen == 0) { # yet another pesky empty chr_random.fa file
- $strlen = 0;
- } else {
- if ($last_line !~ /\s$/) {
- $seq_lines--;
- }
- $strlen -= $termination_length * $seq_lines;
- }
- my $ppos = &{$self->{packmeth}}($offset, $strlen, $strlen, $linelen,
- $headerlen, $alphabet, $fileno);
- for my $id (@ids) {
- $offsets->{$id} = $ppos;
- }
- }
-
- return \%offsets;
-}
-
-
-=head2 seq
-
- Title : seq, sequence, subseq
- Usage : # Entire sequence string
- my $seqstr = $db->seq($id);
- # Subsequence
- my $subseqstr = $db->seq($id, $start, $stop, $strand);
- # or...
- my $subseqstr = $db->seq($compound_id);
- Function: Get a subseq of a sequence from the database. For your convenience,
- the sequence to extract can be specified with any of the following
- compound IDs:
- $db->seq("$id:$start,$stop")
- $db->seq("$id:$start..$stop")
- $db->seq("$id:$start-$stop")
- $db->seq("$id:$start,$stop/$strand")
- $db->seq("$id:$start..$stop/$strand")
- $db->seq("$id:$start-$stop/$strand")
- $db->seq("$id/$strand")
- In the case of DNA or RNA sequence, if $stop is less than $start,
- then the reverse complement of the sequence is returned. Avoid using
- it if possible since this goes against Bio::Seq conventions.
- Returns : A string
- Args : ID of sequence to retrieve
- or
- Compound ID of subsequence to fetch
- or
- ID, optional start (defaults to 1), optional end (defaults to length
- of sequence) and optional strand (defaults to 1).
-
-=cut
-
-sub subseq {
- my ($self, $id, $start, $stop, $strand) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- ($id, $start, $stop, $strand) = $self->_parse_compound_id($id, $start, $stop, $strand);
-
- my $data;
-
- my $fh = $self->_fh($id) or return;
- my $filestart = $self->_calc_offset($id, $start);
- my $filestop = $self->_calc_offset($id, $stop );
-
- seek($fh, $filestart,0);
- read($fh, $data, $filestop-$filestart+1);
- $data =~ s/\n//g;
- $data =~ s/\r//g;
-
- if ($strand == -1) {
- # Reverse-complement the sequence
- $data = Bio::PrimarySeqI::_revcom_from_string($self, $data, $self->alphabet($id));
- }
- return $data;
-}
-
-*seq = *sequence = \&subseq;
-
-
-=head2 length
-
- Title : length
- Usage : my $length = $db->length($id);
- Function: Get the number of residues in the indicated sequence.
- Returns : Number
- Args : ID of entry
-
-=head2 header
-
- Title : header
- Usage : my $header = $db->header($id);
- Function: Get the header line (ID and description fields) of the specified
- sequence.
- Returns : String
- Args : ID of sequence
-
-=cut
-
-sub header {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my ($offset, $headerlen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,4];
- $offset -= $headerlen;
- my $data;
- my $fh = $self->_fh($id) or return;
- seek($fh, $offset, 0);
- read($fh, $data, $headerlen);
- chomp $data;
- substr($data, 0, 1) = '';
- return $data;
-}
-
-
-=head2 alphabet
-
- Title : alphabet
- Usage : my $alphabet = $db->alphabet($id);
- Function: Get the molecular type of the indicated sequence: dna, rna or protein
- Returns : String
- Args : ID of sequence
-
-=cut
-
-
-#-------------------------------------------------------------
-# Bio::PrimarySeqI compatibility
-#
-package Bio::PrimarySeq::Fasta;
-use overload '""' => 'display_id';
-
-use base qw(Bio::Root::Root Bio::PrimarySeqI);
-
-sub new {
- my ($class, @args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($db, $id, $start, $stop) = $self->_rearrange(
- [qw(DATABASE ID START STOP)],
- @args);
- $self->{db} = $db;
- $self->{id} = $id;
- $self->{stop} = $stop || $db->length($id);
- $self->{start} = $start || ($self->{stop} > 0 ? 1 : 0); # handle 0-length seqs
- return $self;
-}
-
-sub fetch_sequence {
- return shift->seq(@_);
-}
-
-sub seq {
- my $self = shift;
- return $self->{db}->seq($self->{id}, $self->{start}, $self->{stop});
-}
-
-sub subseq {
- my $self = shift;
- return $self->trunc(@_)->seq();
-}
-
-sub trunc {
- # Override Bio::PrimarySeqI trunc() method. This way, we create an object
- # that does not store the sequence in memory.
- my ($self, $start, $stop) = @_;
- $self->throw("Stop cannot be smaller than start") if $stop < $start;
- if ($self->{start} <= $self->{stop}) {
- $start = $self->{start}+$start-1;
- $stop = $self->{start}+$stop-1;
- } else {
- $start = $self->{start}-($start-1);
- $stop = $self->{start}-($stop-1);
- }
- return $self->new( $self->{db}, $self->{id}, $start, $stop );
-}
-
-sub is_circular {
- my $self = shift;
- return $self->{is_circular};
-}
-
-sub display_id {
- my $self = shift;
- return $self->{id};
-}
-
-sub accession_number {
- my $self = shift;
- return 'unknown';
-}
-
-sub primary_id {
- # Following Bio::PrimarySeqI, since this sequence has no accession number,
- # its primary_id should be a stringified memory location.
- my $self = shift;
- return overload::StrVal($self);
-}
-
-sub can_call_new {
- return 0;
-}
-
-sub alphabet {
- my $self = shift;
- return $self->{db}->alphabet($self->{id});
-}
-
-sub revcom {
- # Override Bio::PrimarySeqI revcom() with optimized method.
- my $self = shift;
- return $self->new(@{$self}{'db', 'id', 'stop', 'start'});
-}
-
-sub length {
- # Get length from sequence location, not the sequence string (too expensive)
- my $self = shift;
- return $self->{start} < $self->{stop} ?
- $self->{stop} - $self->{start} + 1 :
- $self->{start} - $self->{stop} + 1 ;
-}
-
-sub description {
- my $self = shift;
- my $header = $self->{'db'}->header($self->{id});
- # Remove the ID from the header
- return (split(/\s+/, $header, 2))[1];
-}
-*desc = \&description;
-
-
-1;
diff --git a/lib/Bio/DB/IndexedBase.pm b/lib/Bio/DB/IndexedBase.pm
deleted file mode 100644
index 05d9de7..0000000
--- a/lib/Bio/DB/IndexedBase.pm
+++ /dev/null
@@ -1,1104 +0,0 @@
-#
-# BioPerl module for Bio::DB::IndexedBase
-#
-# You may distribute this module under the same terms as perl itself
-#
-
-
-=head1 NAME
-
-Bio::DB::IndexedBase - Base class for modules using indexed sequence files
-
-=head1 SYNOPSIS
-
- use Bio::DB::XXX; # a made-up class that uses Bio::IndexedBase
-
- # 1/ Bio::SeqIO-style access
-
- # Index some sequence files
- my $db = Bio::DB::XXX->new('/path/to/file'); # from a single file
- my $db = Bio::DB::XXX->new(['file1', 'file2']); # from multiple files
- my $db = Bio::DB::XXX->new('/path/to/files/'); # from a directory
-
- # Get IDs of all the sequences in the database
- my @ids = $db->get_all_primary_ids;
-
- # Get a specific sequence
- my $seq = $db->get_Seq_by_id('CHROMOSOME_I');
-
- # Loop through all sequences
- my $stream = $db->get_PrimarySeq_stream;
- while (my $seq = $stream->next_seq) {
- # Do something...
- }
-
-
- # 2/ Access via filehandle
- my $fh = Bio::DB::XXX->newFh('/path/to/file');
- while (my $seq = <$fh>) {
- # Do something...
- }
-
-
- # 3/ Tied-hash access
- tie %sequences, 'Bio::DB::XXX', '/path/to/file';
- print $sequences{'CHROMOSOME_I:1,20000'};
-
-=head1 DESCRIPTION
-
-Bio::DB::IndexedBase provides a base class for modules that want to index
-and read sequence files and provides persistent, random access to each sequence
-entry, without bringing the entire file into memory. This module is compliant
-with the Bio::DB::SeqI interface. Bio::DB::Fasta and Bio::DB::Qual both use
-Bio::DB::IndexedBase.
-
-When you initialize the module, you point it at a single file, several files, or
-a directory of files. The first time it is run, the module generates an index
-of the content of the files using the AnyDBM_File module (BerkeleyDB preferred,
-followed by GDBM_File, NDBM_File, and SDBM_File). Subsequently, it uses the
-index file to find the sequence file and offset for any requested sequence. If
-one of the source files is updated, the module reindexes just that one file. You
-can also force reindexing manually at any time. For improved performance, the
-module keeps a cache of open filehandles, closing less-recently used ones when
-the cache is full.
-
-Entries may have any line length up to 65,536 characters, and different line
-lengths are allowed in the same file. However, within a sequence entry, all
-lines must be the same length except for the last. An error will be thrown if
-this is not the case!
-
-This module was developed for use with the C. elegans and human genomes, and has
-been tested with sequence segments as large as 20 megabases. Indexing the C.
-elegans genome (100 megabases of genomic sequence plus 100,000 ESTs) takes ~5
-minutes on my 300 MHz pentium laptop. On the same system, average access time
-for any 200-mer within the C. elegans genome was E<lt>0.02s.
-
-=head1 DATABASE CREATION AND INDEXING
-
-The two constructors for this class are new() and newFh(). The former creates a
-Bio::DB::IndexedBase object which is accessed via method calls. The latter
-creates a tied filehandle which can be used Bio::SeqIO style to fetch sequence
-objects in a stream fashion. There is also a tied hash interface.
-
-=over
-
-=item $db = Bio::DB::IndexedBase-E<gt>new($path [,%options])
-
-Create a new Bio::DB::IndexedBase object from the files designated by $path.
-$path may be a single file, an arrayref of files, or a directory containing
-such files.
-
-After the database is created, you can use methods like get_all_primary_ids()
-and get_Seq_by_id() to retrieve sequence objects.
-
-=item $fh = Bio::DB::IndexedBase-E<gt>newFh($path [,%options])
-
-Create a tied filehandle opened on a Bio::DB::IndexedBase object. Reading
-from this filehandle with E<lt>E<gt> will return a stream of sequence objects,
-Bio::SeqIO style. The path and the options should be specified as for new().
-
-=item $obj = tie %db,'Bio::DB::IndexedBase', '/path/to/file' [,@args]
-
-Create a tied-hash by tieing %db to Bio::DB::IndexedBase using the indicated
-path to the files. The optional @args list is the same set used by new(). If
-successful, tie() returns the tied object, undef otherwise.
-
-Once tied, you can use the hash to retrieve an individual sequence by
-its ID, like this:
-
- my $seq = $db{CHROMOSOME_I};
-
-The keys() and values() functions will return the sequence IDs and their
-sequences, respectively. In addition, each() can be used to iterate over the
-entire data set:
-
- while (my ($id,$sequence) = each %db) {
- print "$id => $sequence\n";
- }
-
-
-When dealing with very large sequences, you can avoid bringing them into memory
-by calling each() in a scalar context. This returns the key only. You can then
-use tied(%db) to recover the Bio::DB::IndexedBase object and call its methods.
-
- while (my $id = each %db) {
- print "$id: ", $db{"$id:1,100"}, "\n";
- print "$id: ".tied(%db)->length($id)."\n";
- }
-
-In addition, you may invoke the FIRSTKEY and NEXTKEY tied hash methods directly
-to retrieve the first and next ID in the database, respectively. This allows you to
-write the following iterative loop using just the object-oriented interface:
-
- my $db = Bio::DB::IndexedBase->new('/path/to/file');
- for (my $id=$db->FIRSTKEY; $id; $id=$db->NEXTKEY($id)) {
- # do something with sequence
- }
-
-=back
-
-=head1 INDEX CONTENT
-
-Several attributes of each sequence are stored in the index file. Given a
-sequence ID, these attributes can be retrieved using the following methods:
-
-=over
-
-=item offset($id)
-
-Get the offset of the indicated sequence from the beginning of the file in which
-it is located. The offset points to the beginning of the sequence, not the
-beginning of the header line.
-
-=item strlen($id)
-
-Get the number of characters in the sequence string.
-
-=item length($id)
-
-Get the number of residues of the sequence.
-
-=item linelen($id)
-
-Get the length of the line for this sequence. If the sequence is wrapped, then
-linelen() is likely to be much shorter than strlen().
-
-=item headerlen($id)
-
-Get the length of the header line for the indicated sequence.
-
-=item header_offset
-
-Get the offset of the header line for the indicated sequence from the beginning
-of the file in which it is located. This attribute is not stored. It is
-calculated from offset() and headerlen().
-
-=item alphabet($id)
-
-Get the molecular type (alphabet) of the indicated sequence. This method handles
-residues according to the IUPAC convention.
-
-=item file($id)
-
-Get the name of the file in which the indicated sequence can be found.
-
-=back
-
-=head1 INTERFACE COMPLIANCE NOTES
-
-Bio::DB::IndexedBase is compliant with the Bio::DB::SeqI and hence with the
-Bio::RandomAccessI interfaces.
-
-Databases do not necessarily provide any meaningful internal primary ID for the
-sequences they store. However, Bio::DB::IndexedBase's internal primary IDs are
-the IDs of the sequences. This means that the same ID passed to get_Seq_by_id()
-and get_Seq_by_primary_id() will return the same sequence.
-
-Since this database index has no notion of sequence version or namespace, the
-get_Seq_by_id(), get_Seq_by_acc() and get_Seq_by_version() are identical.
-
-=head1 BUGS
-
-When a sequence is deleted from one of the files, this deletion is not detected
-by the module and removed from the index. As a result, a "ghost" entry will
-remain in the index and will return garbage results if accessed.
-
-Also, if you are indexing a directory, it is wise to not add or remove files
-from it.
-
-In case you have changed the files in a directory, or the sequences in a file,
-you can rebuild the entire index, either by deleting it manually, or by
-passing -reindex=E<gt>1 to new() when initializing the module.
-
-=head1 SEE ALSO
-
-L<DB_File>
-
-L<Bio::DB::Fasta>
-
-L<Bio::DB::Qual>
-
-=head1 AUTHOR
-
-Lincoln Stein E<lt>lstein at cshl.orgE<gt>.
-
-Copyright (c) 2001 Cold Spring Harbor Laboratory.
-
-Florent Angly (for the modularization)
-
-This library is free software; you can redistribute it and/or modify
-it under the same terms as Perl itself. See DISCLAIMER.txt for
-disclaimers of warranty.
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-package Bio::DB::IndexedBase;
-
-BEGIN {
- @AnyDBM_File::ISA = qw(DB_File GDBM_File NDBM_File SDBM_File)
- if(!$INC{'AnyDBM_File.pm'});
-}
-
-use strict;
-use IO::File;
-use AnyDBM_File;
-use Fcntl;
-use File::Spec;
-use File::Basename qw(basename dirname);
-use Bio::PrimarySeq;
-
-use base qw(Bio::DB::SeqI);
-
-# Store offset, strlen, linelen, headerlen, type and fileno
-use constant STRUCT => 'NNNnnCa*'; # 32-bit file offset and seq length
-use constant STRUCTBIG => 'QQQnnCa*'; # 64-bit
-
-use constant NA => 0;
-use constant DNA => 1;
-use constant RNA => 2;
-use constant PROTEIN => 3;
-
-use constant DIE_ON_MISSMATCHED_LINES => 1;
-# you can avoid dying if you want but you may get incorrect results
-
-
-=head2 new
-
- Title : new
- Usage : my $db = Bio::DB::IndexedBase->new($path, -reindex => 1);
- Function: Initialize a new database object
- Returns : A Bio::DB::IndexedBase object
- Args : A single file, or path to dir, or arrayref of files
- Optional arguments:
-
- Option Description Default
- ----------- ----------- -------
- -glob Glob expression to search for files in directories *
- -makeid A code subroutine for transforming IDs None
- -maxopen Maximum size of filehandle cache 32
- -debug Turn on status messages 0
- -reindex Force the index to be rebuilt 0
- -dbmargs Additional arguments to pass to the DBM routine None
- -index_name Name of the file that will hold the indices
- -clean Remove the index file when finished 0
-
-The -dbmargs option can be used to control the format of the index. For example,
-you can pass $DB_BTREE to this argument so as to force the IDs to be sorted and
-retrieved alphabetically. Note that you must use the same arguments every time
-you open the index!
-
-The -makeid option gives you a chance to modify sequence IDs during indexing.
-For example, you may wish to extract a portion of the gi|gb|abc|xyz nonsense
-that GenBank Fasta files use. The original header line can be recovered later.
-The option value for -makeid should be a code reference that takes a scalar
-argument (the full header line) and returns a scalar or an array of scalars (the
-ID or IDs you want to assign). For example:
-
- $db = Bio::DB::IndexedBase->new('file.fa', -makeid => \&extract_gi);
-
- sub extract_gi {
- # Extract GI from GenBank
- my $header = shift;
- my ($id) = ($header =~ /gi\|(\d+)/m);
- return $id || '';
- }
-
-extract_gi() will be called with the full header line, e.g. a Fasta line would
-include the "E<gt>", the ID and the description:
-
- >gi|352962132|ref|NG_030353.1| Homo sapiens sal-like 3 (Drosophila) (SALL3)
-
-In the database, this sequence can now be retrieved by its GI instead of its
-complete ID:
-
- my $seq = $db->get_Seq_by_id(352962132);
-
-The -makeid option is ignored after the index is constructed.
-
-=cut
-
-sub new {
- my ($class, $path, %opts) = @_;
-
- my $self = bless {
- debug => $opts{-debug} || 0,
- makeid => $opts{-makeid},
- glob => $opts{-glob} || eval '$'.$class.'::file_glob' || '*',
- maxopen => $opts{-maxopen} || 32,
- clean => $opts{-clean} || 0,
- dbmargs => $opts{-dbmargs} || undef,
- fhcache => {},
- cacheseq => {},
- curopen => 0,
- openseq => 1,
- dirname => undef,
- offsets => undef,
- index_name => $opts{-index_name},
- obj_class => eval '$'.$class.'::obj_class',
- offset_meth => \&{$class.'::_calculate_offsets'},
- fileno2path => [],
- filepath2no => {},
- }, $class;
-
- my ($offsets, $dirname);
- my $ref = ref $path || '';
- if ( $ref eq 'ARRAY' ) {
- $offsets = $self->index_files($path, $opts{-reindex});
- require Cwd;
- $dirname = Cwd::getcwd();
- } else {
- if (-d $path) {
- # because Win32 glob() is broken with respect to long file names
- # that contain whitespace.
- $path = Win32::GetShortPathName($path)
- if $^O =~ /^MSWin/i && eval 'use Win32; 1';
- $offsets = $self->index_dir($path, $opts{-reindex});
- $dirname = $path;
- } elsif (-f _) {
- $offsets = $self->index_file($path, $opts{-reindex});
- $dirname = dirname($path);
- } else {
- $self->throw( "$path: Invalid file or dirname");
- }
- }
- @{$self}{qw(dirname offsets)} = ($dirname, $offsets);
-
- return $self;
-}
-
-
-=head2 newFh
-
- Title : newFh
- Usage : my $fh = Bio::DB::IndexedBase->newFh('/path/to/files/', %options);
- Function: Index and get a new Fh for a single file, several files or a directory
- Returns : Filehandle object
- Args : Same as new()
-
-=cut
-
-sub newFh {
- my ($class, @args) = @_;
- my $self = $class->new(@args);
- require Symbol;
- my $fh = Symbol::gensym;
- tie $$fh, 'Bio::DB::Indexed::Stream', $self
- or $self->throw("Could not tie filehandle: $!");
- return $fh;
-}
-
-
-=head2 dbmargs
-
- Title : dbmargs
- Usage : my @args = $db->dbmargs;
- Function: Get stored dbm arguments
- Returns : Array
- Args : None
-
-=cut
-
-sub dbmargs {
- my $self = shift;
- my $args = $self->{dbmargs} or return;
- return ref($args) eq 'ARRAY' ? @$args : $args;
-}
-
-
-=head2 glob
-
- Title : glob
- Usage : my $glob = $db->glob;
- Function: Get the expression used to match files in directories
- Returns : String
- Args : None
-
-=cut
-
-sub glob {
- my $self = shift;
- return $self->{glob};
-}
-
-
-=head2 index_dir
-
- Title : index_dir
- Usage : $db->index_dir($dir);
- Function: Index the files that match -glob in the given directory
- Returns : Hashref of offsets
- Args : Dirname
- Boolean to force reindexing the directory
-
-=cut
-
-sub index_dir {
- my ($self, $dir, $force_reindex) = @_;
- my @files = glob( File::Spec->catfile($dir, $self->{glob}) );
- $self->throw("No suitable files found in $dir") if scalar @files == 0;
- $self->{index_name} ||= File::Spec->catfile($dir, 'directory.index');
- my $offsets = $self->_index_files(\@files, $force_reindex);
- return $offsets;
-}
-
-
-=head2 get_all_primary_ids
-
- Title : get_all_primary_ids, get_all_ids, ids
- Usage : my @ids = $db->get_all_primary_ids;
- Function: Get the IDs stored in all indexes. This is a Bio::DB::SeqI method
- implementation. Note that in this implementation, the internal
- database primary IDs are also the sequence IDs.
- Returns : List of ids
- Args : None
-
-=cut
-
-sub get_all_primary_ids {
- return keys %{shift->{offsets}};
-}
-
-*ids = *get_all_ids = \&get_all_primary_ids;
-
-
-=head2 index_file
-
- Title : index_file
- Usage : $db->index_file($filename);
- Function: Index the given file
- Returns : Hashref of offsets
- Args : Filename
- Boolean to force reindexing the file
-
-=cut
-
-sub index_file {
- my ($self, $file, $force_reindex) = @_;
- $self->{index_name} ||= "$file.index";
- my $offsets = $self->_index_files([$file], $force_reindex);
- return $offsets;
-}
-
-
-=head2 index_files
-
- Title : index_files
- Usage : $db->index_files(\@files);
- Function: Index the given files
- Returns : Hashref of offsets
- Args : Arrayref of filenames
- Boolean to force reindexing the files
-
-=cut
-
-sub index_files {
- my ($self, $files, $force_reindex) = @_;
- my @paths = map { File::Spec->rel2abs($_) } @$files;
- require Digest::MD5;
- my $digest = Digest::MD5::md5_hex( join('', sort @paths) );
- $self->{index_name} ||= "fileset_$digest.index"; # unique name for the given files
- my $offsets = $self->_index_files($files, $force_reindex);
- return $offsets;
-}
-
-
-=head2 index_name
-
- Title : index_name
- Usage : my $indexname = $db->index_name($path);
- Function: Get the full name of the index file
- Returns : String
- Args : None
-
-=cut
-
-sub index_name {
- return shift->{index_name};
-}
-
-
-=head2 path
-
- Title : path
- Usage : my $path = $db->path($path);
- Function: When a simple file or a directory of files is indexed, this returns
- the file directory. When indexing an arbitrary list of files, the
- return value is the path of the current working directory.
- Returns : String
- Args : None
-
-=cut
-
-sub path {
- return shift->{dirname};
-}
-
-
-=head2 get_PrimarySeq_stream
-
- Title : get_PrimarySeq_stream
- Usage : my $stream = $db->get_PrimarySeq_stream();
- Function: Get a SeqIO-like stream of sequence objects. The stream supports a
- single method, next_seq(). Each call to next_seq() returns a new
- PrimarySeqI compliant sequence object, until no more sequences remain.
- This is a Bio::DB::SeqI method implementation.
- Returns : A Bio::DB::Indexed::Stream object
- Args : None
-
-=cut
-
-sub get_PrimarySeq_stream {
- my $self = shift;
- return Bio::DB::Indexed::Stream->new($self);
-}
-
-
-=head2 get_Seq_by_id
-
- Title : get_Seq_by_id, get_Seq_by_acc, get_Seq_by_version, get_Seq_by_primary_id
- Usage : my $seq = $db->get_Seq_by_id($id);
- Function: Given an ID, fetch the corresponding sequence from the database.
- This is a Bio::DB::SeqI and Bio::DB::RandomAccessI method implementation.
- Returns : A sequence object
- Args : ID
-
-=cut
-
-sub get_Seq_by_id {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- return if not exists $self->{offsets}{$id};
- return $self->{obj_class}->new($self, $id);
-}
-
-*get_Seq_by_version = *get_Seq_by_primary_id = *get_Seq_by_acc = \&get_Seq_by_id;
-
-
-=head2 _calculate_offsets
-
- Title : _calculate_offsets
- Usage : $db->_calculate_offsets($filename, $offsets);
- Function: This method calculates the sequence offsets in a file based on ID and
- should be implemented by classes that use Bio::DB::IndexedBase.
- Returns : Hash of offsets
- Args : File to process
- Hashref of file offsets keyed by IDs.
-
-=cut
-
-sub _calculate_offsets {
- my $self = shift;
- $self->throw_not_implemented();
-}
-
-
-sub _index_files {
- # Do the indexing of the given files using the index file on record
- my ($self, $files, $force_reindex) = @_;
-
- $self->_set_pack_method( @$files );
-
- # Get name of index file
- my $index = $self->index_name;
-
- # If caller has requested reindexing, unlink the index file.
- unlink $index if $force_reindex;
-
- # Get the modification time of the index
- my $indextime = (stat $index)[9] || 0;
-
- # Register files and find if there has been any update
- my $modtime = 0;
- my @updated;
- for my $file (@$files) {
- # Register file
- $self->_path2fileno(basename($file));
- # Any update?
- my $m = (stat $file)[9] || 0;
- if ($m > $modtime) {
- $modtime = $m;
- }
- if ($m > $indextime) {
- push @updated, $file;
- }
- }
-
- # Get termination length from first file
- $self->{termination_length} = $self->_calc_termination_length( $files->[0] );
-
- # Reindex contents of changed files if needed
- my $reindex = $force_reindex || (scalar @updated > 0);
- $self->{offsets} = $self->_open_index($index, $reindex) or return;
- if ($reindex) {
- $self->{indexing} = $index;
- for my $file (@updated) {
- my $fileno = $self->_path2fileno(basename($file));
- &{$self->{offset_meth}}($self, $fileno, $file, $self->{offsets});
- }
- delete $self->{indexing};
- }
-
- # Closing and reopening might help corrupted index file problem on Windows
- $self->_close_index($self->{offsets});
-
- return $self->{offsets} = $self->_open_index($index);
-}
-
-
-sub _open_index {
- # Open index file in read-only or write mode
- my ($self, $index_file, $write) = @_;
- my %offsets;
- my $flags = $write ? O_CREAT|O_RDWR : O_RDONLY;
- my @dbmargs = $self->dbmargs;
- tie %offsets, 'AnyDBM_File', $index_file, $flags, 0644, @dbmargs
- or $self->throw( "Could not open index file $index_file: $!");
- return \%offsets;
-}
-
-
-sub _close_index {
- # Close index file
- my ($self, $index) = @_;
- untie %$index;
- return 1;
-}
-
-
-sub _parse_compound_id {
- # Handle compound IDs:
- # $db->seq($id)
- # $db->seq($id, $start, $stop, $strand)
- # $db->seq("$id:$start,$stop")
- # $db->seq("$id:$start..$stop")
- # $db->seq("$id:$start-$stop")
- # $db->seq("$id:$start,$stop/$strand")
- # $db->seq("$id:$start..$stop/$strand")
- # $db->seq("$id:$start-$stop/$strand")
- # $db->seq("$id/$strand")
- my ($self, $id, $start, $stop, $strand) = @_;
-
- if ( (not defined $start ) &&
- (not defined $stop ) &&
- (not defined $strand) &&
- ($id =~ /^ (.+?) (?:\:([\d_]+)(?:,|-|\.\.)([\d_]+))? (?:\/(.+))? $/x) ) {
- # Start, stop and strand not provided and ID looks like a compound ID
- ($id, $start, $stop, $strand) = ($1, $2, $3, $4);
- }
-
- # Start, stop and strand defaults
- $stop ||= $self->length($id) || 0; # 0 if sequence not found in database
- $start ||= ($stop > 0) ? 1 : 0;
- $strand ||= 1;
-
- # Convert numbers such as 1_000_000 to 1000000
- $start =~ s/_//g;
- $stop =~ s/_//g;
-
- if ($start > $stop) {
- # Change the strand
- ($start, $stop) = ($stop, $start);
- $strand *= -1;
- }
-
- return $id, $start, $stop, $strand;
-}
-
-
-sub _guess_alphabet {
- # Determine the molecular type of the given sequence string:
- # 'dna', 'rna', 'protein' or '' (unknown/empty)
- my ($self, $string) = @_;
- # Handle IUPAC residues like PrimarySeq does
- my $alphabet = Bio::PrimarySeq::_guess_alphabet_from_string($self, $string, 1);
- return $alphabet eq 'dna' ? DNA
- : $alphabet eq 'rna' ? RNA
- : $alphabet eq 'protein' ? PROTEIN
- : NA;
-}
-
-
-sub _makeid {
- # Process the header line by applying any transformation given in -makeid
- my ($self, $header_line) = @_;
- return ref($self->{makeid}) eq 'CODE' ? $self->{makeid}->($header_line) : $1;
-}
-
-
-sub _check_linelength {
- # Check that the line length is valid. Generate an error otherwise.
- my ($self, $linelength) = @_;
- return if not defined $linelength;
- $self->throw(
- "Each line of the qual file must be less than 65,536 characters. Line ".
- "$. is $linelength chars."
- ) if $linelength > 65535;
-}
-
-
-sub _calc_termination_length {
- # Try the beginning of the file to determine termination length
- # Account for crlf-terminated Windows and Mac files
- my ($self, $file) = @_;
- my $fh = IO::File->new($file) or $self->throw( "Could not open $file: $!");
- my $line = <$fh>;
- close $fh;
- $self->{termination_length} = ($line =~ /\r\n$/) ? 2 : 1;
- return $self->{termination_length};
-}
-
-
-sub _calc_offset {
- # Get the offset of the n-th residue of the sequence with the given ID
- # and termination length (tl)
- my ($self, $id, $n) = @_;
- my $tl = $self->{termination_length};
- $n--;
- my ($offset, $seqlen, $linelen) = (&{$self->{unpackmeth}}($self->{offsets}{$id}))[0,1,3];
- $n = 0 if $n < 0;
- $n = $seqlen-1 if $n >= $seqlen;
- return $offset + $linelen * int($n/($linelen-$tl)) + $n % ($linelen-$tl);
-}
-
-
-sub _fh {
- # Given a sequence ID, return the filehandle on which to find this sequence
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $file = $self->file($id) or return;
- return $self->_fhcache( File::Spec->catfile($self->{dirname}, $file) ) or
- $self->throw( "Can't open file $file");
-}
-
-
-sub _fhcache {
- my ($self, $path) = @_;
- if (!$self->{fhcache}{$path}) {
- if ($self->{curopen} >= $self->{maxopen}) {
- my @lru = sort {$self->{cacheseq}{$a} <=> $self->{cacheseq}{$b};}
- keys %{$self->{fhcache}};
- splice(@lru, $self->{maxopen} / 3);
- $self->{curopen} -= @lru;
- for (@lru) {
- delete $self->{fhcache}{$_};
- }
- }
- $self->{fhcache}{$path} = IO::File->new($path) || return;
- binmode $self->{fhcache}{$path};
- $self->{curopen}++;
- }
- $self->{cacheseq}{$path}++;
- return $self->{fhcache}{$path};
-}
-
-
-#-------------------------------------------------------------
-# Methods to store and retrieve data from indexed file
-#
-
-=head2 offset
-
- Title : offset
- Usage : my $offset = $db->offset($id);
- Function: Get the offset of the indicated sequence from the beginning of the
- file in which it is located. The offset points to the beginning of
- the sequence, not the beginning of the header line.
- Returns : String
- Args : ID of sequence
-
-=cut
-
-sub offset {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- return (&{$self->{unpackmeth}}($offset))[0];
-}
-
-
-=head2 strlen
-
- Title : strlen
- Usage : my $length = $db->strlen($id);
- Function: Get the number of characters in the sequence string.
- Returns : Integer
- Args : ID of sequence
-
-=cut
-
-sub strlen {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- return (&{$self->{unpackmeth}}($offset))[1];
-}
-
-
-=head2 length
-
- Title : length
- Usage : my $length = $db->length($id);
- Function: Get the number of residues of the sequence.
- Returns : Integer
- Args : ID of sequence
-
-=cut
-
-sub length {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- return (&{$self->{unpackmeth}}($offset))[2];
-}
-
-
-=head2 linelen
-
- Title : linelen
- Usage : my $linelen = $db->linelen($id);
- Function: Get the length of the line for this sequence.
- Returns : Integer
- Args : ID of sequence
-
-=cut
-
-sub linelen {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- return (&{$self->{unpackmeth}}($offset))[3];
-}
-
-
-=head2 headerlen
-
- Title : headerlen
- Usage : my $length = $db->headerlen($id);
- Function: Get the length of the header line for the indicated sequence.
- Returns : Integer
- Args : ID of sequence
-
-=cut
-
-sub headerlen {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- return (&{$self->{unpackmeth}}($offset))[4];
-}
-
-
-=head2 header_offset
-
- Title : header_offset
- Usage : my $offset = $db->header_offset($id);
- Function: Get the offset of the header line for the indicated sequence from
- the beginning of the file in which it is located.
- Returns : String
- Args : ID of sequence
-
-=cut
-
-sub header_offset {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- return if not $self->{offsets}{$id};
- return $self->offset($id) - $self->headerlen($id);
-}
-
-
-=head2 alphabet
-
- Title : alphabet
- Usage : my $alphabet = $db->alphabet($id);
- Function: Get the molecular type of the indicated sequence: dna, rna or protein
- Returns : String
- Args : ID of sequence
-
-=cut
-
-sub alphabet {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- my $alphabet = (&{$self->{unpackmeth}}($offset))[5];
- return : $alphabet == Bio::DB::IndexedBase::DNA ? 'dna'
- : $alphabet == Bio::DB::IndexedBase::RNA ? 'rna'
- : $alphabet == Bio::DB::IndexedBase::PROTEIN ? 'protein'
- : '';
-}
-
-
-=head2 file
-
- Title : file
- Usage : my $file = $db->file($id);
- Function: Get the the name of the file in which the indicated sequence can be
- found.
- Returns : String
- Args : ID of sequence
-
-=cut
-
-sub file {
- my ($self, $id) = @_;
- $self->throw('Need to provide a sequence ID') if not defined $id;
- my $offset = $self->{offsets}{$id} or return;
- return $self->_fileno2path((&{$self->{unpackmeth}}($offset))[6]);
-}
-
-
-sub _fileno2path {
- my ($self, $fileno) = @_;
- return $self->{fileno2path}->[$fileno];
-}
-
-
-sub _path2fileno {
- my ($self, $path) = @_;
- if ( not exists $self->{filepath2no}->{$path} ) {
- my $fileno = ($self->{filepath2no}->{$path} = 0+ $self->{fileno}++);
- $self->{fileno2path}->[$fileno] = $path; # Save path
- }
- return $self->{filepath2no}->{$path};
-
-}
-
-
-sub _packSmall {
- return pack STRUCT, @_;
-}
-
-
-sub _packBig {
- return pack STRUCTBIG, @_;
-}
-
-
-sub _unpackSmall {
- return unpack STRUCT, shift;
-}
-
-
-sub _unpackBig {
- return unpack STRUCTBIG, shift;
-}
-
-
-sub _set_pack_method {
- # Determine whether to use 32 or 64 bit integers for the given files.
- my $self = shift;
- # Find the maximum file size:
- my ($maxsize) = sort { $b <=> $a } map { -s $_ } @_;
- my $fourGB = (2 ** 32) - 1;
-
- if ($maxsize > $fourGB) {
- # At least one file exceeds 4Gb - we will need to use 64 bit ints
- $self->{packmeth} = \&_packBig;
- $self->{unpackmeth} = \&_unpackBig;
- } else {
- $self->{packmeth} = \&_packSmall;
- $self->{unpackmeth} = \&_unpackSmall;
- }
- return 1;
-}
-
-
-#-------------------------------------------------------------
-# Tied hash logic
-#
-
-sub TIEHASH {
- return shift->new(@_);
-}
-
-
-sub FETCH {
- return shift->subseq(@_);
-}
-
-
-sub STORE {
- shift->throw("Read-only database");
-}
-
-
-sub DELETE {
- shift->throw("Read-only database");
-}
-
-
-sub CLEAR {
- shift->throw("Read-only database");
-}
-
-
-sub EXISTS {
- return defined shift->offset(@_);
-}
-
-
-sub FIRSTKEY {
- return tied(%{shift->{offsets}})->FIRSTKEY(@_);
-}
-
-
-sub NEXTKEY {
- return tied(%{shift->{offsets}})->NEXTKEY(@_);
-}
-
-
-sub DESTROY {
- my $self = shift;
- if ( $self->{clean} || $self->{indexing} ) {
- # Indexing aborted or cleaning requested. Delete the index file.
- unlink $self->{index_name};
- }
- return 1;
-}
-
-
-#-------------------------------------------------------------
-# stream-based access to the database
-#
-
-package Bio::DB::Indexed::Stream;
-use base qw(Tie::Handle Bio::DB::SeqI);
-
-
-sub new {
- my ($class, $db) = @_;
- my $key = $db->FIRSTKEY;
- return bless {
- db => $db,
- key => $key
- }, $class;
-}
-
-sub next_seq {
- my $self = shift;
- my ($key, $db) = @{$self}{'key', 'db'};
- return if not defined $key;
- my $value = $db->get_Seq_by_id($key);
- $self->{key} = $db->NEXTKEY($key);
- return $value;
-}
-
-sub TIEHANDLE {
- my ($class, $db) = @_;
- return $class->new($db);
-}
-
-sub READLINE {
- my $self = shift;
- return $self->next_seq;
-}
-
-
-1;
diff --git a/lib/Bio/PrimarySeq.pm b/lib/Bio/PrimarySeq.pm
deleted file mode 100644
index cf1f5c0..0000000
--- a/lib/Bio/PrimarySeq.pm
+++ /dev/null
@@ -1,951 +0,0 @@
-#
-# bioperl module for Bio::PrimarySeq
-#
-# Please direct questions and support issues to <bioperl-l at bioperl.org>
-#
-# Cared for by Ewan Birney <birney at ebi.ac.uk>
-#
-# Copyright Ewan Birney
-#
-# You may distribute this module under the same terms as perl itself
-
-# POD documentation - main docs before the code
-
-=head1 NAME
-
-Bio::PrimarySeq - Bioperl lightweight sequence object
-
-=head1 SYNOPSIS
-
- # Bio::SeqIO for file reading, Bio::DB::GenBank for
- # database reading
-
- use Bio::Seq;
- use Bio::SeqIO;
- use Bio::DB::GenBank;
-
- # make from memory
-
- $seqobj = Bio::PrimarySeq->new (
- -seq => 'ATGGGGTGGGCGGTGGGTGGTTTG',
- -id => 'GeneFragment-12',
- -accession_number => 'X78121',
- -alphabet => 'dna',
- -is_circular => 1,
- );
- print "Sequence ", $seqobj->id(), " with accession ",
- $seqobj->accession_number, "\n";
-
- # read from file
-
- $inputstream = Bio::SeqIO->new(
- -file => "myseq.fa",
- -format => 'Fasta',
- );
- $seqobj = $inputstream->next_seq();
- print "Sequence ", $seqobj->id(), " and desc ", $seqobj->desc, "\n";
-
- # to get out parts of the sequence.
-
- print "Sequence ", $seqobj->id(), " with accession ",
- $seqobj->accession_number, " and desc ", $seqobj->desc, "\n";
-
- $string = $seqobj->seq();
- $string2 = $seqobj->subseq(1,40);
-
-=head1 DESCRIPTION
-
-PrimarySeq is a lightweight sequence object, storing the sequence, its
-name, a computer-useful unique name, and other fundamental attributes.
-It does not contain sequence features or other information. To have a
-sequence with sequence features you should use the Seq object which uses
-this object.
-
-Although new users will use Bio::PrimarySeq a lot, in general you will
-be using it from the Bio::Seq object. For more information on Bio::Seq
-see L<Bio::Seq>. For interest you might like to know that
-Bio::Seq has-a Bio::PrimarySeq and forwards most of the function calls
-to do with sequence to it (the has-a relationship lets us get out of a
-otherwise nasty cyclical reference in Perl which would leak memory).
-
-Sequence objects are defined by the Bio::PrimarySeqI interface, and this
-object is a pure Perl implementation of the interface. If that's
-gibberish to you, don't worry. The take home message is that this
-object is the bioperl default sequence object, but other people can
-use their own objects as sequences if they so wish. If you are
-interested in wrapping your own objects as compliant Bioperl sequence
-objects, then you should read the Bio::PrimarySeqI documentation
-
-The documentation of this object is a merge of the Bio::PrimarySeq and
-Bio::PrimarySeqI documentation. This allows all the methods which you can
-call on sequence objects here.
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via the
-web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR - Ewan Birney
-
-Email birney at ebi.ac.uk
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-
-package Bio::PrimarySeq;
-
-use strict;
-
-our $MATCHPATTERN = 'A-Za-z\-\.\*\?=~';
-our $GAP_SYMBOLS = '-~';
-
-use base qw(Bio::Root::Root Bio::PrimarySeqI
- Bio::IdentifiableI Bio::DescribableI);
-
-
-# Setup the allowed values for alphabet()
-my %valid_type = map {$_, 1} qw( dna rna protein );
-
-
-=head2 new
-
- Title : new
- Usage : $seqobj = Bio::PrimarySeq->new( -seq => 'ATGGGGGTGGTGGTACCCT',
- -id => 'human_id',
- -accession_number => 'AL000012',
- );
- Function: Returns a new primary seq object from
- basic constructors, being a string for the sequence
- and strings for id and accession_number.
-
- Note that you can provide an empty sequence string. However, in
- this case you MUST specify the type of sequence you wish to
- initialize by the parameter -alphabet. See alphabet() for possible
- values.
- Returns : a new Bio::PrimarySeq object
- Args : -seq => sequence string
- -ref_to_seq => ... or reference to a sequence string
- -display_id => display id of the sequence (locus name)
- -accession_number => accession number
- -primary_id => primary id (Genbank id)
- -version => version number
- -namespace => the namespace for the accession
- -authority => the authority for the namespace
- -description => description text
- -desc => alias for description
- -alphabet => skip alphabet guess and set it to dna, rna or protein
- -id => alias for display id
- -is_circular => boolean to indicate that sequence is circular
- -direct => boolean to directly set sequences. The next time -seq,
- seq() or -ref_to_seq is use, the sequence will not be
- validated. Be careful with this...
- -nowarnonempty => boolean to avoid warning when sequence is empty
-
-=cut
-
-sub new {
- my ($class, @args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($seq, $id, $acc, $pid, $ns, $auth, $v, $oid, $desc, $description,
- $alphabet, $given_id, $is_circular, $direct, $ref_to_seq, $len,
- $nowarnonempty) =
- $self->_rearrange([qw(SEQ
- DISPLAY_ID
- ACCESSION_NUMBER
- PRIMARY_ID
- NAMESPACE
- AUTHORITY
- VERSION
- OBJECT_ID
- DESC
- DESCRIPTION
- ALPHABET
- ID
- IS_CIRCULAR
- DIRECT
- REF_TO_SEQ
- LENGTH
- NOWARNONEMPTY
- )],
- @args);
-
- # Private var _nowarnonempty, needs to be set before calling _guess_alphabet
- $self->{'_nowarnonempty'} = $nowarnonempty;
- $self->{'_direct'} = $direct;
-
- if( defined $id && defined $given_id ) {
- if( $id ne $given_id ) {
- $self->throw("Provided both id and display_id constructors: [$id] [$given_id]");
- }
- }
- if( defined $given_id ) { $id = $given_id; }
-
- # Bernd's idea: set ids now for more informative invalid sequence messages
- defined $id && $self->display_id($id);
- $acc && $self->accession_number($acc);
- defined $pid && $self->primary_id($pid);
-
- # Set alphabet now to avoid guessing it later, when sequence is set
- $alphabet && $self->alphabet($alphabet);
-
- # Set the length before the seq. If there is a seq, length will be updated later
- $self->{'length'} = $len || 0;
-
- # Set the sequence (but also alphabet and length)
- if ($ref_to_seq) {
- $self->_set_seq_by_ref($ref_to_seq, $alphabet);
- } else {
- if (defined $seq) {
- # Note: the sequence string may be empty
- $self->seq($seq);
- }
- }
-
- $desc && $self->desc($desc);
- $description && $self->description($description);
- $is_circular && $self->is_circular($is_circular);
- $ns && $self->namespace($ns);
- $auth && $self->authority($auth);
- defined($v) && $self->version($v);
- defined($oid) && $self->object_id($oid);
-
- return $self;
-}
-
-
-=head2 seq
-
- Title : seq
- Usage : $string = $seqobj->seq();
- Function: Get or set the sequence as a string of letters. The case of
- the letters is left up to the implementer. Suggested cases are
- upper case for proteins and lower case for DNA sequence (IUPAC
- standard), but you should not rely on this. An error is thrown if
- the sequence contains invalid characters: see validate_seq().
- Returns : A scalar
- Args : - Optional new sequence value (a string) to set
- - Optional alphabet (it is guessed by default)
-
-=cut
-
-sub seq {
- my ($self, @args) = @_;
-
- if( scalar @args == 0 ) {
- return $self->{'seq'};
- }
-
- my ($seq_str, $alphabet) = @args;
- if (@args) {
- $self->_set_seq_by_ref(\$seq_str, $alphabet);
- }
-
- return $self->{'seq'};
-}
-
-
-sub _set_seq_by_ref {
- # Set a sequence by reference. A reference is used to avoid the cost of
- # copying the sequence (which can be very large) between functions.
- my ($self, $seq_str_ref, $alphabet) = @_;
-
- # Validate sequence if sequence is not empty and we are not in direct mode
- if ( (! $self->{'_direct'}) && (defined $$seq_str_ref) ) {
- $self->validate_seq($$seq_str_ref, 1);
- }
- delete $self->{'_direct'}; # next sequence will have to be validated
-
- # Record sequence length
- my $len = CORE::length($$seq_str_ref || '');
- my $is_changed_seq = (exists $self->{'seq'}) && ($len > 0);
- # Note: if the new seq is empty or undef, this is not considered a change
- delete $self->{'_freeze_length'} if $is_changed_seq;
- $self->{'length'} = $len if not exists $self->{'_freeze_length'};
-
- # Set sequence
- $self->{'seq'} = $$seq_str_ref;
-
- # Set or guess alphabet
- if ($alphabet) {
- # Alphabet specified, set it no matter what
- $self->alphabet($alphabet);
- } elsif ($is_changed_seq || (! defined($self->alphabet()))) {
- # If we changed a previous sequence to a new one or if there is no
- # alphabet yet at all, we need to guess the (possibly new) alphabet
- $self->_guess_alphabet();
- } # else (seq not changed and alphabet was defined) do nothing
-
- return 1;
-}
-
-
-=head2 validate_seq
-
- Title : validate_seq
- Usage : if(! $seqobj->validate_seq($seq_str) ) {
- print "sequence $seq_str is not valid for an object of
- alphabet ",$seqobj->alphabet, "\n";
- }
- Function: Test that the given sequence is valid, i.e. contains only valid
- characters. The allowed characters are all letters (A-Z) and '-','.',
- '*','?','=' and '~'. Spaces are not valid. Note that this
- implementation does not take alphabet() into account and that empty
- sequences are considered valid.
- Returns : 1 if the supplied sequence string is valid, 0 otherwise.
- Args : - Sequence string to be validated
- - Boolean to optionally throw an error if the sequence is invalid
-
-=cut
-
-sub validate_seq {
- my ($self, $seqstr, $throw) = @_;
- if ( (defined $seqstr ) &&
- ($seqstr !~ /^[$MATCHPATTERN]*$/) ) {
- if ($throw) {
- $self->throw("Failed validation of sequence '".(defined($self->id) ||
- '[unidentified sequence]')."'. Invalid characters were: " .
- join('',($seqstr =~ /[^$MATCHPATTERN]/g)));
- }
- return 0;
- }
- return 1;
-}
-
-
-=head2 subseq
-
- Title : subseq
- Usage : $substring = $seqobj->subseq(10,40);
- $substring = $seqobj->subseq(10,40,'nogap');
- $substring = $seqobj->subseq(-start=>10, -end=>40, -replace_with=>'tga');
- $substring = $seqobj->subseq($location_obj);
- $substring = $seqobj->subseq($location_obj, -nogap => 1);
- Function: Return the subseq from start to end, where the first sequence
- character has coordinate 1 number is inclusive, ie 1-2 are the
- first two characters of the sequence. The given start coordinate
- has to be larger than the end, even if the sequence is circular.
- Returns : a string
- Args : integer for start position
- integer for end position
- OR
- Bio::LocationI location for subseq (strand honored)
- Specify -NOGAP=>1 to return subseq with gap characters removed
- Specify -REPLACE_WITH=>$new_subseq to replace the subseq returned
- with $new_subseq in the sequence object
-
-=cut
-
-sub subseq {
- my $self = shift;
- my @args = @_;
- my ($start, $end, $nogap, $replace) = $self->_rearrange([qw(START
- END
- NOGAP
- REPLACE_WITH)], @args);
-
- # If -replace_with is specified, validate the replacement sequence
- if (defined $replace) {
- $self->validate_seq( $replace ) ||
- $self->throw("Replacement sequence does not look valid");
- }
-
- if( ref($start) && $start->isa('Bio::LocationI') ) {
- my $loc = $start;
- my $seq = '';
- foreach my $subloc ($loc->each_Location()) {
- my $piece = $self->subseq(-start => $subloc->start(),
- -end => $subloc->end(),
- -replace_with => $replace,
- -nogap => $nogap);
- $piece =~ s/[$GAP_SYMBOLS]//g if $nogap;
- if ($subloc->strand() < 0) {
- $piece = $self->_revcom_from_string($piece, $self->alphabet);
- }
- $seq .= $piece;
- }
- return $seq;
- } elsif( defined $start && defined $end ) {
- if( $start > $end ){
- $self->throw("Bad start,end parameters. Start [$start] has to be ".
- "less than end [$end]");
- }
- if( $start <= 0 ) {
- $self->throw("Bad start parameter ($start). Start must be positive.");
- }
-
- # Remove one from start, and then length is end-start
- $start--;
-
- my $seqstr;
- if (defined $replace) {
- $seqstr = substr $self->{seq}, $start, $end-$start, $replace;
- } else {
- $seqstr = substr $self->{seq}, $start, $end-$start;
- }
-
-
- if ($end > $self->length) {
- if ($self->is_circular) {
- my $start = 0;
- my $end = $end - $self->length;
-
- my $appendstr;
- if (defined $replace) {
- $appendstr = substr $self->{seq}, $start, $end-$start, $replace;
- } else {
- $appendstr = substr $self->{seq}, $start, $end-$start;
- }
-
- $seqstr .= $appendstr;
- } else {
- $self->throw("Bad end parameter ($end). End must be less than ".
- "the total length of sequence (total=".$self->length.")")
- }
- }
-
- $seqstr =~ s/[$GAP_SYMBOLS]//g if ($nogap);
- return $seqstr;
-
- } else {
- $self->warn("Incorrect parameters to subseq - must be two integers or ".
- "a Bio::LocationI object. Got:", $self,$start,$end,$replace,$nogap);
- return;
- }
-}
-
-
-=head2 length
-
- Title : length
- Usage : $len = $seqobj->length();
- Function: Get the stored length of the sequence in number of symbols (bases
- or amino acids).
-
- In some circumstances, you can also set this attribute:
- 1/ For empty sequences, you can set the length to anything you want:
- my $seqobj = Bio::PrimarySeq->new( -length => 123 );
- my $len = $seqobj->len; # 123
- 2/ To save memory when using very long sequences, you can set the
- length of the sequence to the length of the sequence (and nothing
- else):
- my $seqobj = Bio::PrimarySeq->new( -seq => 'ACGT...' ); # 1 Mbp sequence
- # process $seqobj... then after you're done with it
- $seqobj->length($seqobj->length);
- $seqobj->seq(undef); # free memory!
- my $len = $seqobj->len; # 1 Mbp
-
- Note that if you set seq() to a value other than undef at any time,
- the length attribute will be reset.
- Returns : integer representing the length of the sequence.
- Args : Optionally, the value on set
-
-=cut
-
-sub length {
- my ($self, $val) = @_;
- if (defined $val) {
- my $len = $self->{'length'};
- if ($len && ($len != $val)) {
- $self->throw("You're trying to lie about the length: ".
- "is $len but you say ".$val);
- }
- $self->{'length'} = $val;
- $self->{'_freeze_length'} = undef;
- }
- return $self->{'length'};
-}
-
-
-=head2 display_id
-
- Title : display_id or display_name
- Usage : $id_string = $seqobj->display_id();
- Function: Get or set the display id, aka the common name of the sequence object.
-
- The semantics of this is that it is the most likely string to
- be used as an identifier of the sequence, and likely to have
- "human" readability. The id is equivalent to the ID field of
- the GenBank/EMBL databanks and the id field of the
- Swissprot/sptrembl database. In fasta format, the >(\S+) is
- presumed to be the id, though some people overload the id to
- embed other information. Bioperl does not use any embedded
- information in the ID field, and people are encouraged to use
- other mechanisms (accession field for example, or extending
- the sequence object) to solve this.
-
- With the new Bio::DescribeableI interface, display_name aliases
- to this method.
- Returns : A string for the display ID
- Args : Optional string for the display ID to set
-
-=cut
-
-sub display_id {
- my ($self, $value) = @_;
- if( defined $value) {
- $self->{'display_id'} = $value;
- }
- return $self->{'display_id'};
-}
-
-
-=head2 accession_number
-
- Title : accession_number or object_id
- Usage : $unique_key = $seqobj->accession_number;
- Function: Returns the unique biological id for a sequence, commonly
- called the accession_number. For sequences from established
- databases, the implementors should try to use the correct
- accession number. Notice that primary_id() provides the
- unique id for the implemetation, allowing multiple objects
- to have the same accession number in a particular implementation.
-
- For sequences with no accession number, this method should
- return "unknown".
-
- [Note this method name is likely to change in 1.3]
-
- With the new Bio::IdentifiableI interface, this is aliased
- to object_id
- Returns : A string
- Args : A string (optional) for setting
-
-=cut
-
-sub accession_number {
- my( $self, $acc ) = @_;
- if (defined $acc) {
- $self->{'accession_number'} = $acc;
- } else {
- $acc = $self->{'accession_number'};
- $acc = 'unknown' unless defined $acc;
- }
- return $acc;
-}
-
-
-=head2 primary_id
-
- Title : primary_id
- Usage : $unique_key = $seqobj->primary_id;
- Function: Returns the unique id for this object in this
- implementation. This allows implementations to manage their
- own object ids in a way the implementaiton can control
- clients can expect one id to map to one object.
-
- For sequences with no natural primary id, this method
- should return a stringified memory location.
- Returns : A string
- Args : A string (optional, for setting)
-
-=cut
-
-sub primary_id {
- my $self = shift;
-
- if(@_) {
- $self->{'primary_id'} = shift;
- }
- if( ! defined($self->{'primary_id'}) ) {
- return "$self";
- }
- return $self->{'primary_id'};
-}
-
-
-=head2 alphabet
-
- Title : alphabet
- Usage : if( $seqobj->alphabet eq 'dna' ) { # Do something }
- Function: Get/set the alphabet of sequence, one of
- 'dna', 'rna' or 'protein'. This is case sensitive.
-
- This is not called <type> because this would cause
- upgrade problems from the 0.5 and earlier Seq objects.
- Returns : a string either 'dna','rna','protein'. NB - the object must
- make a call of the type - if there is no alphabet specified it
- has to guess.
- Args : optional string to set : 'dna' | 'rna' | 'protein'
-
-
-=cut
-
-sub alphabet {
- my ($self,$value) = @_;
- if (defined $value) {
- $value = lc $value;
- unless ( $valid_type{$value} ) {
- $self->throw("Alphabet '$value' is not a valid alphabet (".
- join(',', map "'$_'", sort keys %valid_type) .") lowercase");
- }
- $self->{'alphabet'} = $value;
- }
- return $self->{'alphabet'};
-}
-
-
-=head2 desc
-
- Title : desc or description
- Usage : $seqobj->desc($newval);
- Function: Get/set description of the sequence.
-
- 'description' is an alias for this for compliance with the
- Bio::DescribeableI interface.
- Returns : value of desc (a string)
- Args : newvalue (a string or undef, optional)
-
-
-=cut
-
-sub desc{
- my $self = shift;
-
- return $self->{'desc'} = shift if @_;
- return $self->{'desc'};
-}
-
-
-=head2 can_call_new
-
- Title : can_call_new
- Usage :
- Function:
- Example :
- Returns : true
- Args :
-
-=cut
-
-sub can_call_new {
- my ($self) = @_;
-
- return 1;
-}
-
-
-=head2 id
-
- Title : id
- Usage : $id = $seqobj->id();
- Function: This is mapped on display_id
- Example :
- Returns :
- Args :
-
-=cut
-
-sub id {
- return shift->display_id(@_);
-}
-
-
-=head2 is_circular
-
- Title : is_circular
- Usage : if( $seqobj->is_circular) { # Do something }
- Function: Returns true if the molecule is circular
- Returns : Boolean value
- Args : none
-
-=cut
-
-sub is_circular{
- my $self = shift;
- return $self->{'is_circular'} = shift if @_;
- return $self->{'is_circular'};
-}
-
-
-=head1 Methods for Bio::IdentifiableI compliance
-
-=head2 object_id
-
- Title : object_id
- Usage : $string = $seqobj->object_id();
- Function: Get or set a string which represents the stable primary identifier
- in this namespace of this object. For DNA sequences this
- is its accession_number, similarly for protein sequences.
-
- This is aliased to accession_number().
- Returns : A scalar
- Args : Optional object ID to set.
-
-=cut
-
-sub object_id {
- return shift->accession_number(@_);
-}
-
-
-=head2 version
-
- Title : version
- Usage : $version = $seqobj->version();
- Function: Get or set a number which differentiates between versions of
- the same object. Higher numbers are considered to be
- later and more relevant, but a single object described
- the same identifier should represent the same concept.
- Returns : A number
- Args : Optional version to set.
-
-=cut
-
-sub version{
- my ($self,$value) = @_;
- if( defined $value) {
- $self->{'_version'} = $value;
- }
- return $self->{'_version'};
-}
-
-
-=head2 authority
-
- Title : authority
- Usage : $authority = $seqobj->authority();
- Function: Get or set a string which represents the organisation which
- granted the namespace, written as the DNS name of the
- organisation (eg, wormbase.org).
- Returns : A scalar
- Args : Optional authority to set.
-
-=cut
-
-sub authority {
- my ($self, $value) = @_;
- if( defined $value) {
- $self->{'authority'} = $value;
- }
- return $self->{'authority'};
-}
-
-
-=head2 namespace
-
- Title : namespace
- Usage : $string = $seqobj->namespace();
- Function: Get or set a string representing the name space this identifier
- is valid in, often the database name or the name describing the
- collection.
- Returns : A scalar
- Args : Optional namespace to set.
-
-=cut
-
-sub namespace{
- my ($self,$value) = @_;
- if( defined $value) {
- $self->{'namespace'} = $value;
- }
- return $self->{'namespace'} || "";
-}
-
-
-=head1 Methods for Bio::DescribableI compliance
-
-This comprises of display_name and description.
-
-=head2 display_name
-
- Title : display_name
- Usage : $string = $seqobj->display_name();
- Function: Get or set a string which is what should be displayed to the user.
- The string should have no spaces (ideally, though a cautious
- user of this interface would not assumme this) and should be
- less than thirty characters (though again, double checking
- this is a good idea).
-
- This is aliased to display_id().
- Returns : A string for the display name
- Args : Optional string for the display name to set.
-
-=cut
-
-sub display_name {
- return shift->display_id(@_);
-}
-
-
-=head2 description
-
- Title : description
- Usage : $string = $seqobj->description();
- Function: Get or set a text string suitable for displaying to the user a
- description. This string is likely to have spaces, but
- should not have any newlines or formatting - just plain
- text. The string should not be greater than 255 characters
- and clients can feel justified at truncating strings at 255
- characters for the purposes of display.
-
- This is aliased to desc().
- Returns : A string for the description
- Args : Optional string for the description to set.
-
-=cut
-
-sub description {
- return shift->desc(@_);
-}
-
-
-=head1 Methods Inherited from Bio::PrimarySeqI
-
-These methods are available on Bio::PrimarySeq, although they are
-actually implemented on Bio::PrimarySeqI
-
-=head2 revcom
-
- Title : revcom
- Usage : $rev = $seqobj->revcom();
- Function: Produces a new Bio::SeqI implementing object which
- is the reversed complement of the sequence. For protein
- sequences this throws an exception of
- "Sequence is a protein. Cannot revcom".
-
- The id is the same id as the orginal sequence, and the
- accession number is also indentical. If someone wants to
- track that this sequence has be reversed, it needs to
- define its own extensions.
-
- To do an inplace edit of an object you can go:
-
- $seqobj = $seqobj->revcom();
-
- This of course, causes Perl to handle the garbage
- collection of the old object, but it is roughly speaking as
- efficient as an inplace edit.
- Returns : A new (fresh) Bio::SeqI object
- Args : none
-
-=head2 trunc
-
- Title : trunc
- Usage : $subseq = $myseq->trunc(10,100);
- Function: Provides a truncation of a sequence,
- Returns : A fresh Bio::SeqI implementing object.
- Args : Numbers for the start and end positions
-
-=head1 Internal methods
-
-These are internal methods to PrimarySeq
-
-=head2 _guess_alphabet
-
- Title : _guess_alphabet
- Usage :
- Function: Automatically guess and set the type of sequence: dna, rna, protein
- or '' if the sequence was empty. This method first removes dots (.),
- dashes (-) and question marks (?) before guessing the alphabet
- using the IUPAC conventions for ambiguous residues. Since the DNA and
- RNA characters are also valid characters for proteins, there is
- no foolproof way of determining the right alphabet. This is our best
- guess only!
- Returns : string 'dna', 'rna', 'protein' or ''.
- Args : none
-
-=cut
-
-sub _guess_alphabet {
- my ($self) = @_;
- # Guess alphabet
- my $alphabet = $self->_guess_alphabet_from_string($self->seq, $self->{'_nowarnonempty'});
- # Set alphabet unless it is unknown
- $self->alphabet($alphabet) if $alphabet;
- return $alphabet;
-}
-
-
-sub _guess_alphabet_from_string {
- # Get the alphabet from a sequence string
- my ($self, $str, $nowarnonempty) = @_;
-
- $nowarnonempty = 0 if not defined $nowarnonempty;
-
- # Remove chars that clearly don't denote nucleic or amino acids
- $str =~ s/[-.?]//gi;
-
- # Check for sequences without valid letters
- my $alphabet;
- my $total = CORE::length($str);
- if( $total == 0 ) {
- if (not $nowarnonempty) {
- $self->warn("Got a sequence without letters. Could not guess alphabet");
- }
- $alphabet = '';
- }
-
- # Determine alphabet now
- if (not defined $alphabet) {
- if ($str =~ m/[EFIJLOPQXZ]/i) {
- # Start with a safe method to find proteins.
- # Unambiguous IUPAC letters for proteins are: E,F,I,J,L,O,P,Q,X,Z
- $alphabet = 'protein';
- } else {
- # Alphabet is unsure, could still be DNA, RNA or protein
- # DNA and RNA contain mostly A, T, U, G, C and N, but the other
- # letters they use are also among the 15 valid letters that a
- # protein sequence can contain at this stage. Make our best guess
- # based on sequence composition. If it contains over 70% of ACGTUN,
- # it is likely nucleic.
- if( ($str =~ tr/ATUGCNatugcn//) / $total > 0.7 ) {
- if ( $str =~ m/U/i ) {
- $alphabet = 'rna';
- } else {
- $alphabet = 'dna';
- }
- } else {
- $alphabet = 'protein';
- }
- }
- }
-
- return $alphabet;
-}
-
-
-############################################################################
-# aliases due to name changes or to compensate for our lack of consistency #
-############################################################################
-
-sub accession {
- my $self = shift;
-
- $self->warn(ref($self)."::accession is deprecated, ".
- "use accession_number() instead");
- return $self->accession_number(@_);
-}
-
-1;
-
diff --git a/lib/Bio/PrimarySeqI.pm b/lib/Bio/PrimarySeqI.pm
deleted file mode 100644
index ce27ae1..0000000
--- a/lib/Bio/PrimarySeqI.pm
+++ /dev/null
@@ -1,944 +0,0 @@
-#
-# BioPerl module for Bio::PrimarySeqI
-#
-# Please direct questions and support issues to <bioperl-l at bioperl.org>
-#
-# Cared for by Ewan Birney <birney at ebi.ac.uk>
-#
-# Copyright Ewan Birney
-#
-# You may distribute this module under the same terms as perl itself
-
-# POD documentation - main docs before the code
-
-
-=head1 NAME
-
-Bio::PrimarySeqI - Interface definition for a Bio::PrimarySeq
-
-=head1 SYNOPSIS
-
- # Bio::PrimarySeqI is the interface class for sequences.
- # If you are a newcomer to bioperl, you might want to start with
- # Bio::Seq documentation.
-
- # Test if this is a seq object
- $obj->isa("Bio::PrimarySeqI") ||
- $obj->throw("$obj does not implement the Bio::PrimarySeqI interface");
-
- # Accessors
- $string = $obj->seq();
- $substring = $obj->subseq(12,50);
- $display = $obj->display_id(); # for human display
- $id = $obj->primary_id(); # unique id for this object,
- # implementation defined
- $unique_key= $obj->accession_number(); # unique biological id
-
-
- # Object manipulation
- eval {
- $rev = $obj->revcom();
- };
- if( $@ ) {
- $obj->throw( "Could not reverse complement. ".
- "Probably not DNA. Actual exception\n$@\n" );
- }
-
- $trunc = $obj->trunc(12,50);
- # $rev and $trunc are Bio::PrimarySeqI compliant objects
-
-
-=head1 DESCRIPTION
-
-This object defines an abstract interface to basic sequence
-information - for most users of the package the documentation (and
-methods) in this class are not useful - this is a developers-only
-class which defines what methods have to be implmented by other Perl
-objects to comply to the Bio::PrimarySeqI interface. Go "perldoc
-Bio::Seq" or "man Bio::Seq" for more information on the main class for
-sequences.
-
-PrimarySeq is an object just for the sequence and its name(s), nothing
-more. Seq is the larger object complete with features. There is a pure
-perl implementation of this in L<Bio::PrimarySeq>. If you just want to
-use L<Bio::PrimarySeq> objects, then please read that module first. This
-module defines the interface, and is of more interest to people who
-want to wrap their own Perl Objects/RDBs/FileSystems etc in way that
-they "are" bioperl sequence objects, even though it is not using Perl
-to store the sequence etc.
-
-This interface defines what bioperl considers necessary to "be" a
-sequence, without providing an implementation of this, an
-implementation is provided in L<Bio::PrimarySeq>. If you want to provide
-a Bio::PrimarySeq-compliant object which in fact wraps another
-object/database/out-of-perl experience, then this is the correct thing
-to wrap, generally by providing a wrapper class which would inherit
-from your object and this Bio::PrimarySeqI interface. The wrapper class
-then would have methods lists in the "Implementation Specific
-Functions" which would provide these methods for your object.
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via the
-web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR - Ewan Birney
-
-Email birney at ebi.ac.uk
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-package Bio::PrimarySeqI;
-use strict;
-use Bio::Tools::CodonTable;
-
-use base qw(Bio::Root::RootI);
-
-
-=head1 Implementation-specific Functions
-
-These functions are the ones that a specific implementation must
-define.
-
-=head2 seq
-
- Title : seq
- Usage : $string = $obj->seq()
- Function: Returns the sequence as a string of letters. The
- case of the letters is left up to the implementer.
- Suggested cases are upper case for proteins and lower case for
- DNA sequence (IUPAC standard), but implementations are suggested to
- keep an open mind about case (some users... want mixed case!)
- Returns : A scalar
- Status : Virtual
-
-=cut
-
-sub seq {
- my ($self) = @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 subseq
-
- Title : subseq
- Usage : $substring = $obj->subseq(10,40);
- Function: Returns the subseq from start to end, where the first base
- is 1 and the number is inclusive, i.e. 1-2 are the first two
- bases of the sequence.
-
- Start cannot be larger than end but can be equal.
-
- Returns : A string
- Args :
- Status : Virtual
-
-=cut
-
-sub subseq {
- my ($self) = @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 display_id
-
- Title : display_id
- Usage : $id_string = $obj->display_id();
- Function: Returns the display id, also known as the common name of the Sequence
- object.
-
- The semantics of this is that it is the most likely string
- to be used as an identifier of the sequence, and likely to
- have "human" readability. The id is equivalent to the ID
- field of the GenBank/EMBL databanks and the id field of the
- Swissprot/sptrembl database. In fasta format, the >(\S+) is
- presumed to be the id, though some people overload the id
- to embed other information. Bioperl does not use any
- embedded information in the ID field, and people are
- encouraged to use other mechanisms (accession field for
- example, or extending the sequence object) to solve this.
-
- Notice that $seq->id() maps to this function, mainly for
- legacy/convenience reasons.
- Returns : A string
- Args : None
- Status : Virtual
-
-=cut
-
-sub display_id {
- my ($self) = @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 accession_number
-
- Title : accession_number
- Usage : $unique_biological_key = $obj->accession_number;
- Function: Returns the unique biological id for a sequence, commonly
- called the accession_number. For sequences from established
- databases, the implementors should try to use the correct
- accession number. Notice that primary_id() provides the
- unique id for the implemetation, allowing multiple objects
- to have the same accession number in a particular implementation.
-
- For sequences with no accession number, this method should return
- "unknown".
- Returns : A string
- Args : None
- Status : Virtual
-
-=cut
-
-sub accession_number {
- my ($self,@args) = @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 primary_id
-
- Title : primary_id
- Usage : $unique_implementation_key = $obj->primary_id;
- Function: Returns the unique id for this object in this
- implementation. This allows implementations to manage their
- own object ids in a way the implementaiton can control
- clients can expect one id to map to one object.
-
- For sequences with no accession number, this method should
- return a stringified memory location.
-
- Returns : A string
- Args : None
- Status : Virtual
-
-=cut
-
-sub primary_id {
- my ($self,@args) = @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 can_call_new
-
- Title : can_call_new
- Usage : if( $obj->can_call_new ) {
- $newobj = $obj->new( %param );
- }
- Function: Can_call_new returns 1 or 0 depending
- on whether an implementation allows new
- constructor to be called. If a new constructor
- is allowed, then it should take the followed hashed
- constructor list.
-
- $myobject->new( -seq => $sequence_as_string,
- -display_id => $id
- -accession_number => $accession
- -alphabet => 'dna',
- );
- Returns : 1 or 0
- Args :
-
-
-=cut
-
-sub can_call_new {
- my ($self,@args) = @_;
- # we default to 0 here
- return 0;
-}
-
-
-=head2 alphabet
-
- Title : alphabet
- Usage : if( $obj->alphabet eq 'dna' ) { /Do Something/ }
- Function: Returns the type of sequence being one of
- 'dna', 'rna' or 'protein'. This is case sensitive.
-
- This is not called "type" because this would cause
- upgrade problems from the 0.5 and earlier Seq objects.
-
- Returns : A string either 'dna','rna','protein'. NB - the object must
- make a call of the alphabet, if there is no alphabet specified it
- has to guess.
- Args : None
- Status : Virtual
-
-=cut
-
-sub alphabet {
- my ( $self ) = @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 moltype
-
- Title : moltype
- Usage : Deprecated. Use alphabet() instead.
-
-=cut
-
-sub moltype {
- my ($self,@args) = @_;
- $self->warn("moltype: pre v1.0 method. Calling alphabet() instead...");
- return $self->alphabet(@args);
-}
-
-
-=head1 Implementation-optional Functions
-
-The following functions rely on the above functions. An
-implementing class does not need to provide these functions, as they
-will be provided by this class, but is free to override these
-functions.
-
-The revcom(), trunc(), and translate() methods create new sequence
-objects. They will call new() on the class of the sequence object
-instance passed as argument, unless can_call_new() returns FALSE. In
-the latter case a Bio::PrimarySeq object will be created. Implementors
-which really want to control how objects are created (eg, for object
-persistence over a database, or objects in a CORBA framework), they
-are encouraged to override these methods
-
-=head2 revcom
-
- Title : revcom
- Usage : $rev = $seq->revcom()
- Function: Produces a new Bio::PrimarySeqI implementing object which
- is the reversed complement of the sequence. For protein
- sequences this throws an exception of "Sequence is a
- protein. Cannot revcom".
-
- The id is the same id as the original sequence, and the
- accession number is also indentical. If someone wants to
- track that this sequence has be reversed, it needs to
- define its own extensionsj.
-
- To do an inplace edit of an object you can go:
-
- $seq = $seq->revcom();
-
- This of course, causes Perl to handle the garbage
- collection of the old object, but it is roughly speaking as
- efficient as an inplace edit.
-
- Returns : A new (fresh) Bio::PrimarySeqI object
- Args : None
-
-
-=cut
-
-sub revcom {
- my ($self) = @_;
- my ($seqclass, $opts) = $self->_setup_class;
- my $out = $seqclass->new(
- -seq => $self->_revcom_from_string($self->seq, $self->alphabet),
- -is_circular => $self->is_circular,
- -display_id => $self->display_id,
- -accession_number => $self->accession_number,
- -alphabet => $self->alphabet,
- -desc => $self->desc,
- -verbose => $self->verbose,
- %$opts,
- );
- return $out;
-}
-
-
-sub _revcom_from_string {
- my ($self, $string, $alphabet) = @_;
-
- # Check that reverse-complementing makes sense
- if( $alphabet eq 'protein' ) {
- $self->throw("Sequence is a protein. Cannot revcom.");
- }
- if( $alphabet ne 'dna' && $alphabet ne 'rna' ) {
- my $msg = "Sequence is not dna or rna, but [$alphabet]. Attempting to revcom, ".
- "but unsure if this is right.";
- if( $self->can('warn') ) {
- $self->warn($msg);
- } else {
- warn("[$self] $msg");
- }
- }
-
- # If sequence is RNA, map to DNA (then map back later)
- if( $alphabet eq 'rna' ) {
- $string =~ tr/uU/tT/;
- }
-
- # Reverse-complement now
- $string =~ tr/acgtrymkswhbvdnxACGTRYMKSWHBVDNX/tgcayrkmswdvbhnxTGCAYRKMSWDVBHNX/;
- $string = CORE::reverse $string;
-
- # Map back RNA to DNA
- if( $alphabet eq 'rna' ) {
- $string =~ tr/tT/uU/;
- }
-
- return $string;
-}
-
-
-=head2 trunc
-
- Title : trunc
- Usage : $subseq = $myseq->trunc(10,100);
- Function: Provides a truncation of a sequence.
- Returns : A fresh Bio::PrimarySeqI implementing object.
- Args : Two integers denoting first and last base of the sub-sequence.
-
-
-=cut
-
-sub trunc {
- my ($self,$start,$end) = @_;
-
- my $str;
- if( defined $start && ref($start) &&
- $start->isa('Bio::LocationI') ) {
- $str = $self->subseq($start); # start is a location actually
- } elsif( !$end ) {
- $self->throw("trunc start,end -- there was no end for $start");
- } elsif( $end < $start ) {
- my $msg = "start [$start] is greater than end [$end]. \n".
- "If you want to truncated and reverse complement, \n".
- "you must call trunc followed by revcom. Sorry.";
- $self->throw($msg);
- } else {
- $str = $self->subseq($start,$end);
- }
-
- my ($seqclass, $opts) = $self->_setup_class;
- my $out = $seqclass->new(
- -seq => $str,
- -display_id => $self->display_id,
- -accession_number => $self->accession_number,
- -alphabet => $self->alphabet,
- -desc => $self->desc,
- -verbose => $self->verbose,
- %$opts,
- );
- return $out;
-}
-
-
-=head2 translate
-
- Title : translate
- Usage : $protein_seq_obj = $dna_seq_obj->translate
-
- Or if you expect a complete coding sequence (CDS) translation,
- with initiator at the beginning and terminator at the end:
-
- $protein_seq_obj = $cds_seq_obj->translate(-complete => 1);
-
- Or if you want translate() to find the first initiation
- codon and return the corresponding protein:
-
- $protein_seq_obj = $cds_seq_obj->translate(-orf => 1);
-
- Function: Provides the translation of the DNA sequence using full
- IUPAC ambiguities in DNA/RNA and amino acid codes.
-
- The complete CDS translation is identical to EMBL/TREMBL
- database translation. Note that the trailing terminator
- character is removed before returning the translated protein
- object.
-
- Note: if you set $dna_seq_obj->verbose(1) you will get a
- warning if the first codon is not a valid initiator.
-
- Returns : A Bio::PrimarySeqI implementing object
- Args : -terminator
- character for terminator, default '*'
- -unknown
- character for unknown, default 'X'
- -frame
- positive integer frame shift (in bases), default 0
- -codontable_id
- integer codon table id, default 1
- -complete
- boolean, if true, complete CDS is expected. default false
- -complete_codons
- boolean, if true, codons which are incomplete are translated if a
- suitable amino acid is found. For instance, if the incomplete
- codon is 'GG', the completed codon is 'GGN', which is glycine
- (G). Defaults to 'false'; setting '-complete' also makes this
- true.
- -throw
- boolean, throw exception if ORF not complete, default false
- -orf
- if 'longest', find longest ORF. other true value, find
- first ORF. default 0
- -codontable
- optional L<Bio::Tools::CodonTable> object to use for
- translation
- -start
- optional three-character string to force as initiation
- codon (e.g. 'atg'). If unset, start codons are
- determined by the CodonTable. Case insensitive.
- -offset
- optional positive integer offset for fuzzy locations.
- if set, must be either 1, 2, or 3
-
-=head3 Notes
-
-The -start argument only applies when -orf is set to 1. By default all
-initiation codons found in the given codon table are used but when
-"start" is set to some codon this codon will be used exclusively as
-the initiation codon. Note that the default codon table (NCBI
-"Standard") has 3 initiation codons!
-
-By default translate() translates termination codons to the some
-character (default is *), both internal and trailing codons. Setting
-"-complete" to 1 tells translate() to remove the trailing character.
-
--offset is used for seqfeatures which contain the the \codon_start tag
-and can be set to 1, 2, or 3. This is the offset by which the
-sequence translation starts relative to the first base of the feature
-
-For details on codon tables used by translate() see L<Bio::Tools::CodonTable>.
-
-Deprecated argument set (v. 1.5.1 and prior versions) where each argument is an
-element in an array:
-
- 1: character for terminator (optional), defaults to '*'.
- 2: character for unknown amino acid (optional), defaults to 'X'.
- 3: frame (optional), valid values are 0, 1, 2, defaults to 0.
- 4: codon table id (optional), defaults to 1.
- 5: complete coding sequence expected, defaults to 0 (false).
- 6: boolean, throw exception if not complete coding sequence
- (true), defaults to warning (false)
- 7: codontable, a custom Bio::Tools::CodonTable object (optional).
-
-=cut
-
-sub translate {
- my ($self,@args) = @_;
- my ($terminator, $unknown, $frame, $codonTableId, $complete,
- $complete_codons, $throw, $codonTable, $orf, $start_codon, $offset);
-
- ## new API with named parameters, post 1.5.1
- if ($args[0] && $args[0] =~ /^-[A-Z]+/i) {
- ($terminator, $unknown, $frame, $codonTableId, $complete,
- $complete_codons, $throw,$codonTable, $orf, $start_codon, $offset) =
- $self->_rearrange([qw(TERMINATOR
- UNKNOWN
- FRAME
- CODONTABLE_ID
- COMPLETE
- COMPLETE_CODONS
- THROW
- CODONTABLE
- ORF
- START
- OFFSET)], @args);
- ## old API, 1.5.1 and preceding versions
- } else {
- ($terminator, $unknown, $frame, $codonTableId,
- $complete, $throw, $codonTable, $offset) = @args;
- }
-
- ## Initialize termination codon, unknown codon, codon table id, frame
- $terminator = '*' unless (defined($terminator) and $terminator ne '');
- $unknown = "X" unless (defined($unknown) and $unknown ne '');
- $frame = 0 unless (defined($frame) and $frame ne '');
- $codonTableId = 1 unless (defined($codonTableId) and $codonTableId ne '');
- $complete_codons ||= $complete || 0;
-
- ## Get a CodonTable, error if custom CodonTable is invalid
- if ($codonTable) {
- $self->throw("Need a Bio::Tools::CodonTable object, not ". $codonTable)
- unless $codonTable->isa('Bio::Tools::CodonTable');
- } else {
-
- # shouldn't this be cached? Seems wasteful to have a new instance
- # every time...
- $codonTable = Bio::Tools::CodonTable->new( -id => $codonTableId);
- }
-
- ## Error if alphabet is "protein"
- $self->throw("Can't translate an amino acid sequence.") if
- ($self->alphabet =~ /protein/i);
-
- ## Error if -start parameter isn't a valid codon
- if ($start_codon) {
- $self->throw("Invalid start codon: $start_codon.") if
- ( $start_codon !~ /^[A-Z]{3}$/i );
- }
-
- my $seq;
- if ($offset) {
- $self->throw("Offset must be 1, 2, or 3.") if
- ( $offset !~ /^[123]$/ );
- my ($start, $end) = ($offset, $self->length);
- ($seq) = $self->subseq($start, $end);
- } else {
- ($seq) = $self->seq();
- }
-
- ## ignore frame if an ORF is supposed to be found
- if ( $orf ) {
- my ($orf_region) = $self->_find_orfs_nucleotide( $seq, $codonTable, $start_codon, $orf eq 'longest' ? 0 : 'first_only' );
- $seq = $self->_orf_sequence( $seq, $orf_region );
- } else {
- ## use frame, error if frame is not 0, 1 or 2
- $self->throw("Valid values for frame are 0, 1, or 2, not $frame.")
- unless ($frame == 0 or $frame == 1 or $frame == 2);
- $seq = substr($seq,$frame);
- }
-
- ## Translate it
- my $output = $codonTable->translate($seq, $complete_codons);
- # Use user-input terminator/unknown
- $output =~ s/\*/$terminator/g;
- $output =~ s/X/$unknown/g;
-
- ## Only if we are expecting to translate a complete coding region
- if ($complete) {
- my $id = $self->display_id;
- # remove the terminator character
- if( substr($output,-1,1) eq $terminator ) {
- chop $output;
- } else {
- $throw && $self->throw("Seq [$id]: Not using a valid terminator codon!");
- $self->warn("Seq [$id]: Not using a valid terminator codon!");
- }
- # test if there are terminator characters inside the protein sequence!
- if ($output =~ /\Q$terminator\E/) {
- $id ||= '';
- $throw && $self->throw("Seq [$id]: Terminator codon inside CDS!");
- $self->warn("Seq [$id]: Terminator codon inside CDS!");
- }
- # if the initiator codon is not ATG, the amino acid needs to be changed to M
- if ( substr($output,0,1) ne 'M' ) {
- if ($codonTable->is_start_codon(substr($seq, 0, 3)) ) {
- $output = 'M'. substr($output,1);
- } elsif ($throw) {
- $self->throw("Seq [$id]: Not using a valid initiator codon!");
- } else {
- $self->warn("Seq [$id]: Not using a valid initiator codon!");
- }
- }
- }
-
- my ($seqclass, $opts) = $self->_setup_class;
- my $out = $seqclass->new(
- -seq => $output,
- -display_id => $self->display_id,
- -accession_number => $self->accession_number,
- # is there anything wrong with retaining the desc?
- -desc => $self->desc,
- -alphabet => 'protein',
- -verbose => $self->verbose,
- %$opts,
- );
- return $out;
-}
-
-
-=head2 transcribe()
-
- Title : transcribe
- Usage : $xseq = $seq->transcribe
- Function: Convert base T to base U
- Returns : PrimarySeqI object of alphabet 'rna' or
- undef if $seq->alphabet ne 'dna'
- Args :
-
-=cut
-
-sub transcribe {
- my $self = shift;
- return unless $self->alphabet eq 'dna';
- my $s = $self->seq;
- $s =~ tr/tT/uU/;
- my $desc = $self->desc || '';
- my ($seqclass, $opts) = $self->_setup_class;
- return $seqclass->new(
- -seq => $s,
- -alphabet => 'rna',
- -display_id => $self->display_id,
- -accession_number => $self->accession_number,
- -desc => "${desc}[TRANSCRIBED]",
- -verbose => $self->verbose,
- %$opts,
- );
-}
-
-
-=head2 rev_transcribe()
-
- Title : rev_transcribe
- Usage : $rtseq = $seq->rev_transcribe
- Function: Convert base U to base T
- Returns : PrimarySeqI object of alphabet 'dna' or
- undef if $seq->alphabet ne 'rna'
- Args :
-
-=cut
-
-sub rev_transcribe {
- my $self = shift;
- return unless $self->alphabet eq 'rna';
- my $s = $self->seq;
- $s =~ tr/uU/tT/;
- my ($seqclass, $opts) = $self->_setup_class;
- return $seqclass->new(
- -seq => $s,
- -alphabet => 'dna',
- -display_id => $self->display_id,
- -accession_number => $self->accession_number,
- -desc => $self->desc . "[REVERSE TRANSCRIBED]",
- -verbose => $self->verbose,
- %$opts,
- );
-}
-
-
-=head2 id
-
- Title : id
- Usage : $id = $seq->id()
- Function: ID of the sequence. This should normally be (and actually is in
- the implementation provided here) just a synonym for display_id().
- Returns : A string.
- Args :
-
-=cut
-
-sub id {
- my ($self)= @_;
- return $self->display_id();
-}
-
-
-=head2 length
-
- Title : length
- Usage : $len = $seq->length()
- Function:
- Returns : Integer representing the length of the sequence.
- Args :
-
-=cut
-
-sub length {
- my ($self)= @_;
- $self->throw_not_implemented();
-}
-
-
-=head2 desc
-
- Title : desc
- Usage : $seq->desc($newval);
- $description = $seq->desc();
- Function: Get/set description text for a seq object
- Returns : Value of desc
- Args : newvalue (optional)
-
-=cut
-
-sub desc {
- shift->throw_not_implemented();
-}
-
-
-=head2 is_circular
-
- Title : is_circular
- Usage : if( $obj->is_circular) { # Do something }
- Function: Returns true if the molecule is circular
- Returns : Boolean value
- Args : none
-
-=cut
-
-sub is_circular {
- shift->throw_not_implemented;
-}
-
-
-=head1 Private functions
-
-These are some private functions for the PrimarySeqI interface. You do not
-need to implement these functions
-
-=head2 _find_orfs_nucleotide
-
- Title : _find_orfs_nucleotide
- Usage :
- Function: Finds ORF starting at 1st initiation codon in nucleotide sequence.
- The ORF is not required to have a termination codon.
- Example :
- Returns : a list of string coordinates of ORF locations (0-based half-open),
- sorted descending by length (so that the longest is first)
- as: [ start, end, frame, length ], [ start, end, frame, length ], ...
- Args : Nucleotide sequence,
- CodonTable object,
- (optional) alternative initiation codon (e.g. 'ATA'),
- (optional) boolean that, if true, stops after finding the
- first available ORF
-
-=cut
-
-sub _find_orfs_nucleotide {
- my ( $self, $sequence, $codon_table, $start_codon, $first_only ) = @_;
- $sequence = uc $sequence;
- $start_codon = uc $start_codon if $start_codon;
-
- my $is_start = $start_codon
- ? sub { shift eq $start_codon }
- : sub { $codon_table->is_start_codon( shift ) };
-
- # stores the begin index of the currently-running ORF in each
- # reading frame
- my @current_orf_start = (-1,-1,-1);
-
- #< stores coordinates of longest observed orf (so far) in each
- # reading frame
- my @orfs;
-
- # go through each base of the sequence, and each reading frame for each base
- my $seqlen = CORE::length $sequence;
- for( my $j = 0; $j <= $seqlen-3; $j++ ) {
- my $frame = $j % 3;
-
- my $this_codon = substr( $sequence, $j, 3 );
-
- # if in an orf and this is either a stop codon or the last in-frame codon in the string
- if ( $current_orf_start[$frame] >= 0 ) {
- if ( $codon_table->is_ter_codon( $this_codon ) ||( my $is_last_codon_in_frame = ($j >= $seqlen-5)) ) {
- # record ORF start, end (half-open), length, and frame
- my @this_orf = ( $current_orf_start[$frame], $j+3, undef, $frame );
- my $this_orf_length = $this_orf[2] = ( $this_orf[1] - $this_orf[0] );
-
- $self->warn( "Translating partial ORF "
- .$self->_truncate_seq( $self->_orf_sequence( $sequence, \@this_orf ))
- .' from end of nucleotide sequence'
- )
- if $first_only && $is_last_codon_in_frame;
-
- return \@this_orf if $first_only;
- push @orfs, \@this_orf;
- $current_orf_start[$frame] = -1;
- }
- }
- # if this is a start codon
- elsif ( $is_start->($this_codon) ) {
- $current_orf_start[$frame] = $j;
- }
- }
-
- return sort { $b->[2] <=> $a->[2] } @orfs;
-}
-
-
-sub _truncate_seq {
- my ($self, $seq) = @_;
- return CORE::length($seq) > 200 ? substr($seq,0,50).'...'.substr($seq,-50) : $seq;
-}
-
-
-sub _orf_sequence {
- my ($self, $seq, $orf ) = @_;
- return '' unless $orf;
- return substr( $seq, $orf->[0], $orf->[2] )
-}
-
-
-=head2 _attempt_to_load_Seq
-
- Title : _attempt_to_load_Seq
- Usage :
- Function:
- Example :
- Returns :
- Args :
-
-=cut
-
-sub _attempt_to_load_Seq {
- my ($self) = @_;
-
- if( $main::{'Bio::PrimarySeq'} ) {
- return 1;
- } else {
- eval {
- require Bio::PrimarySeq;
- };
- if( $@ ) {
- my $text = "Bio::PrimarySeq could not be loaded for [$self]\n".
- "This indicates that you are using Bio::PrimarySeqI ".
- "without Bio::PrimarySeq loaded or without providing a ".
- "complete implementation.\nThe most likely problem is that there ".
- "has been a misconfiguration of the bioperl environment\n".
- "Actual exception:\n\n";
- $self->throw("$text$@\n");
- return 0;
- }
- return 1;
- }
-}
-
-
-sub _setup_class {
- # Return name of class and setup some default parameters
- my ($self) = @_;
- my $seqclass;
- if ($self->can_call_new()) {
- $seqclass = ref($self);
- } else {
- $seqclass = 'Bio::PrimarySeq';
- $self->_attempt_to_load_Seq();
- }
- my %opts;
- if ($seqclass eq 'Bio::PrimarySeq') {
- # Since sequence is in a Seq object, it has already been validated.
- # We do not need to validate its trunc(), revcom(), etc
- $opts{ -direct } = 1;
- }
- return $seqclass, \%opts;
-}
-
-
-1;
diff --git a/lib/Bio/Seq/SeqFastaSpeedFactory.pm b/lib/Bio/Seq/SeqFastaSpeedFactory.pm
deleted file mode 100644
index 8fb1035..0000000
--- a/lib/Bio/Seq/SeqFastaSpeedFactory.pm
+++ /dev/null
@@ -1,149 +0,0 @@
-#
-# BioPerl module for Bio::Seq::SeqFastaSpeedFactory
-#
-# Please direct questions and support issues to <bioperl-l at bioperl.org>
-#
-# Cared for by Jason Stajich <jason at bioperl.org>
-#
-# Copyright Jason Stajich
-#
-# You may distribute this module under the same terms as perl itself
-
-# POD documentation - main docs before the code
-
-=head1 NAME
-
-Bio::Seq::SeqFastaSpeedFactory - Rapid creation of Bio::Seq objects through a factory
-
-=head1 SYNOPSIS
-
- use Bio::Seq::SeqFastaSpeedFactory;
- my $factory = Bio::Seq::SeqFastaSpeedFactory->new();
- my $seq = $factory->create( -seq => 'WYRAVLC',
- -id => 'name' );
-
-=head1 DESCRIPTION
-
-This factory was designed to build Bio::Seq objects as quickly as possible, but
-is not as generic as L<Bio::Seq::SeqFactory>. It can be used to create sequences
-from non-rich file formats. The L<Bio::SeqIO::fasta> sequence parser uses this
-factory.
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to
-the Bioperl mailing list. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-of the bugs and their resolution. Bug reports can be submitted via the
-web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR - Jason Stajich
-
-Email jason at bioperl.org
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object methods.
-Internal methods are usually preceded with a _
-
-=cut
-
-
-# Let the code begin...
-
-
-package Bio::Seq::SeqFastaSpeedFactory;
-use strict;
-
-use Bio::Seq;
-use Bio::PrimarySeq;
-
-use base qw(Bio::Root::Root Bio::Factory::SequenceFactoryI);
-
-
-=head2 new
-
- Title : new
- Usage : my $obj = Bio::Seq::SeqFastaSpeedFactory->new();
- Function: Builds a new Bio::Seq::SeqFastaSpeedFactory object
- Returns : Bio::Seq::SeqFastaSpeedFactory
- Args : None
-
-=cut
-
-sub new {
- my($class,@args) = @_;
- my $self = $class->SUPER::new(@args);
- return $self;
-}
-
-
-=head2 create
-
- Title : create
- Usage : my $seq = $seqbuilder->create(-seq => 'CAGT', -id => 'name');
- Function: Instantiates a new Bio::Seq object, correctly built but very
- fast, knowing stuff about Bio::PrimarySeq and Bio::Seq
- Returns : A Bio::Seq object
- Args : Initialization parameters for the sequence object we want:
- -id
- -primary_id
- -display_id
- -desc
- -seq
- -alphabet
-
-=cut
-
-sub create {
- my ($self,@args) = @_;
-
- my %param = @args;
- @param{ map { lc $_ } keys %param } = values %param; # lowercase keys
-
- my $sequence = $param{'-seq'};
- my $fulldesc = $param{'-desc'};
- my $id = defined $param{'-id'} ? $param{'-id'} : $param{'-primary_id'};
- my $alphabet = $param{'-alphabet'};
-
- my $seq = bless {}, 'Bio::Seq';
- my $t_pseq = $seq->{'primary_seq'} = bless {}, 'Bio::PrimarySeq';
- $t_pseq->{'seq'} = $sequence;
- $t_pseq->{'length'} = CORE::length($sequence);
- $t_pseq->{'desc'} = $fulldesc;
- $t_pseq->{'display_id'} = $id;
- $t_pseq->{'primary_id'} = $id;
- $seq->{'primary_id'} = $id; # currently Bio::Seq does not delegate this
- if( $sequence and !$alphabet ) {
- $t_pseq->_guess_alphabet();
- } elsif ( $sequence and $alphabet ) {
- $t_pseq->{'alphabet'} = $alphabet;
- }
-
- return $seq;
-}
-
-1;
-
diff --git a/lib/Bio/Seq/SimulatedRead.pm b/lib/Bio/Seq/SimulatedRead.pm
deleted file mode 100644
index fcc381e..0000000
--- a/lib/Bio/Seq/SimulatedRead.pm
+++ /dev/null
@@ -1,653 +0,0 @@
-package Bio::Seq::SimulatedRead;
-
-
-=head1 NAME
-
-Bio::Seq::SimulatedRead - Read with sequencing errors taken from a reference sequence
-
-=head1 SYNOPSIS
-
- use Bio::Seq::SimulatedRead;
- use Bio::PrimarySeq;
-
- # Create a reference sequence
- my $genome = Bio::PrimarySeq->new( -id => 'human_chr2',
- -seq => 'TAAAAAAACCCCTG',
- -desc => 'The human genome' );
-
- # A 10-bp error-free read taken from a genome
- my $read = Bio::Seq::SimulatedRead->new(
- -reference => $genome , # sequence to generate the read from
- -id => 'read001', # read ID
- -start => 3 , # start of the read on the genome forward strand
- -end => 12 , # end of the read on the genome forward strand
- -strand => 1 , # genome strand that the read is on
- );
-
- # Display the sequence of the read
- print $read->seq."\n";
-
- # Add a tag or MID to the beginning of the read
- $read->mid('ACGT');
-
- # Add sequencing errors (error positions are 1-based and relative to the
- # error-free MID-containing read)
- my $errors = {};
- $errors->{'8'}->{'+'} = 'AAA'; # insertion of AAA after residue 8
- $errors->{'1'}->{'%'} = 'G'; # substitution of residue 1 by a G
- $errors->{'4'}->{'-'} = undef; # deletion of residue 4
- $read->errors($errors);
-
- # Display the sequence of the read with errors
- print $read->seq."\n";
-
- # String representation of where the read came from and its errors
- print $read->desc."\n";
-
-=head1 DESCRIPTION
-
-This object is a simulated read with sequencing errors. The user can provide a
-reference sequence to take a read from, the position and orientation of the
-read on the reference sequence, and the sequencing errors to generate.
-
-The sequence of the read is automatically calculated based on this information.
-By default, the description of the reads contain tracking information and will
-look like this (Bioperl-style):
-
- reference=human_chr2 start=3 end=12 strand=-1 mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
-
-or Genbank-style:
-
- reference=human_chr2 position=complement(3..12) mid=ACGT errors=1%G,4-,8+AAA description="The human genome"
-
-Creating a simulated read follows these steps:
- 1/ Define the read start(), end(), strand() and qual_levels() if you want
- quality scores to be generated. Do not change these values once set because
- the read will not be updated.
- 2/ Specify the reference sequence that the read should be taken from. Once
- this is done, you have a fully functional read. Do not use the reference()
- method again after you have gone to the next step.
- 3/ Use mid() to input a MID (or tag or barcode) to add to the beginning of the
- read. You can change the MID until you go to next step.
- 4/ Give sequencing error specifications using errors() as the last step. You
- can do that as many times as you like, and the read will be updated.
-
-=head1 AUTHOR
-
-Florent E Angly E<lt>florent . angly @ gmail-dot-comE<gt>.
-
-Copyright (c) 2011 Florent E Angly.
-
-This library is free software; you can redistribute it under the GNU General
-Public License version 3.
-
-=cut
-
-
-use strict;
-use warnings;
-use Bio::LocatableSeq;
-use base qw( Bio::Seq::Quality Bio::LocatableSeq );
-
-
-=head2 new
-
- Title : new
- Function : Create a new simulated read object
- Usage : my $read = Bio::Seq::SimulatedRead->new(
- -id => 'read001',
- -reference => $seq_obj ,
- -errors => $errors ,
- -start => 10 ,
- -end => 135 ,
- -strand => 1 ,
- );
- Arguments: -reference => Bio::SeqI, Bio::PrimarySeqI object representing the
- reference sequence to take the read from. See
- reference().
- -errors => Hashref representing the position of errors in the read
- See errors().
- -mid => String of a MID to prepend to the read. See mid().
- -track => Track where the read came from in the read description?
- See track().
- -coord_style => Define what coordinate system to use. See coord_style().
- All other methods from Bio::LocatableSeq are available.
- Returns : new Bio::Seq::SimulatedRead object
-
-=cut
-
-sub new {
- my ($class, @args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($qual_levels, $reference, $mid, $errors, $track, $coord_style) =
- $self->_rearrange([qw(QUAL_LEVELS REFERENCE MID ERRORS TRACK COORD_STYLE)], @args);
- $coord_style = defined $coord_style ? $coord_style : 'bioperl';
- $self->coord_style($coord_style);
- $track = defined $track ? $track : 1;
- $self->track($track);
- $qual_levels = defined $qual_levels ? $qual_levels : [];
- $self->qual_levels($qual_levels) if defined $qual_levels;
- $self->reference($reference) if defined $reference;
- $self->mid($mid) if defined $mid;
- $self->{_mutated} = 0;
- $self->errors($errors) if defined $errors;
- return $self;
-}
-
-
-=head2 qual_levels
-
- Title : qual_levels
- Function : Get or set the quality scores to give to the read. By default, if your
- reference sequence does not have quality scores, no quality scores
- are generated for the simulated read. The generated quality scores
- are very basic. If a residue is error-free, it gets the quality score
- defined for good residues. If the residue has an error (is an
- addition or a mutation), the residue gets the quality score specified
- for bad residues. Call the qual_levels() method before using the
- reference() method.
- Usage : my $qual_levels = $read->qual_levels( );
- Arguments: Array reference containing the quality scores to use for:
- 1/ good residues (e.g. 30)
- 2/ bad residues (e.g. 10)
- Returns : Array reference containing the quality scores to use.
-
-=cut
-
-sub qual_levels {
- my ($self, $qual_levels) = @_;
- if (defined $qual_levels) {
- if ( (scalar @$qual_levels != 0) && (scalar @$qual_levels != 2) ) {
- $self->throw("The quality score specification must define the score".
- " to use for good and for bad residues\n");
- }
- $self->{qual_levels} = $qual_levels;
- }
- return $self->{qual_levels};
-}
-
-
-=head2 reference
-
- Title : reference
- Function : Get or set the reference sequence that the read comes from. Once the
- reference has been set, you have a functional simulated read which
- supports all the Bio::LocatableSeq methods. This method must be
- called after qual_levels() but before mid() or errors().
- Usage : my $seq_obj = $read->reference();
- Arguments: Bio::SeqI or Bio::PrimarySeqI object
- Returns : Bio::SeqI or Bio::PrimarySeqI object
-
-=cut
-
-sub reference {
- my ($self, $reference) = @_;
- if (defined $reference) {
- # Sanity check 1
- if ( (not $reference->isa('Bio::SeqI')) && (not $reference->isa('Bio::PrimarySeqI')) ) {
- $self->throw("Expected a Bio::SeqI object as reference, but got: $reference\n");
- }
- # Sanity check 2
- if ($self->{mid} || $self->{errors}) {
- $self->throw("Cannot change the reference sequence after an MID or ".
- "sequencing errors have been added to the read\n");
- }
- # Use beginning of reference sequence as start default
- if (not defined $self->start) {
- $self->start(1);
- }
- # Use end of reference sequence as end default
- if (not defined $self->end) {
- $self->end($reference->length);
- }
- # Use strand 1 as strand default
- if (not defined $self->strand) {
- $self->strand(1);
- }
- # Set the reference sequence object
- $self->{reference} = $reference;
- # Create a sequence, quality scores and description from the reference
- $self->_create_seq;
- $self->_create_qual if scalar @{$self->qual_levels};
- $self->_create_desc if $self->track;
- }
- return $self->{reference};
-}
-
-
-sub _create_seq {
- my $self = shift;
- # Get a truncation of the reference sequence
- my $reference = $self->reference;
- my $read_obj = $reference->trunc( $self->start, $self->end );
- # Reverse complement the read if needed
- if ($self->strand == -1) {
- $read_obj = $read_obj->revcom();
- }
- $self->seq($read_obj->seq);
- return 1;
-}
-
-
-sub _create_qual {
- my $self = shift;
- $self->qual([ ($self->qual_levels->[0]) x ($self->end - $self->start + 1) ]);
- return 1;
-}
-
-
-sub _create_desc {
- # Create the read description of the error-free read
- my $self = shift;
- # Reference sequence ID
- my $desc_str = '';
- my $ref_id = $self->reference->id;
- if (defined $ref_id) {
- $desc_str .= 'reference='.$ref_id.' ';
- }
- # Position of read on reference sequence: start, end and strand
- my $strand = $self->strand;
- if ($self->coord_style eq 'bioperl') {
- $desc_str .= 'start='.$self->start.' end='.$self->end.' ';
- if (defined $strand) {
- # Strand of the reference sequence that the read is on
- $strand = '+1' if $strand == 1;
- $desc_str .= 'strand='.$strand.' ';
- }
- } else {
- if ( (defined $strand) && ($strand == -1) ) {
- # Reverse complemented
- $desc_str .= 'position=complement('.$self->start.'..'.$self->end.') ';
- } else {
- # Regular (forward) orientation
- $desc_str .= 'position='.$self->start.'..'.$self->end.' ';
- }
- }
- # Description of the original sequence
- my $ref_desc = $self->reference->desc;
- if ( (defined $self->reference->desc) && ($self->reference->desc !~ m/^\s*$/) ) {
- $ref_desc =~ s/"/\\"/g; # escape double-quotes to \"
- $desc_str .= 'description="'.$ref_desc.'" ';
- }
- $desc_str =~ s/\s$//g;
- # Record new description
- $self->desc($desc_str);
- return 1;
-}
-
-
-=head2 mid
-
- Title : mid
- Function : Get or set a multiplex identifier (or MID, or tag, or barcode) to
- add to the read. By default, no MID is used. This method must be
- called after reference() but before errors().
- Usage : my $mid = read->mid();
- Arguments: MID sequence string (e.g. 'ACGT')
- Returns : MID sequence string
-
-=cut
-
-sub mid {
- my ($self, $mid) = @_;
- if (defined $mid) {
- # Sanity check 1
- if (not defined $self->reference) {
- $self->throw("Cannot add MID because the reference sequence was not ".
- "set\n");
- }
- # Sanity check 2
- if ($self->{errors}) {
- $self->throw("Cannot add an MID after sequencing errors have been ".
- "introduced in the read\n");
- }
- # Sanity check 3
- if (not $self->validate_seq($mid)) {
- $self->throw("MID is not a valid DNA sequence\n");
- }
- # Update sequence, quality scores and description with the MID
- $self->_update_seq_mid($mid);
- $self->_update_qual_mid($mid) if scalar @{$self->qual_levels};
- $self->_update_desc_mid($mid) if $self->track;
- # Set the MID value
- $self->{mid} = $mid;
- }
- return $self->{mid}
-}
-
-
-sub _update_seq_mid {
- # Update the MID of a sequence
- my ($self, $mid) = @_;
- # Remove old MID
- my $seq = $self->seq;
- my $old_mid = $self->{mid};
- if (defined $old_mid) {
- $seq =~ s/^$old_mid//;
- }
- # Add new MID
- $seq = $mid . $seq;
- $self->seq( $seq );
- return 1;
-}
-
-
-sub _update_qual_mid {
- # Update the MID of a quality scores
- my ($self, $mid) = @_;
- # Remove old MID
- my $qual = $self->qual;
- my $old_mid = $self->{mid};
- if (defined $old_mid) {
- splice @$qual, 0, length($old_mid);
- }
- $qual = [($self->qual_levels->[0]) x length($mid), @$qual];
- $self->qual( $qual );
- return 1;
-}
-
-
-sub _update_desc_mid {
- # Update MID specifications in the read description
- my ($self, $mid) = @_;
- if ($mid) {
- # Sequencing errors introduced in the read
- my $mid_str = "mid=".$mid;
- my $desc_str = $self->desc;
- $desc_str =~ s/((position|strand)=\S+)( mid=\S+)?/$1 $mid_str/g;
- $self->desc( $desc_str );
- }
- return 1;
-}
-
-
-=head2 errors
-
- Title : errors
- Function : Get or set the sequencing errors and update the read. By default, no
- errors are made. This method must be called after the mid() method.
- Usage : my $errors = $read->errors();
- Arguments: Reference to a hash of the position and nature of sequencing errors.
- The positions are 1-based relative to the error-free MID-containing
- read (not relative to the reference sequence). For example:
- $errors->{34}->{'%'} = 'T' ; # substitution of residue 34 by a T
- $errors->{23}->{'+'} = 'GG' ; # insertion of GG after residue 23
- $errors->{45}->{'-'} = undef; # deletion of residue 45
- Substitutions and deletions are for a single residue, but additions
- can be additions of several residues.
- An alternative way to specify errors is by using array references
- instead of scalar for the hash values. This allows to specify
- redundant mutations. For example, the case presented above would
- result in the same read sequence as the example below:
- $errors->{34}->{'%'} = ['C', 'T'] ; # substitution by a C and then a T
- $errors->{23}->{'+'} = ['G', 'G'] ; # insertion of G and then a G
- $errors->{45}->{'-'} = [undef, undef]; # deletion of residue, and again
- Returns : Reference to a hash of the position and nature of sequencing errors.
-
-=cut
-
-sub errors {
- my ($self, $errors) = @_;
- if (defined $errors) {
- # Verify that we have a hashref
- if ( (not defined ref $errors) || (not ref $errors eq 'HASH') ) {
- $self->throw("Error specification has to be a hashref. Got: $errors\n");
- }
- # Verify that we have a reference sequence
- if (not defined $self->reference) {
- $self->throw("Cannot add errors because the reference sequence was not set\n");
- }
- # Convert scalar error specs to arrayref specs
- $errors = $self->_scalar_to_arrayref($errors);
- # Check validity of error specifications
- $errors = $self->_validate_error_specs($errors);
- # Set the error specifications
- $self->{errors} = $errors;
- # Need to recalculate the read from the reference if previously mutated
- if ($self->{_mutated}) {
- $self->_create_seq;
- $self->_create_qual if scalar @{$self->qual_levels};
- $self->_create_desc if $self->track;
- }
- # Now mutate the read, quality score and description
- $self->_update_seq_errors;
- $self->_update_qual_errors if scalar @{$self->qual_levels};
- $self->_update_desc_errors if $self->track;
-
- }
- return $self->{errors};
-}
-
-
-sub _scalar_to_arrayref {
- # Replace the scalar values in the error specs by more versatile arrayrefs
- my ($self, $errors) = @_;
- while ( my ($pos, $ops) = each %$errors ) {
- while ( my ($op, $res) = each %$ops ) {
- if (ref $res eq '') {
- my $arr = [ split //, ($res || '') ];
- $arr = [undef] if scalar @$arr == 0;
- $$errors{$pos}{$op} = $arr;
- }
- }
- }
- return $errors;
-}
-
-
-sub _validate_error_specs {
- # Clean error specifications and warn of any issues encountered
- my ($self, $errors) = @_;
- my %valid_ops = ('%' => undef, '-' => undef, '+' => undef); # valid operations
-
- # Calculate read length
- my $read_length = $self->length;
- while ( my ($pos, $ops) = each %$errors ) {
-
- # Position cannot be no longer than the read length
- if ( (defined $read_length) && ($pos > $read_length) ) {
- $self->warn("Position $pos is beyond end of read ($read_length ".
- "residues). Skipping errors specified at this position.\n");
- delete $errors->{$pos};
- }
-
- # Position has to be 0+ for addition, 1+ for substitution and deletion
- if ( $pos < 1 && (exists $ops->{'%'} || exists $ops->{'-'}) ) {
- $self->warn("Positions of substitutions and deletions have to be ".
- "strictly positive but got $pos. Skipping substitution or deletion".
- " at this position\n");
- delete $ops->{'%'};
- delete $ops->{'-'};
- }
- if ( $pos < 0 && exists $ops->{'+'}) {
- $self->warn("Positions of additions have to be zero or more. ".
- "Skipping addition at position $pos.\n");
- delete $ops->{'+'};
- }
-
- # Valid operations are '%', '+' and '-'
- while ( my ($op, $res) = each %$ops ) {
- if (not exists $valid_ops{$op}) {
- $self->warn("Skipping unknown error operation '$op' at position".
- " $pos\n");
- delete $ops->{$op};
- } else {
- # Substitutions: have to have at least one residue to substitute
- if ( ($op eq '%') && (scalar @$res < 1) ) {
- $self->warn("At least one residue must be provided for substitutions,".
- "but got ".scalar(@$res)." at position $pos.\n");
- }
- # Additions: have to have at least one residue to add
- if ( ($op eq '+') && (scalar @$res < 1) ) {
- $self->warn("At least one residue must be provided for additions,".
- "but got ".scalar(@$res)." at position $pos.\n");
- }
- # Deletions
- if ( ($op eq '-') && (scalar @$res < 1) ) {
- $self->warn("At least one 'undef' must be provided for deletions,".
- "but got ".scalar(@$res)." at position $pos.\n");
- }
- }
- }
-
- delete $errors->{$pos} unless scalar keys %$ops;
- }
-
- return $errors;
-}
-
-
-sub _update_seq_errors {
- my $self = shift;
- my $seq_str = $self->seq;
- my $errors = $self->errors;
- if (scalar keys %$errors > 0) {
- my $off = 0;
- for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
- # Process sequencing errors at that position
- for my $type ( '%', '-', '+' ) {
- next if not exists $$errors{$pos}{$type};
- my $arr = $$errors{$pos}{$type};
- if ($type eq '%') {
- # Substitution at residue position. If there are multiple
- # substitutions to do, directly skip to the last one.
- substr $seq_str, $pos - 1 + $off, 1, $$arr[-1];
- } elsif ($type eq '-') {
- # Deletion at residue position
- substr $seq_str, $pos - 1 + $off, 1, '';
- $off--;
- } elsif ($type eq '+') {
- # Insertion after residue position
- substr $seq_str, $pos + $off, 0, join('', @$arr);
- $off += scalar @$arr;
- }
- }
- }
- $self->{_mutated} = 1;
- } else {
- $self->{_mutated} = 0;
- }
- $self->seq($seq_str);
- return 1;
-}
-
-
-sub _update_qual_errors {
- my $self = shift;
- my $qual = $self->qual;
- my $errors = $self->errors;
- my $bad_qual = $self->qual_levels->[1];
- if (scalar keys %$errors > 0) {
- my $off = 0;
- for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
- # Process sequencing errors at that position
- for my $type ( '%', '-', '+' ) {
- next if not exists $$errors{$pos}{$type};
- my $arr = $$errors{$pos}{$type};
- if ($type eq '%') {
- # Substitution at residue position
- splice @$qual, $pos - 1 + $off, 1, $bad_qual;
- } elsif ($type eq '-') {
- # Deletion at residue position
- splice @$qual, $pos - 1 + $off, 1;
- $off--;
- } elsif ($type eq '+') {
- # Insertion after residue position
- splice @$qual, $pos + $off, 0, ($bad_qual) x scalar(@$arr);
- $off += scalar @$arr;
- }
- }
- }
- }
- $self->qual($qual);
- return 1;
-}
-
-
-sub _update_desc_errors {
- # Add or update error specifications in the read description
- my $self = shift;
- my $errors = $self->errors;
- if (defined $errors and scalar keys %$errors > 0) {
- # Sequencing errors introduced in the read
- my $err_str = 'errors=';
- for my $pos ( sort {$a <=> $b} (keys %$errors) ) {
- # Process sequencing errors at that position
- for my $type ( '%', '-', '+' ) {
- next if not exists $$errors{$pos}{$type};
- for my $val ( @{$$errors{$pos}{$type}} ) {
- $val = '' if not defined $val;
- $err_str .= $pos . $type . $val . ',';
- }
- }
- }
- $err_str =~ s/,$//;
- my $desc_str = $self->desc;
- $desc_str =~ s/((position|strand)=\S+( mid=\S+)?)( errors=\S+)?/$1 $err_str/g;
- $self->desc( $desc_str );
- }
- return 1;
-}
-
-
-=head2 track
-
- Title : track
- Function : Get or set the tracking status in the read description. By default,
- tracking is on. This method can be called at any time.
- Usage : my $track = $read->track();
- Arguments: 1 for tracking, 0 otherwise
- Returns : 1 for tracking, 0 otherwise
-
-=cut
-
-sub track {
- my ($self, $track) = @_;
- if (defined $track) {
- if (defined $self->reference) {
- if ($track == 1) {
- $self->_create_desc;
- $self->_update_desc_mid($self->mid);
- $self->_update_desc_errors;
- } else {
- $self->desc(undef);
- }
- }
- $self->{track} = $track;
- }
- return $self->{track};
-}
-
-
-=head2 coord_style
-
- Title : coord_style
- Function : When tracking is on, define which 1-based coordinate system to use
- in the read description:
- * 'bioperl' uses the start, end and strand keywords (default),
- similarly to the GFF3 format. Example:
- start=1 end=10 strand=+1
- start=1 end=10 strand=-1
- * 'genbank' does only provide the position keyword. Example:
- position=1..10
- position=complement(1..10)
- Usage : my $coord_style = $read->track();
- Arguments: 'bioperl' or 'genbank'
- Returns : 'bioperl' or 'genbank'
-
-=cut
-
-sub coord_style {
- my ($self, $coord_style) = @_;
- my %styles = ( 'bioperl' => undef, 'genbank' => undef );
- if (defined $coord_style) {
- if (not exists $styles{$coord_style}) {
- die "Error: Invalid coordinate style '$coord_style'\n";
- }
- $self->{coord_style} = $coord_style;
- }
- return $self->{coord_style};
-}
-
-
-1;
diff --git a/lib/Bio/SeqFeature/Amplicon.pm b/lib/Bio/SeqFeature/Amplicon.pm
deleted file mode 100644
index b17f3a2..0000000
--- a/lib/Bio/SeqFeature/Amplicon.pm
+++ /dev/null
@@ -1,168 +0,0 @@
-#
-# BioPerl module for Bio::SeqFeature::Amplicon
-#
-# Please direct questions and support issues to <bioperl-l at bioperl.org>
-#
-# Copyright Florent Angly
-#
-# You may distribute this module under the same terms as perl itself
-
-
-=head1 NAME
-
-Bio::SeqFeature::Amplicon - Amplicon feature
-
-=head1 SYNOPSIS
-
- # Amplicon with explicit sequence
- use Bio::SeqFeature::Amplicon;
- my $amplicon = Bio::SeqFeature::Amplicon->new(
- -seq => $seq_object,
- -fwd_primer => $primer_object_1,
- -rev_primer => $primer_object_2,
- );
-
- # Amplicon with implicit sequence
- use Bio::Seq;
- my $template = Bio::Seq->new( -seq => 'AAAAACCCCCGGGGGTTTTT' );
- $amplicon = Bio::SeqFeature::Amplicon->new(
- -start => 6,
- -end => 15,
- );
- $template->add_SeqFeature($amplicon);
- print "Amplicon start : ".$amplicon->start."\n";
- print "Amplicon end : ".$amplicon->end."\n";
- print "Amplicon sequence: ".$amplicon->seq->seq."\n";
- # Amplicon sequence should be 'CCCCCGGGGG'
-
-=head1 DESCRIPTION
-
-Bio::SeqFeature::Amplicon extends L<Bio::SeqFeature::Subseq> to represent an
-amplicon sequence and optional primer sequences.
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via
-the web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR
-
-Florent Angly <florent.angly at gmail.com>
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-package Bio::SeqFeature::Amplicon;
-
-use strict;
-
-use base qw(Bio::SeqFeature::SubSeq);
-
-=head2 new
-
- Title : new()
- Usage : my $amplicon = Bio::SeqFeature::Amplicon( -seq => $seq_object );
- Function: Instantiate a new Bio::SeqFeature::Amplicon object
- Args : -seq , the sequence object or sequence string of the amplicon (optional)
- -fwd_primer , a Bio::SeqFeature primer object with specified location on amplicon (optional)
- -rev_primer , a Bio::SeqFeature primer object with specified location on amplicon (optional)
- Returns : A Bio::SeqFeature::Amplicon object
-
-=cut
-
-sub new {
- my ($class, @args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($fwd_primer, $rev_primer) =
- $self->_rearrange([qw(FWD_PRIMER REV_PRIMER)], @args);
- $fwd_primer && $self->fwd_primer($fwd_primer);
- $rev_primer && $self->rev_primer($rev_primer);
- return $self;
-}
-
-
-sub _primer {
- # Get or set a primer. Type is either 'fwd' or 'rev'.
- my ($self, $type, $primer) = @_;
- if (defined $primer) {
- if ( not(ref $primer) || not $primer->isa('Bio::SeqFeature::Primer') ) {
- $self->throw("Expected a primer object but got a '".ref($primer)."'\n");
- }
- if ( not defined $self->location ) {
- $self->throw("Location of $type primer on amplicon is not known. ".
- "Use start(), end() or location() to set it.");
- }
- $primer->primary_tag($type.'_primer');
- $self->add_SeqFeature($primer);
- }
- return (grep { $_->primary_tag eq $type.'_primer' } $self->get_SeqFeatures)[0];
-}
-
-
-=head2 fwd_primer
-
- Title : fwd_primer
- Usage : my $primer = $feat->fwd_primer();
- Function: Get or set the forward primer. When setting it, the primary tag
- 'fwd_primer' is added to the primer and its start, stop and strand
- attributes are set if needed, assuming that the forward primer is
- at the beginning of the amplicon and the reverse primer at the end.
- Args : A Bio::SeqFeature::Primer object (optional)
- Returns : A Bio::SeqFeature::Primer object
-
-=cut
-
-sub fwd_primer {
- my ($self, $primer) = @_;
- return $self->_primer('fwd', $primer);
-}
-
-
-=head2 rev_primer
-
- Title : rev_primer
- Usage : my $primer = $feat->rev_primer();
- Function: Get or set the reverse primer. When setting it, the primary tag
- 'rev_primer' is added to the primer.
- Args : A Bio::SeqFeature::Primer object (optional)
- Returns : A Bio::SeqFeature::Primer object
-
-=cut
-
-sub rev_primer {
- my ($self, $primer) = @_;
- return $self->_primer('rev', $primer);
-}
-
-
-1;
diff --git a/lib/Bio/SeqFeature/Primer.pm b/lib/Bio/SeqFeature/Primer.pm
deleted file mode 100644
index 2d20684..0000000
--- a/lib/Bio/SeqFeature/Primer.pm
+++ /dev/null
@@ -1,335 +0,0 @@
-#
-# BioPerl module for Bio::SeqFeature::Primer
-#
-# This is the original copyright statement. I have relied on Chad's module
-# extensively for this module.
-#
-# Copyright (c) 1997-2001 bioperl, Chad Matsalla. All Rights Reserved.
-# This module is free software; you can redistribute it and/or
-# modify it under the same terms as Perl itself.
-#
-# Copyright Chad Matsalla
-#
-# You may distribute this module under the same terms as perl itself
-# POD documentation - main docs before the code
-#
-# But I have modified lots of it, so I guess I should add:
-#
-# Copyright (c) 2003 bioperl, Rob Edwards. All Rights Reserved.
-# This module is free software; you can redistribute it and/or
-# modify it under the same terms as Perl itself.
-#
-# Copyright Rob Edwards
-#
-# You may distribute this module under the same terms as perl itself
-# POD documentation - main docs before the code
-
-=head1 NAME
-
-Bio::SeqFeature::Primer - Primer Generic SeqFeature
-
-=head1 SYNOPSIS
-
- use Bio::SeqFeature::Primer;
-
- # Primer object with explicitly-defined sequence object or sequence string
- my $primer = Bio::SeqFeature::Primer->new( -seq => 'ACGTAGCT' );
- $primer->display_name('test_id');
- print "These are the details of the primer:\n".
- "Name: ".$primer->display_name."\n".
- "Tag: ".$primer->primary_tag."\n". # always 'Primer'
- "Sequence: ".$primer->seq->seq."\n".
- "Tm: ".$primer->Tm."\n\n"; # melting temperature
-
- # Primer object with implicit sequence object
- # It is a lighter approach for when the primer location on a template is known
- use Bio::Seq;
- my $template = Bio::Seq->new( -seq => 'ACGTAGCTCTTTTCATTCTGACTGCAACG' );
- $primer = Bio::SeqFeature::Primer->new( -start => 1, -end =>5, -strand => 1 );
- $template->add_SeqFeature($primer);
- print "Primer sequence is: ".$primer->seq->seq."\n";
- # Primer sequence is 'ACGTA'
-
-=head1 DESCRIPTION
-
-This module handles PCR primer sequences. The L<Bio::SeqFeature::Primer> object
-is a L<Bio::SeqFeature::Subseq> object that can additionally contain a primer
-sequence and its coordinates on a template sequence. The primary_tag() for this
-object is 'Primer'. A method is provided to calculate the melting temperature Tm
-of the primer. L<Bio::SeqFeature::Primer> objects are useful to build
-L<Bio::Seq::PrimedSeq> amplicon objects such as the ones returned by
-L<Bio::Tools::Primer3>.
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via the
-web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR
-
-Rob Edwards, redwards at utmem.edu
-
-The original concept and much of the code was written by
-Chad Matsalla, bioinformatics1 at dieselwurks.com
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-package Bio::SeqFeature::Primer;
-
-use strict;
-use Bio::PrimarySeq;
-use Bio::Tools::SeqStats;
-
-use base qw(Bio::SeqFeature::SubSeq);
-
-
-=head2 new()
-
- Title : new()
- Usage : my $primer = Bio::SeqFeature::Primer( -seq => $seq_object );
- Function: Instantiate a new Bio::SeqFeature::Primer object
- Returns : A Bio::SeqFeature::Primer object
- Args : -seq , a sequence object or a sequence string (optional)
- -id , the ID to give to the primer sequence, not feature (optional)
-
-=cut
-
-sub new {
- my ($class, %args) = @_;
-
- # Legacy stuff
- my $sequence = delete $args{-sequence};
- if ($sequence) {
- Bio::Root::Root->deprecated(
- -message => 'Creating a Bio::SeqFeature::Primer with -sequence is deprecated. Use -seq instead.',
- -warn_version => '1.006',
- -throw_version => '1.008',
- );
- $args{-seq} = $sequence;
- }
-
- # Initialize Primer object
- my $self = $class->SUPER::new(%args);
- my ($id) = $self->_rearrange([qw(ID)], %args);
- $id && $self->seq->id($id);
- $self->primary_tag('Primer');
- return $self;
-}
-
-
-# Bypass B::SF::Generic's location() when a string is passed (for compatibility)
-
-sub location {
- my ($self, $location) = @_;
- if ($location) {
- if ( not ref $location ) {
- # Use location as a string for backward compatibility
- Bio::Root::Root->deprecated(
- -message => 'Passing a string to location() is deprecated. Pass a Bio::Location::Simple object or use start() and end() instead.',
- -warn_version => '1.006',
- -throw_version => '1.008',
- );
- $self->{'_location'} = $location;
- } else {
- $self->SUPER::location($location);
- }
- }
- return $self->SUPER::location;
-}
-
-
-=head2 Tm()
-
- Title : Tm()
- Usage : my $tm = $primer->Tm(-salt => 0.05, -oligo => 0.0000001);
- Function: Calculate the Tm (melting temperature) of the primer
- Returns : A scalar containing the Tm.
- Args : -salt : set the Na+ concentration on which to base the calculation
- (default=0.05 molar).
- : -oligo : set the oligo concentration on which to base the
- calculation (default=0.00000025 molar).
- Notes : Calculation of Tm as per Allawi et. al Biochemistry 1997
- 36:10581-10594. Also see documentation at
- http://www.idtdna.com/Scitools/Scitools.aspx as they use this
- formula and have a couple nice help pages. These Tm values will be
- about are about 0.5-3 degrees off from those of the idtdna web tool.
- I don't know why.
-
- This was suggested by Barry Moore (thanks!). See the discussion on
- the bioperl-l with the subject "Bio::SeqFeature::Primer Calculating
- the PrimerTM"
-
-=cut
-
-sub Tm {
- my ($self, %args) = @_;
- my $salt_conc = 0.05; # salt concentration (molar units)
- my $oligo_conc = 0.00000025; # oligo concentration (molar units)
- if ($args{'-salt'}) {
- # Accept object defined salt concentration
- $salt_conc = $args{'-salt'};
- }
- if ($args{'-oligo'}) {
- # Accept object defined oligo concentration
- $oligo_conc = $args{'-oligo'};
- }
- my $seqobj = $self->seq();
- my $length = $seqobj->length();
- my $sequence = uc $seqobj->seq();
- my @dinucleotides;
- my $enthalpy;
- my $entropy;
- # Break sequence string into an array of all possible dinucleotides
- while ($sequence =~ /(.)(?=(.))/g) {
- push @dinucleotides, $1.$2;
- }
- # Build a hash with the thermodynamic values
- my %thermo_values = ('AA' => {'enthalpy' => -7.9,
- 'entropy' => -22.2},
- 'AC' => {'enthalpy' => -8.4,
- 'entropy' => -22.4},
- 'AG' => {'enthalpy' => -7.8,
- 'entropy' => -21},
- 'AT' => {'enthalpy' => -7.2,
- 'entropy' => -20.4},
- 'CA' => {'enthalpy' => -8.5,
- 'entropy' => -22.7},
- 'CC' => {'enthalpy' => -8,
- 'entropy' => -19.9},
- 'CG' => {'enthalpy' => -10.6,
- 'entropy' => -27.2},
- 'CT' => {'enthalpy' => -7.8,
- 'entropy' => -21},
- 'GA' => {'enthalpy' => -8.2,
- 'entropy' => -22.2},
- 'GC' => {'enthalpy' => -9.8,
- 'entropy' => -24.4},
- 'GG' => {'enthalpy' => -8,
- 'entropy' => -19.9},
- 'GT' => {'enthalpy' => -8.4,
- 'entropy' => -22.4},
- 'TA' => {'enthalpy' => -7.2,
- 'entropy' => -21.3},
- 'TC' => {'enthalpy' => -8.2,
- 'entropy' => -22.2},
- 'TG' => {'enthalpy' => -8.5,
- 'entropy' => -22.7},
- 'TT' => {'enthalpy' => -7.9,
- 'entropy' => -22.2},
- 'A' => {'enthalpy' => 2.3,
- 'entropy' => 4.1},
- 'C' => {'enthalpy' => 0.1,
- 'entropy' => -2.8},
- 'G' => {'enthalpy' => 0.1,
- 'entropy' => -2.8},
- 'T' => {'enthalpy' => 2.3,
- 'entropy' => 4.1}
- );
- # Loop through dinucleotides and calculate cumulative enthalpy and entropy values
- for (@dinucleotides) {
- $enthalpy += $thermo_values{$_}{enthalpy};
- $entropy += $thermo_values{$_}{entropy};
- }
- # Account for initiation parameters
- $enthalpy += $thermo_values{substr($sequence, 0, 1)}{enthalpy};
- $entropy += $thermo_values{substr($sequence, 0, 1)}{entropy};
- $enthalpy += $thermo_values{substr($sequence, -1, 1)}{enthalpy};
- $entropy += $thermo_values{substr($sequence, -1, 1)}{entropy};
- # Symmetry correction
- $entropy -= 1.4;
- my $r = 1.987; # molar gas constant
- my $tm = $enthalpy * 1000 / ($entropy + ($r * log($oligo_conc))) - 273.15 + (12* (log($salt_conc)/log(10)));
-
- return $tm;
- }
-
-=head2 Tm_estimate
-
- Title : Tm_estimate
- Usage : my $tm = $primer->Tm_estimate(-salt => 0.05);
- Function: Estimate the Tm (melting temperature) of the primer
- Returns : A scalar containing the Tm.
- Args : -salt set the Na+ concentration on which to base the calculation.
- Notes : This is only an estimate of the Tm that is kept in for comparative
- reasons. You should probably use Tm instead!
-
- This Tm calculations are taken from the Primer3 docs: They are
- based on Bolton and McCarthy, PNAS 84:1390 (1962)
- as presented in Sambrook, Fritsch and Maniatis,
- Molecular Cloning, p 11.46 (1989, CSHL Press).
-
- Tm = 81.5 + 16.6(log10([Na+])) + .41*(%GC) - 600/length
-
- where [Na+] is the molar sodium concentration, %GC is the
- %G+C of the sequence, and length is the length of the sequence.
-
- However.... I can never get this calculation to give me the same result
- as primer3 does. Don't ask why, I never figured it out. But I did
- want to include a Tm calculation here because I use these modules for
- other things besides reading primer3 output.
-
- The primer3 calculation is saved as 'PRIMER_LEFT_TM' or 'PRIMER_RIGHT_TM'
- and this calculation is saved as $primer->Tm so you can get both and
- average them!
-
-=cut
-
-sub Tm_estimate {
-
- # This should probably be put into seqstats as it is more generic, but what the heck.
-
- my ($self, %args) = @_;
- my $salt = 0.2;
- if ($args{'-salt'}) {
- $salt = $args{'-salt'}
- };
- my $seqobj = $self->seq();
- my $length = $seqobj->length();
- my $seqdata = Bio::Tools::SeqStats->count_monomers($seqobj);
- my $gc=$$seqdata{'G'} + $$seqdata{'C'};
- my $percent_gc = ($gc/$length)*100;
-
- my $tm = 81.5+(16.6*(log($salt)/log(10)))+(0.41*$percent_gc) - (600/$length);
-
- return $tm;
-}
-
-=head2 primary_tag, source_tag, location, start, end, strand...
-
-The documentation of L<Bio::SeqFeature::Generic> describes all the methods that
-L<Bio::SeqFeature::Primer> object inherit.
-
-=cut
-
-1;
diff --git a/lib/Bio/SeqFeature/SubSeq.pm b/lib/Bio/SeqFeature/SubSeq.pm
deleted file mode 100644
index 300bd53..0000000
--- a/lib/Bio/SeqFeature/SubSeq.pm
+++ /dev/null
@@ -1,208 +0,0 @@
-#
-# BioPerl module for Bio::SeqFeature::SubSeq
-#
-# Please direct questions and support issues to <bioperl-l at bioperl.org>
-#
-# Copyright Florent Angly
-#
-# You may distribute this module under the same terms as perl itself
-
-
-=head1 NAME
-
-Bio::SeqFeature::SubSeq - Feature representing a subsequence
-
-=head1 SYNOPSIS
-
- # SubSeq with implicit sequence
- use Bio::Seq;
- my $template = Bio::Seq->new( -seq => 'AAAAACCCCCGGGGGTTTTT' );
- $subseq = Bio::SeqFeature::Amplicon->new(
- -start => 6,
- -end => 15,
- -template => $template,
- );
- print "Subsequence is: ".$amplicon->seq->seq."\n"; # Should be 'CCCCCGGGGG'
-
- # SubSeq with explicit sequence
- use Bio::SeqFeature::Subseq;
- my $subseq = Bio::SeqFeature::Amplicon->new(
- -seq => $seq_object,
- );
-
-=head1 DESCRIPTION
-
-Bio::SeqFeature::SubSeq extends L<Bio::SeqFeature::Generic> features to
-represent a subsequence. When this feature is attached to a template sequence,
-the sequence of feature is the subsequence of the template at this location. The
-purpose of this class is to represent a sequence as a feature without having to
-explictly store its sequence string.
-
-Of course, you might have reasons to explicitly set a sequence. In that case,
-note that the length of the sequence is allowed to not match the position of the
-feature. For example, you can set sequence of length 10 in a SubSeq feature that
-spans positions 30 to 50 of the template if you so desire.
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via
-the web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR
-
-Florent Angly <florent.angly at gmail.com>
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-package Bio::SeqFeature::SubSeq;
-
-use strict;
-
-use base qw(Bio::SeqFeature::Generic);
-
-=head2 new
-
- Title : new()
- Usage : my $subseq = Bio::SeqFeature::SubSeq( -start => 1, -end => 10, -strand => -1);
- Function: Instantiate a new Bio::SeqFeature::SubSeq feature object
- Args : -seq , the sequence object or sequence string of the feature (optional)
- -template , attach the feature to the provided parent template sequence or feature (optional).
- Note that you must specify the feature location to do this.
- -start, -end, -location, -strand and all other L<Bio::SeqFeature::Generic> argument can be used.
- Returns : A Bio::SeqFeature::SubSeq object
-
-=cut
-
-sub new {
- my ($class, @args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($seq, $template) = $self->_rearrange([qw(SEQ TEMPLATE)], @args);
- if (defined $seq) {
- # Set the subsequence explicitly
- if (not ref $seq) {
- # Convert string to sequence object
- $seq = Bio::PrimarySeq->new( -seq => $seq );
- } else {
- # Sanity check
- if (not $seq->isa('Bio::PrimarySeqI')) {
- $self->throw("Expected a sequence object but got a '".ref($seq)."'\n");
- }
- }
- $self->seq($seq);
- }
- if ($template) {
- if ( not($self->start) || not($self->end) ) {
- $self->throw('Could not attach feature to template $template because'.
- ' the feature location was not specified.');
- }
-
- # Need to attach to parent sequence and then add sequence feature
- my $template_seq;
- if ($template->isa('Bio::SeqFeature::Generic')) {
- $template_seq = $template->entire_seq;
- } elsif ($template->isa('Bio::SeqI')) {
- $template_seq = $template;
- } else {
- $self->throw("Expected a Bio::SeqFeature::Generic or Bio::SeqI object".
- " as template, but got '$template'.");
- }
- $self->attach_seq($template_seq);
- $template->add_SeqFeature($self);
-
- }
- return $self;
-}
-
-
-=head2 seq
-
- Title : seq()
- Usage : my $seq = $subseq->seq();
- Function: Get or set the sequence object of this SubSeq feature. If no sequence
- was provided, but the subseq is attached to a sequence, get the
- corresponding subsequence.
- Returns : A sequence object or undef
- Args : None.
-
-=cut
-
-sub seq {
- my ($self, $value) = @_;
- if (defined $value) {
- # The sequence is explicit
- if ( not(ref $value) || not $value->isa('Bio::PrimarySeqI') ) {
- $self->throw("Expected a sequence object but got a '".ref($value)."'\n");
- }
- $self->{seq} = $value;
- }
- my $seq = $self->{seq};
- if (not defined $seq) {
- # The sequence is implied
- $seq = $self->SUPER::seq;
- }
- return $seq;
-}
-
-
-=head2 length
-
- Title : seq()
- Usage : my $length = $subseq->seq();
- Function: Get the length of the SubSeq feature. It is similar to the length()
- method of L<Bio::Generic::SeqFeature>, which computes length based
- on the location of the feature. However, if the feature was not
- given a location, return the length of the subsequence if possible.
- Returns : integer or undef
- Args : None.
-
-=cut
-
-sub length {
- my ($self) = @_;
- # Try length from location first
- if ($self->start && $self->end) {
- return $self->SUPER::length();
- }
- # Then try length from subsequence
- my $seq = $self->seq;
- if (defined $seq) {
- return length $seq->seq;
- }
- # We failed
- return undef;
-}
-
-
-
-1;
diff --git a/lib/Bio/Tools/AmpliconSearch.pm b/lib/Bio/Tools/AmpliconSearch.pm
deleted file mode 100644
index 199cb5f..0000000
--- a/lib/Bio/Tools/AmpliconSearch.pm
+++ /dev/null
@@ -1,564 +0,0 @@
-# BioPerl module for Bio::Tools::AmpliconSearch
-#
-# Copyright Florent Angly
-#
-# You may distribute this module under the same terms as perl itself
-
-
-package Bio::Tools::AmpliconSearch;
-
-use strict;
-use warnings;
-use Bio::Tools::IUPAC;
-use Bio::SeqFeature::Amplicon;
-use Bio::Tools::SeqPattern;
-# we require Bio::SeqIO
-# and Bio::SeqFeature::Primer
-
-use base qw(Bio::Root::Root);
-
-my $template_str;
-
-
-=head1 NAME
-
-Bio::Tools::AmpliconSearch - Find amplicons in a template using degenerate PCR primers
-
-=head1 SYNOPSIS
-
- use Bio::PrimarySeq;
- use Bio::Tools::AmpliconSearch;
-
- my $template = Bio::PrimarySeq->new(
- -seq => 'aaaaaCCCCaaaaaaaaaaTTTTTTaaaaaCCACaaaaaTTTTTTaaaaaaaaaa',
- );
- my $fwd_primer = Bio::PrimarySeq->new(
- -seq => 'CCNC',
- );
- my $rev_primer = Bio::PrimarySeq->new(
- -seq => 'AAAAA',
- );
-
- my $search = Bio::Tools::AmpliconSearch->new(
- -template => $template,
- -fwd_primer => $fwd_primer,
- -rev_primer => $rev_primer,
- );
-
- while (my $amplicon = $search->next_amplicon) {
- print "Found amplicon at position ".$amplicon->start.'..'.$amplicon->end.":\n";
- print $amplicon->seq->seq."\n\n";
- }
-
- # Now change the template (but you could change the primers instead) and look
- # for amplicons again
-
- $template = Bio::PrimarySeq->new(
- -seq => 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa',
- );
- $search->template($template);
-
- while (my $amplicon = $search->next_amplicon) {
- print "Found amplicon at position ".$amplicon->start.'..'.$amplicon->end.":\n";
- print $amplicon->seq->seq."\n\n";
- }
-
-=head1 DESCRIPTION
-
-Perform an in silico PCR reaction, i.e. search for amplicons in a given template
-sequence using the specified degenerate primer.
-
-The template sequence is a sequence object, e.g. L<Bio::Seq>, and the primers
-can be a sequence or a L<Bio::SeqFeature::Primer> object and contain ambiguous
-residues as defined in the IUPAC conventions. The primer sequences are converted
-into regular expressions using L<Bio::Tools::IUPAC> and the matching regions of
-the template sequence, i.e. the amplicons, are returned as L<Bio::Seq::PrimedSeq>
-objects.
-
-AmpliconSearch will look for amplicons on both strands (forward and reverse-
-complement) of the specified template sequence. If the reverse primer is not
-provided, an amplicon will be returned and span a match of the forward primer to
-the end of the template. Similarly, when no forward primer is given, match from
-the beginning of the template sequence. When several amplicons overlap, only the
-shortest one to more accurately represent the biases of PCR. Future improvements
-may include modelling the effects of the number of PCR cycles or temperature on
-the PCR products.
-
-=head1 TODO
-
-Future improvements may include:
-
-=over
-
-=item *
-
-Allowing a small number of primer mismatches
-
-=item *
-
-Reporting all amplicons, including overlapping ones
-
-=item *
-
-Putting a limit on the length of amplicons, in accordance with the processivity
-of the polymerase used
-
-=back
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via the
-web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR
-
-Florent Angly <florent.angly at gmail.com>
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=head2 new
-
- Title : new
- Usage : my $search = Bio::Tools::AmpliconSearch->new( );
- Function : Initialize an amplicon search
- Args : -template Sequence object for the template sequence. This object
- will be converted to Bio::Seq if needed in since features
- (amplicons and primers) will be added to this object.
- -fwd_primer A sequence object representing the forward primer
- -rev_primer A sequence object representing the reverse primer
- -primer_file Read primers from a sequence file. It replaces
- -fwd_primer and -rev_primer (optional)
- -attach_primers Whether or not to attach primers to Amplicon objects. Default: 0 (off)
- Returns : A Bio::Tools::AmpliconSearch object
-
-=cut
-
-sub new {
- my ($class, @args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($template, $primer_file, $fwd_primer, $rev_primer, $attach_primers) =
- $self->_rearrange([qw(TEMPLATE PRIMER_FILE FWD_PRIMER REV_PRIMER ATTACH_PRIMERS)],
- @args);
-
- # Get primers
- if (defined $primer_file) {
- $self->primer_file($primer_file);
- } else {
- $self->fwd_primer($fwd_primer || '');
- $self->rev_primer($rev_primer || '');
- }
-
- # Get template sequence
- $self->template($template) if defined $template;
-
- $self->attach_primers($attach_primers) if defined $attach_primers;
-
- return $self;
-}
-
-
-=head2 template
-
- Title : template
- Usage : my $template = $search->template;
- Function : Get/set the template sequence. Setting a new template resets any
- search in progress.
- Args : Optional Bio::Seq object
- Returns : A Bio::Seq object
-
-=cut
-
-sub template {
- my ($self, $template) = @_;
- if (defined $template) {
- if ( not(ref $template) || not $template->isa('Bio::PrimarySeqI') ) {
- # Not a Bio::Seq or Bio::PrimarySeq
- $self->throw("Expected a sequence object as input but got a '".ref($template)."'\n");
- }
- if (not $template->isa('Bio::SeqI')) {
- # Convert sequence object to Bio::Seq Seq so that features can be added
- my $primary_seq = $template;
- $template = Bio::Seq->new();
- $template->primary_seq($primary_seq);
- }
- $self->{template} = $template;
- # Reset search in progress
- $template_str = undef;
- }
- return $self->{template};
-}
-
-
-=head2 fwd_primer
-
- Title : fwd_primer
- Usage : my $primer = $search->fwd_primer;
- Function : Get/set the forward primer. Setting a new forward primer resets any
- search in progress.
- Args : Optional sequence object or primer object or '' to match beginning
- of sequence.
- Returns : A sequence object or primer object or undef
-
-=cut
-
-sub fwd_primer {
- my ($self, $primer) = @_;
- if (defined $primer) {
- $self->_set_primer('fwd', $primer);
- }
- return $self->{fwd_primer};
-}
-
-
-=head2 rev_primer
-
- Title : rev_primer
- Usage : my $primer = $search->rev_primer;
- Function : Get/set the reverse primer. Setting a new reverse primer resets any
- search in progress.
- Args : Optional sequence object or primer object or '' to match end of
- sequence.
- Returns : A sequence object or primer object or undef
-
-=cut
-
-sub rev_primer {
- my ($self, $primer) = @_;
- if (defined $primer) {
- $self->_set_primer('rev', $primer);
- }
- return $self->{rev_primer};
-}
-
-
-sub _set_primer {
- # Save a primer (sequence object) and convert it to regexp. Type is 'fwd' for
- # the forward primer or 'rev' for the reverse primer.
- my ($self, $type, $primer) = @_;
- my $re;
- my $match_rna = 1;
- if ($primer eq '') {
- $re = $type eq 'fwd' ? '^' : '$';
- } else {
- if ( not(ref $primer) || (
- not($primer->isa('Bio::PrimarySeqI')) &&
- not($primer->isa('Bio::SeqFeature::Primer')) ) ) {
- $self->throw('Expected a sequence or primer object as input but got a '.ref($primer)."\n");
- }
- $self->{$type.'_primer'} = $primer;
- my $seq = $primer->isa('Bio::SeqFeature::Primer') ? $primer->seq : $primer;
- $re = Bio::Tools::IUPAC->new(
- -seq => $type eq 'fwd' ? $seq : $seq->revcom,
- )->regexp($match_rna);
- }
- $self->{$type.'_regexp'} = $re;
- # Reset search in progress
- $template_str = undef;
- $self->{regexp} = undef;
- return $self->{$type.'_primer'};
-}
-
-
-=head2 primer_file
-
- Title : primer_file
- Usage : my ($fwd, $rev) = $search->primer_file;
- Function : Get/set a sequence file to read the primer from. The first sequence
- must be the forward primer, and the second is the optional reverse
- primer. After reading the file, the primers are set using fwd_primer()
- and rev_primer() and returned.
- Args : Sequence file
- Returns : Array containing forward and reverse primers as sequence objects.
-
-=cut
-
-sub primer_file {
- my ($self, $primer_file) = @_;
- # Read primer file and convert primers into regular expressions to catch
- # amplicons present in the database
-
- if (not defined $primer_file) {
- $self->throw("Need to provide an input file\n");
- }
-
- # Mandatory first primer
- require Bio::SeqIO;
- my $in = Bio::SeqIO->new( -file => $primer_file );
- my $fwd_primer = $in->next_seq;
- if (not defined $fwd_primer) {
- $self->throw("The file '$primer_file' contains no primers\n");
- }
- $fwd_primer->alphabet('dna'); # Force the alphabet since degenerate primers can look like protein sequences
-
- # Optional reverse primers
- my $rev_primer = $in->next_seq;
- if (defined $rev_primer) {
- $rev_primer->alphabet('dna');
- } else {
- $rev_primer = '';
- }
-
- $in->close;
-
- $self->fwd_primer($fwd_primer);
- $self->rev_primer($rev_primer);
-
- return ($fwd_primer, $rev_primer);
-}
-
-
-=head2 attach_primers
-
- Title : attach_primers
- Usage : my $attached = $search->attach_primers;
- Function : Get/set whether or not to attach primer objects to the amplicon
- objects.
- Args : Optional integer (1 for yes, 0 for no)
- Returns : Integer (1 for yes, 0 for no)
-
-=cut
-
-sub attach_primers {
- my ($self, $attach) = @_;
- if (defined $attach) {
- $self->{attach_primers} = $attach;
- require Bio::SeqFeature::Primer;
- }
- return $self->{attach_primers} || 0;
-}
-
-
-=head2 next_amplicon
-
- Title : next_amplicon
- Usage : my $amplicon = $search->next_amplicon;
- Function : Get the next amplicon
- Args : None
- Returns : A Bio::SeqFeature::Amplicon object
-
-=cut
-
-sub next_amplicon {
- my ($self) = @_;
-
- # Initialize search
- if (not defined $template_str) {
- $self->_init;
- }
-
- my $re = $self->_regexp;
-
- my $amplicon;
- if ($template_str =~ m/$re/g) {
- my ($match, $rev_match) = ($1, $2);
- my $strand = $rev_match ? -1 : 1;
- $match = $match || $rev_match;
- my $end = pos($template_str);
- my $start = $end - length($match) + 1;
- $amplicon = $self->_attach_amplicon($start, $end, $strand);
- }
-
- # If no more matches. Make sure calls to next_amplicon() will return undef.
- if (not $amplicon) {
- $template_str = '';
- }
-
- return $amplicon;
-}
-
-
-sub _init {
- my ($self) = @_;
- # Sanity checks
- if ( not $self->template ) {
- $self->throw('Need to provide a template sequence');
- }
- if ( not($self->fwd_primer) && not($self->rev_primer) ) {
- $self->throw('Need to provide at least a primer');
- }
- # Set the template sequence string
- $template_str = $self->template->seq;
- # Set the regular expression to match amplicons
- $self->_regexp;
-
- return 1;
-}
-
-
-sub _regexp {
- # Get the regexp to match amplicon. If the regexp is not set, initialize it.
- my ($self, $regexp) = @_;
-
- if ( not defined $self->{regexp} ) {
- # Build regexp that matches amplicons on both strands and reports shortest
- # amplicon when there are several overlapping amplicons
-
- my $fwd_regexp = $self->_fwd_regexp;
- my $rev_regexp = $self->_rev_regexp;
-
- my ($fwd_regexp_rc, $basic_fwd_match, $rev_regexp_rc, $basic_rev_match);
- if ($fwd_regexp eq '^') {
- $fwd_regexp_rc = '';
- $basic_fwd_match = "(?:.*?$rev_regexp)";
- } else {
- $fwd_regexp_rc = Bio::Tools::SeqPattern->new(
- -seq => $fwd_regexp,
- -type => 'dna',
- )->revcom->str;
- $basic_fwd_match = "(?:$fwd_regexp.*?$rev_regexp)";
- }
-
- if ($rev_regexp eq '$') {
- $rev_regexp_rc = '';
- $basic_rev_match = "(?:.*?$fwd_regexp_rc)";
- } else {
- $rev_regexp_rc = Bio::Tools::SeqPattern->new(
- -seq => $rev_regexp,
- -type => 'dna',
- )->revcom->str;
- $basic_rev_match = "(?:$rev_regexp_rc.*?$fwd_regexp_rc)";
- }
-
- my $fwd_exclude = "(?!$basic_rev_match".
- ($fwd_regexp eq '^' ? '' : "|$fwd_regexp").
- ")";
-
- my $rev_exclude = "(?!$basic_fwd_match".
- ($rev_regexp eq '$' ? '' : "|$rev_regexp_rc").
- ')';
-
- $self->{regexp} = qr/
- ( $fwd_regexp (?:$fwd_exclude.)*? $rev_regexp ) |
- ( $rev_regexp_rc (?:$rev_exclude.)*? $fwd_regexp_rc )
- /xi;
- }
-
- return $self->{regexp};
-}
-
-
-=head2 annotate_template
-
- Title : annotate_template
- Usage : my $template = $search->annotate_template;
- Function : Search for all amplicons and attach them to the template.
- This is equivalent to running:
- while (my $amplicon = $self->next_amplicon) {
- # do something
- }
- my $annotated = $self->template;
- Args : None
- Returns : A Bio::Seq object with attached Bio::SeqFeature::Amplicons (and
- Bio::SeqFeature::Primers if you set -attach_primers to 1).
-
-=cut
-
-sub annotate_template {
- my ($self) = @_;
- # Search all amplicons and attach them to template
- 1 while $self->next_amplicon;
- # Return annotated template
- return $self->template;
-}
-
-
-sub _fwd_regexp {
- my ($self) = @_;
- return $self->{fwd_regexp};
-}
-
-
-sub _rev_regexp {
- my ($self) = @_;
- return $self->{rev_regexp};
-}
-
-
-sub _attach_amplicon {
- # Create an amplicon object and attach it to template
- my ($self, $start, $end, $strand) = @_;
-
- # Create Bio::SeqFeature::Amplicon feature and attach it to the template
- my $amplicon = Bio::SeqFeature::Amplicon->new(
- -start => $start,
- -end => $end,
- -strand => $strand,
- -template => $self->template,
- );
-
- # Create Bio::SeqFeature::Primer feature and attach them to the amplicon
- if ($self->attach_primers) {
- for my $type ('fwd', 'rev') {
- my ($pstart, $pend, $pstrand, $primer_seq);
-
- # Coordinates relative to amplicon
- if ($type eq 'fwd') {
- # Forward primer
- $primer_seq = $self->fwd_primer;
- next if not defined $primer_seq;
- $pstart = 1;
- $pend = $primer_seq->length;
- $pstrand = $amplicon->strand;
- } else {
- # Optional reverse primer
- $primer_seq = $self->rev_primer;
- next if not defined $primer_seq;
- $pstart = $end - $primer_seq->length + 1;
- $pend = $end;
- $pstrand = -1 * $amplicon->strand;
- }
-
- # Absolute coordinates needed
- $pstart += $start - 1;
- $pend += $start - 1;
-
- my $primer = Bio::SeqFeature::Primer->new(
- -start => $pstart,
- -end => $pend,
- -strand => $pstrand,
- -template => $amplicon,
- );
-
- # Attach primer to amplicon
- if ($type eq 'fwd') {
- $amplicon->fwd_primer($primer);
- } else {
- $amplicon->rev_primer($primer);
- }
-
- }
- }
-
- return $amplicon;
-}
-
-
-1;
diff --git a/lib/Bio/Tools/IUPAC.pm b/lib/Bio/Tools/IUPAC.pm
deleted file mode 100644
index 3934e52..0000000
--- a/lib/Bio/Tools/IUPAC.pm
+++ /dev/null
@@ -1,560 +0,0 @@
-#
-# BioPerl module for IUPAC
-#
-# Please direct questions and support issues to <bioperl-l at bioperl.org>
-#
-# Cared for by Aaron Mackey <amackey at virginia.edu>
-#
-# Copyright Aaron Mackey
-#
-# You may distribute this module under the same terms as perl itself
-
-# POD documentation - main docs before the code
-
-=head1 NAME
-
-Bio::Tools::IUPAC - Generates unique sequence objects or regular expressions from
-an ambiguous IUPAC sequence
-
-=head1 SYNOPSIS
-
- use Bio::PrimarySeq;
- use Bio::Tools::IUPAC;
-
- # Get the IUPAC code for proteins
- my %iupac_prot = Bio::Tools::IUPAC->new->iupac_iup;
-
- # Create a sequence with degenerate residues
- my $ambiseq = Bio::PrimarySeq->new(-seq => 'ARTCGUTGN', -alphabet => 'dna');
-
- # Create all possible non-degenerate sequences
- my $iupac = Bio::Tools::IUPAC->new(-seq => $ambiseq);
- while ($uniqueseq = $iupac->next_seq()) {
- # process the unique Bio::Seq object.
- }
-
- # Get a regular expression that matches all possible sequences
- my $regexp = $iupac->regexp();
-
-=head1 DESCRIPTION
-
-Bio::Tools::IUPAC is a tool that manipulates sequences with ambiguous residues
-following the IUPAC conventions. Non-standard characters have the meaning
-described below:
-
- IUPAC-IUB SYMBOLS FOR NUCLEOTIDE (DNA OR RNA) NOMENCLATURE:
- Cornish-Bowden (1985) Nucl. Acids Res. 13: 3021-3030
-
- ------------------------------------------
- Symbol Meaning Nucleic Acid
- ------------------------------------------
- A A Adenine
- C C Cytosine
- G G Guanine
- T T Thymine
- U U Uracil
- M A or C
- R A or G
- W A or T
- S C or G
- Y C or T
- K G or T
- V A or C or G
- H A or C or T
- D A or G or T
- B C or G or T
- X G or A or T or C
- N G or A or T or C
-
-
- IUPAC-IUP AMINO ACID SYMBOLS:
- Biochem J. 1984 Apr 15; 219(2): 345-373
- Eur J Biochem. 1993 Apr 1; 213(1): 2
-
- ------------------------------------------
- Symbol Meaning
- ------------------------------------------
- A Alanine
- B Aspartic Acid, Asparagine
- C Cysteine
- D Aspartic Acid
- E Glutamic Acid
- F Phenylalanine
- G Glycine
- H Histidine
- I Isoleucine
- J Isoleucine/Leucine
- K Lysine
- L Leucine
- M Methionine
- N Asparagine
- O Pyrrolysine
- P Proline
- Q Glutamine
- R Arginine
- S Serine
- T Threonine
- U Selenocysteine
- V Valine
- W Tryptophan
- X Unknown
- Y Tyrosine
- Z Glutamic Acid, Glutamine
- * Terminator
-
-There are a few things Bio::Tools::IUPAC can do for you:
-
-=over
-
-=item *
-
-report the IUPAC mapping between ambiguous and non-ambiguous residues
-
-=item *
-
-produce a stream of all possible corresponding unambiguous Bio::Seq objects given
-an ambiguous sequence object
-
-=item *
-
-convert an ambiguous sequence object to a corresponding regular expression
-
-=back
-
-=head1 FEEDBACK
-
-=head2 Mailing Lists
-
-User feedback is an integral part of the evolution of this and other
-Bioperl modules. Send your comments and suggestions preferably to one
-of the Bioperl mailing lists. Your participation is much appreciated.
-
- bioperl-l at bioperl.org - General discussion
- http://bioperl.org/wiki/Mailing_lists - About the mailing lists
-
-=head2 Support
-
-Please direct usage questions or support issues to the mailing list:
-
-I<bioperl-l at bioperl.org>
-
-rather than to the module maintainer directly. Many experienced and
-reponsive experts will be able look at the problem and quickly
-address it. Please include a thorough description of the problem
-with code and data examples if at all possible.
-
-=head2 Reporting Bugs
-
-Report bugs to the Bioperl bug tracking system to help us keep track
-the bugs and their resolution. Bug reports can be submitted via the
-web:
-
- https://redmine.open-bio.org/projects/bioperl/
-
-=head1 AUTHOR - Aaron Mackey
-
-Email amackey-at-virginia.edu
-
-=head1 APPENDIX
-
-The rest of the documentation details each of the object
-methods. Internal methods are usually preceded with a _
-
-=cut
-
-
-package Bio::Tools::IUPAC;
-
-use strict;
-use base qw(Bio::Root::Root);
-use vars qw(%IUB %IUB_AMB %REV_IUB %IUP %IUP_AMB $AUTOLOAD);
-
-BEGIN {
- # Ambiguous nucleic residues are matched to unambiguous residues
- %IUB = (
- A => [qw(A)],
- C => [qw(C)],
- G => [qw(G)],
- T => [qw(T)],
- U => [qw(U)],
- M => [qw(A C)],
- R => [qw(A G)],
- S => [qw(C G)],
- W => [qw(A T)],
- Y => [qw(C T)],
- K => [qw(G T)],
- V => [qw(A C G)],
- H => [qw(A C T)],
- D => [qw(A G T)],
- B => [qw(C G T)],
- N => [qw(A C G T)],
- X => [qw(A C G T)],
- );
-
- # Same as %IUB but ambiguous residues are matched to ambiguous residues only
- %IUB_AMB = (
- M => [qw(M)],
- R => [qw(R)],
- W => [qw(W)],
- S => [qw(S)],
- Y => [qw(Y)],
- K => [qw(K)],
- V => [qw(M R S V)],
- H => [qw(H M W Y)],
- D => [qw(D K R W)],
- B => [qw(B K S Y)],
- N => [qw(B D H K M N R S V W Y)],
- );
-
- # The inverse of %IUB
- %REV_IUB = (
- A => 'A',
- T => 'T',
- U => 'U',
- C => 'C',
- G => 'G',
- AC => 'M',
- AG => 'R',
- AT => 'W',
- CG => 'S',
- CT => 'Y',
- GT => 'K',
- ACG => 'V',
- ACT => 'H',
- AGT => 'D',
- CGT => 'B',
- ACGT => 'N',
- N => 'N'
- );
-
- # Same thing with proteins now
- %IUP = (
- A => [qw(A)],
- B => [qw(D N)],
- C => [qw(C)],
- D => [qw(D)],
- E => [qw(E)],
- F => [qw(F)],
- G => [qw(G)],
- H => [qw(H)],
- I => [qw(I)],
- J => [qw(I L)],
- K => [qw(K)],
- L => [qw(L)],
- M => [qw(M)],
- N => [qw(N)],
- O => [qw(O)],
- P => [qw(P)],
- Q => [qw(Q)],
- R => [qw(R)],
- S => [qw(S)],
- T => [qw(T)],
- U => [qw(U)],
- V => [qw(V)],
- W => [qw(W)],
- X => [qw(X)],
- Y => [qw(Y)],
- Z => [qw(E Q)],
- '*' => [qw(*)],
- );
-
- %IUP_AMB = (
- B => [qw(B)],
- J => [qw(J)],
- Z => [qw(Z)],
- );
-
-}
-
-
-=head2 new
-
- Title : new
- Usage : Bio::Tools::IUPAC->new($seq);
- Function: Create a new IUPAC object, which acts as a sequence stream (akin to
- SeqIO)
- Args : an ambiguously coded sequence object that has a specified 'alphabet'
- Returns : a Bio::Tools::IUPAC object.
-
-=cut
-
-sub new {
- my ($class,@args) = @_;
- my $self = $class->SUPER::new(@args);
- my ($seq) = $self->_rearrange([qw(SEQ)],@args);
-
- if ( (not defined $seq) && @args && ref($args[0]) ) {
- # parameter not passed as named parameter?
- $seq = $args[0];
- }
-
- if (defined $seq) {
- if (not $seq->isa('Bio::PrimarySeqI')) {
- $self->throw('Must supply a sequence object');
- }
- if (length $seq->seq == 0) {
- $self->throw('Sequence had zero-length');
- }
- $self->{'_seq'} = $seq;
- }
-
- return $self;
-}
-
-
-sub _initialize {
- my ($self) = @_;
- my %iupac = $self->iupac;
- $self->{'_alpha'} = [ map { $iupac{uc $_} } split('', $self->{'_seq'}->seq) ];
- $self->{'_string'} = [(0) x length($self->{'_seq'}->seq())];
- $self->{'_string'}->[0] = -1;
-}
-
-
-=head2 next_seq
-
- Title : next_seq
- Usage : $iupac->next_seq();
- Function: returns the next unique sequence object
- Args : none.
- Returns : a Bio::Seq object
-
-=cut
-
-sub next_seq {
- my ($self) = @_;
-
- if (not exists $self->{'_string'}) {
- $self->_initialize();
- }
-
- for my $i ( 0 .. $#{$self->{'_string'}} ) {
- next unless $self->{'_string'}->[$i] || @{$self->{'_alpha'}->[$i]} > 1;
- if ( $self->{'_string'}->[$i] == $#{$self->{'_alpha'}->[$i]} ) { # rollover
- if ( $i == $#{$self->{'_string'}} ) { # end of possibilities
- return;
- } else {
- $self->{'_string'}->[$i] = 0;
- next;
- }
- } else {
- $self->{'_string'}->[$i]++;
- my $j = -1;
- my $seqstr = join('', map { $j++; $self->{'_alpha'}->[$j]->[$_]; } @{$self->{'_string'}});
- my $desc = $self->{'_seq'}->desc() || '';
- $self->{'_num'}++;
- 1 while $self->{'_num'} =~ s/(\d)(\d\d\d)(?!\d)/$1,$2/;
- $desc =~ s/( \[Bio::Tools::IUPAC-generated\sunique sequence # [^\]]*\])|$/ \[Bio::Tools::IUPAC-generated unique sequence # $self->{'_num'}\]/;
- $self->{'_num'} =~ s/,//g;
-
- # Return a fresh sequence object
- return Bio::PrimarySeq->new(-seq => $seqstr, -desc => $desc);
- }
- }
-}
-
-
-=head2 iupac
-
- Title : iupac
- Usage : my %symbols = $iupac->iupac;
- Function: Returns a hash of symbols -> symbol components of the right type
- for the given sequence, i.e. it is the same as iupac_iup() if
- Bio::Tools::IUPAC was given a proteic sequence, or iupac_iub() if the
- sequence was nucleic. For example, the key 'M' has the value ['A', 'C'].
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac {
- my ($self) = @_;
- my $alphabet = lc( $self->{'_seq'}->alphabet() );
- if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
- return %IUB; # nucleic
- } elsif ( $alphabet eq 'protein' ) {
- return %IUP; # proteic
- } else {
- $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
- }
-}
-
-
-
-=head2 iupac_amb
-
- Title : iupac_amb
- Usage : my %symbols = $iupac->iupac_amb;
- Function: Same as iupac() but only contains a mapping between ambiguous residues
- and the ambiguous residues they map to. For example, the key 'N' has
- the value ['R', 'Y', 'K', 'M', 'S', 'W', 'B', 'D', 'H', 'V', 'N'],
- i.e. it matches all other ambiguous residues.
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac_amb {
- my ($self) = @_;
- my $alphabet = lc( $self->{'_seq'}->alphabet() );
- if ( ($alphabet eq 'dna') or ($alphabet eq 'rna') ) {
- return %IUB_AMB; # nucleic
- } elsif ( $alphabet eq 'protein' ) {
- return %IUP_AMB; # proteic
- } else {
- $self->throw("The input sequence had the unknown alphabet '$alphabet'\n");
- }
-}
-
-
-=head2 iupac_iup
-
- Title : iupac_iup
- Usage : my %aasymbols = $iupac->iupac_iup;
- Function: Returns a hash of PROTEIN symbols -> non-ambiguous symbol components
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac_iup {
- return %IUP;
-}
-
-
-=head2 iupac_iup_amb
-
- Title : iupac_iup_amb
- Usage : my %aasymbols = $iupac->iupac_iup_amb;
- Function: Returns a hash of PROTEIN symbols -> ambiguous symbol components
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac_iup_amb {
- return %IUP_AMB;
-}
-
-
-=head2 iupac_iub
-
- Title : iupac_iub
- Usage : my %dnasymbols = $iupac->iupac_iub;
- Function: Returns a hash of DNA symbols -> non-ambiguous symbol components
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac_iub {
- return %IUB;
-}
-
-
-=head2 iupac_iub_amb
-
- Title : iupac_iub_amb
- Usage : my %dnasymbols = $iupac->iupac_iub;
- Function: Returns a hash of DNA symbols -> ambiguous symbol components
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac_iub_amb {
- return %IUB_AMB;
-}
-
-
-=head2 iupac_rev_iub
-
- Title : iupac_rev_iub
- Usage : my %dnasymbols = $iupac->iupac_rev_iub;
- Function: Returns a hash of nucleotide combinations -> IUPAC code
- (a reverse of the iupac_iub hash).
- Args : none
- Returns : Hash
-
-=cut
-
-sub iupac_rev_iub {
- return %REV_IUB;
-}
-
-
-=head2 count
-
- Title : count
- Usage : my $total = $iupac->count();
- Function: Calculates the number of unique, unambiguous sequences that
- this ambiguous sequence could generate
- Args : none
- Return : int
-
-=cut
-
-sub count {
- my ($self) = @_;
- if (not exists $self->{'_string'}) {
- $self->_initialize();
- }
- my $count = 1;
- $count *= scalar(@$_) for (@{$self->{'_alpha'}});
- return $count;
-}
-
-
-=head2 regexp
-
- Title : regexp
- Usage : my $re = $iupac->regexp();
- Function: Converts the ambiguous sequence into a regular expression that
- matches all of the corresponding ambiguous and non-ambiguous sequences.
- You can further manipulate the resulting regular expression with the
- Bio::Tools::SeqPattern module. After you are done building your
- regular expression, you might want to compile it and make it case-
- insensitive:
- $re = qr/$re/i;
- Args : 1 to match RNA: T and U characters will match interchangeably
- Return : regular expression
-
-=cut
-
-sub regexp {
- my ($self, $match_rna) = @_;
- my $re;
- my $seq = $self->{'_seq'}->seq;
- my %iupac = $self->iupac;
- my %iupac_amb = $self->iupac_amb;
- for my $pos (0 .. length($seq)-1) {
- my $res = substr $seq, $pos, 1;
- my $iupacs = $iupac{$res};
- my $iupacs_amb = $iupac_amb{$res} || [];
- if (not defined $iupacs) {
- $self->throw("Primer sequence '$seq' is not a valid IUPAC sequence.".
- " Offending character was '$res'.\n");
- }
- my $part = join '', (@$iupacs, @$iupacs_amb);
- if ($match_rna) {
- $part =~ s/T/TU/i || $part =~ s/U/TU/i;
- }
- if (length $part > 1) {
- $part = '['.$part.']';
- }
- $re .= $part;
- }
- return $re;
-}
-
-
-sub AUTOLOAD {
- my $self = shift @_;
- my $method = $AUTOLOAD;
- $method =~ s/.*:://;
- return $self->{'_seq'}->$method(@_)
- unless $method eq 'DESTROY';
-}
-
-1;
-
diff --git a/lib/Grinder.pm b/lib/Grinder.pm
index 67db7b5..8e79e7d 100644
--- a/lib/Grinder.pm
+++ b/lib/Grinder.pm
@@ -17,7 +17,7 @@ use Bio::Tools::AmpliconSearch;
use Math::Random::MT qw(srand rand);
use Getopt::Euclid qw(:minimal_keys :defer);
-use version; our $VERSION = version->declare('0.5.3');
+use version; our $VERSION = version->declare('0.5.4');
#---------- GRINDER POD DOC ---------------------------------------------------#
@@ -33,7 +33,7 @@ libraries based on DNA, RNA or proteic reference sequences provided in a FASTA
file.
Grinder can produce genomic, metagenomic, transcriptomic, metatranscriptomic,
-proteomic, metaproteomic shotgun and amplicon datasets from current sequencing
+proteomic, metaproteomic shotgun and amplicon datasets from various sequencing
technologies such as Sanger, 454, Illumina. These simulated datasets can be used
to test the accuracy of bioinformatic tools under specific hypothesis, e.g. with
or without sequencing errors, or with low or high community diversity. Grinder
@@ -140,7 +140,7 @@ Available from L<http://dx.doi.org/10.1093/nar/gks251>.
=head1 VERSION
-0.5.3
+0.5.4
=head1 AUTHOR
@@ -176,13 +176,11 @@ for you:
=item *
-Bioperl modules (>=1.6.901).
-
-Note that some unreleased Bioperl modules have been included in Grinder.
+Bioperl modules (>=1.6.923)
=item *
-Getopt::Euclid (>= 0.3.4)
+Getopt::Euclid (>= 0.4.4)
=item *
@@ -192,7 +190,7 @@ First released with Perl v5.7.3
=item *
-Math::Random::MT (>= 1.13)
+Math::Random::MT (>= 1.16)
=item *
@@ -202,6 +200,55 @@ First released with Perl v5.9.0
=back
+=head2 Extra dependencies for Grinder development only
+
+
+Perl modules:
+
+=over
+
+=item *
+
+Module::Install
+
+=item *
+
+Module::Install::AuthorRequires
+
+=item *
+
+Module::Install::AutoLicense
+
+=item *
+
+Module::Install::PodFromEuclid
+
+=item *
+
+Module::Install::ReadmeFromPod (>= 0.14)
+
+=item *
+
+Module::Install::AutoManifest
+
+=item *
+
+Statistics::R (>= 0.32)
+
+=back
+
+The R interpreter (L<http://www.r-project.org>) and the following R library:
+
+=over
+
+=item *
+
+fitdistrplus
+
+=back
+
+When running R, install the library with this command: install.packages("fitdistrplus")
+
=head2 Procedure
To install Grinder globally on your system, run the following commands in a
@@ -231,6 +278,8 @@ will be installed in your home directory.
Then, install Grinder by following the instructions detailed in the "Procedure"
section.
+
+
=head1 RUNNING GRINDER
After installation, you can run Grinder using a command-line interface (CLI),
@@ -904,13 +953,15 @@ in FASTQ format.
=head1 API EXAMPLES
-The Grinder API allows to conveniently use Grinder within Perl scripts. Here is
-a synopsis:
+The Grinder API lets you conveniently use Grinder within Perl scripts. The same
+options as the CLI apply, but when passing multiple values to an option, you
+will need to pass them as an array (not a scalar or arrayref). Here is an example:
use Grinder;
- # Set up a new factory (see the OPTIONS section for a complete list of parameters)
- my $factory = Grinder->new( -reference_file => 'genomes.fna' );
+ # Set up a new factory
+ my $factory = Grinder->new( -reference_file => 'genomes.fna',
+ -read_dist => (100, 'uniform', 10) );
# Process all shotgun libraries requested
while ( my $struct = $factory->next_lib ) {
@@ -1299,15 +1350,15 @@ sub argparse {
"internal problem\n";
}
# Get parsed arguments from %ARGV and put them in $self
- for my $arg (keys %ARGV) {
+ while (my ($arg, $val) = each %ARGV) {
# Skip short argument names (they are also represented with long names)
next if length($arg) <= 2;
# Process long argument names. Copy their value into $self
- my $ref = ref($ARGV{$arg});
+ my $ref = ref($val);
if (not $ref) {
- $self->{$arg} = $ARGV{$arg};
+ $self->{$arg} = $val;
} elsif ($ref eq 'ARRAY') {
- @{$self->{$arg}} = @{$ARGV{$arg}};
+ @{$self->{$arg}} = @{$val};
} else {
die "Error: unsupported operation on argument '$arg' which is a reference".
"of type $ref\n";
@@ -1349,7 +1400,6 @@ sub process_profile_file {
sub initialize {
my ($self) = @_;
- # Returns:
# Parameter processing: read length distribution
if ( (not ref $self->{read_dist}) or (ref $self->{read_dist} eq 'SCALAR') ){
@@ -1563,6 +1613,7 @@ sub initialize_alphabet {
}
my $num_chars = scalar keys %alphabet_hash;
$self->{alphabet_hash} = \%alphabet_hash;
+ $self->{alphabet_arr} = [sort keys %alphabet_hash];
# CDF for this alphabet
$self->{alphabet_complete_cdf} = $self->proba_cumul([(1/$num_chars) x $num_chars]);
$self->{alphabet_truncated_cdf} = $self->proba_cumul([(1/($num_chars-1)) x ($num_chars-1)]);
@@ -1657,6 +1708,7 @@ sub community_structures {
# Shuffle the abundance-ranks of the most abundant genomes
($c_ids, $perc_permuted) = community_permuted($c_ids, $perc_permuted);
+
# Update values in $self object
$self->{overall_diversity} = $overall_diversity;
$self->{diversity} = $diversities;
@@ -1705,7 +1757,6 @@ sub community_calculate_diversities {
my $ab = $$c_struct{abs}[$i];
next if not $ab;
$richness++;
-
if (defined $all_ids{$id}) {
$all_ids{$id}++;
} else {
@@ -1716,11 +1767,12 @@ sub community_calculate_diversities {
}
$overall_diversity = scalar keys %all_ids;
-
# Calculate percent shared
my $nof_non_shared = 0;
- for my $id (keys %all_ids) {
- $nof_non_shared++ if $all_ids{$id} < $nof_libs;
+ while (my ($id, $nof_samples) = each %all_ids) {
+ if ($nof_samples < $nof_libs) {
+ $nof_non_shared++;
+ }
}
$perc_shared = ($overall_diversity - $nof_non_shared) * 100 / $overall_diversity;
@@ -1906,7 +1958,7 @@ sub community_shared {
}
# Add shared sequences
- my @ids = keys %$seq_ids;
+ my @ids = sort keys %$seq_ids;
my @shared_ids;
for (0 .. $nof_shared - 1) {
# Pick a random sequence
@@ -2025,7 +2077,7 @@ sub community_calculate_amplicon_abundance {
# of the species, sorted by decreasing abundance.
# Give amplicons from the same species the same sampling probability
- for (my $i = 0; $i < scalar @$r_spp_ids; $i++) {
+ for (my $i = 0; $i < scalar @$r_spp_ids; $i++) {
my $species_ab = $$r_spp_abs[$i];
my $species_id = $$r_spp_ids[$i];
my @amplicon_ids = keys %{$seq_ids->{$species_id}};
@@ -2492,19 +2544,19 @@ sub rand_kmer_from_collection {
sub rand_seq_with_kmer {
- # Pick a random sequence ID that contains the given kmer. An optional sequence
- # ID to exclude can be provided.
- my ($self, $kmer, $excl) = @_;
- my $source;
- my ($sources, $freqs) = $self->{chimera_kmer_col}->sources($kmer, $excl, 1);
-
- my $num_sources = scalar @$sources;
- if ($num_sources > 0) {
- my $cdf = $self->proba_cumul($freqs);
- $source = $$sources[rand_weighted($cdf)];
- }
+ # Pick a random sequence ID that contains the given kmer. An optional sequence
+ # ID to exclude can be provided.
+ my ($self, $kmer, $excl) = @_;
+ my $source;
+ my ($sources, $freqs) = $self->{chimera_kmer_col}->sources($kmer, $excl, 1);
+
+ my $num_sources = scalar @$sources;
+ if ($num_sources > 0) {
+ my $cdf = $self->proba_cumul($freqs);
+ $source = $$sources[rand_weighted($cdf)];
+ }
- return $source;
+ return $source;
}
@@ -2821,13 +2873,13 @@ sub rand_res {
my @res;
if (not defined $not_nuc) {
# Use complete alphabet
- @res = keys %{$self->{alphabet_hash}};
+ @res = @{$self->{alphabet_arr}};
$cdf = $self->{alphabet_complete_cdf};
} else {
# Remove non-desired residue from alphabet
my %res = %{$self->{alphabet_hash}};
delete $res{uc($not_nuc)};
- @res = keys %res;
+ @res = sort keys %res;
$cdf = $self->{alphabet_truncated_cdf};
}
my $res = $res[rand_weighted($cdf)];
@@ -3170,7 +3222,7 @@ sub database_get_children_seq {
# ID of the reference sequence
my ($self, $refseqid) = @_;
my @children;
- for my $child_oid ( keys %{$self->{database}->{ids}->{$refseqid}} ) {
+ while ( my ($child_oid, undef) = each %{$self->{database}->{ids}->{$refseqid}} ) {
push @children, $self->database_get_seq($child_oid);
}
return \@children;
@@ -3273,11 +3325,16 @@ sub lib_coverage {
$nof_seqs = int($nof_seqs + 1); # ceiling
}
}
+ # Make sure the last mate pair is always complete
+ if ( $self->{mate_length} && ($nof_seqs % 2)) {
+ $nof_seqs++;
+ if (not $coverage) {
+ warn "Warning: Added a read to make the last mate pair complete.\n"
+ }
+ }
$coverage = ($nof_seqs * $read_length) / $lib_length;
# 3/ Sanity check
-
# TODO: Warn only if diversity was explicitely specified on the command line
-
if ( $nof_seqs < $diversity) {
warn "Warning: The number of reads to produce is lower than the required ".
"diversity. Increase the coverage or number of reads to achieve this ".
diff --git a/lib/Grinder/Database.pm b/lib/Grinder/Database.pm
index 8a82166..9fd8afc 100644
--- a/lib/Grinder/Database.pm
+++ b/lib/Grinder/Database.pm
@@ -64,7 +64,7 @@ sub _init_db {
}
# Index input file
- my $db = Bio::DB::Fasta->new($fasta_file, -reindex => 1);
+ my $db = Bio::DB::Fasta->new($fasta_file, -reindex => 1, -clean => 1);
$self->_set_database($db);
# List sequences that are ok to use
@@ -198,6 +198,7 @@ sub _set_alphabet {
sub get_ids {
+ # Retrieve IDs from database, in no particular order
my ($self) = @_;
my @ids = keys %{$self->{'ids'}};
return \@ids;
diff --git a/man/average_genome_size.1 b/man/average_genome_size.1
index 04f59aa..1589bc2 100644
--- a/man/average_genome_size.1
+++ b/man/average_genome_size.1
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.26)
+.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
.\"
.\" Standard preamble:
.\" ========================================================================
@@ -38,6 +38,8 @@
. ds PI \(*p
. ds L" ``
. ds R" ''
+. ds C`
+. ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
@@ -48,17 +50,24 @@
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD. Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
-.ie \nF \{\
-. de IX
-. tm Index:\\$1\t\\n%\t"\\$2"
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
..
-. nr % 0
-. rr F
-.\}
-.el \{\
-. de IX
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{
+. if \nF \{
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
..
+. if !\nF==2 \{
+. nr % 0
+. nr F 2
+. \}
+. \}
.\}
+.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear. Run. Save yourself. No user-serviceable parts.
@@ -124,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "AVERAGE_GENOME_SIZE 1"
-.TH AVERAGE_GENOME_SIZE 1 "2013-03-20" "perl v5.14.2" "User Contributed Perl Documentation"
+.TH AVERAGE_GENOME_SIZE 1 "2014-01-07" "perl v5.22.1" "User Contributed Perl Documentation"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -152,8 +161,8 @@ it under the terms of the \s-1GNU\s0 General Public License (\s-1GPL\s0) as publ
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Grinder is distributed in the hope that it will be useful,
-but \s-1WITHOUT\s0 \s-1ANY\s0 \s-1WARRANTY\s0; without even the implied warranty of
-\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0. See the
+but \s-1WITHOUT ANY WARRANTY\s0; without even the implied warranty of
+\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS FOR A PARTICULAR PURPOSE. \s0 See the
\&\s-1GNU\s0 General Public License for more details.
You should have received a copy of the \s-1GNU\s0 General Public License
along with Grinder. If not, see <http://www.gnu.org/licenses/>.
diff --git a/man/change_paired_read_orientation.1 b/man/change_paired_read_orientation.1
index e906f77..7e1eee0 100644
--- a/man/change_paired_read_orientation.1
+++ b/man/change_paired_read_orientation.1
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.26)
+.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
.\"
.\" Standard preamble:
.\" ========================================================================
@@ -38,6 +38,8 @@
. ds PI \(*p
. ds L" ``
. ds R" ''
+. ds C`
+. ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
@@ -48,17 +50,24 @@
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD. Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
-.ie \nF \{\
-. de IX
-. tm Index:\\$1\t\\n%\t"\\$2"
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
..
-. nr % 0
-. rr F
-.\}
-.el \{\
-. de IX
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{
+. if \nF \{
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
..
+. if !\nF==2 \{
+. nr % 0
+. nr F 2
+. \}
+. \}
.\}
+.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear. Run. Save yourself. No user-serviceable parts.
@@ -124,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "CHANGE_PAIRED_READ_ORIENTATION 1"
-.TH CHANGE_PAIRED_READ_ORIENTATION 1 "2012-11-20" "perl v5.14.2" "User Contributed Perl Documentation"
+.TH CHANGE_PAIRED_READ_ORIENTATION 1 "2014-01-07" "perl v5.22.1" "User Contributed Perl Documentation"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -153,8 +162,8 @@ it under the terms of the \s-1GNU\s0 General Public License (\s-1GPL\s0) as publ
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Grinder is distributed in the hope that it will be useful,
-but \s-1WITHOUT\s0 \s-1ANY\s0 \s-1WARRANTY\s0; without even the implied warranty of
-\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0. See the
+but \s-1WITHOUT ANY WARRANTY\s0; without even the implied warranty of
+\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS FOR A PARTICULAR PURPOSE. \s0 See the
\&\s-1GNU\s0 General Public License for more details.
You should have received a copy of the \s-1GNU\s0 General Public License
along with Grinder. If not, see <http://www.gnu.org/licenses/>.
diff --git a/man/grinder.1 b/man/grinder.1
index 1de5c4f..e225981 100644
--- a/man/grinder.1
+++ b/man/grinder.1
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pod::Man 2.25 (Pod::Simple 3.26)
+.\" Automatically generated by Pod::Man 2.28 (Pod::Simple 3.29)
.\"
.\" Standard preamble:
.\" ========================================================================
@@ -38,6 +38,8 @@
. ds PI \(*p
. ds L" ``
. ds R" ''
+. ds C`
+. ds C'
'br\}
.\"
.\" Escape single quotes in literal strings from groff's Unicode transform.
@@ -48,17 +50,24 @@
.\" titles (.TH), headers (.SH), subsections (.SS), items (.Ip), and index
.\" entries marked with X<> in POD. Of course, you'll have to process the
.\" output yourself in some meaningful fashion.
-.ie \nF \{\
-. de IX
-. tm Index:\\$1\t\\n%\t"\\$2"
+.\"
+.\" Avoid warning from groff about undefined register 'F'.
+.de IX
..
-. nr % 0
-. rr F
-.\}
-.el \{\
-. de IX
+.nr rF 0
+.if \n(.g .if rF .nr rF 1
+.if (\n(rF:(\n(.g==0)) \{
+. if \nF \{
+. de IX
+. tm Index:\\$1\t\\n%\t"\\$2"
..
+. if !\nF==2 \{
+. nr % 0
+. nr F 2
+. \}
+. \}
.\}
+.rr rF
.\"
.\" Accent mark definitions (@(#)ms.acc 1.5 88/02/08 SMI; from UCB 4.2).
.\" Fear. Run. Save yourself. No user-serviceable parts.
@@ -124,7 +133,7 @@
.\" ========================================================================
.\"
.IX Title "GRINDER 1"
-.TH GRINDER 1 "2013-05-30" "perl v5.14.2" "User Contributed Perl Documentation"
+.TH GRINDER 1 "2016-01-18" "perl v5.22.1" "User Contributed Perl Documentation"
.\" For nroff, turn off justification. Always turn off hyphenation; it makes
.\" way too many mistakes in technical documents.
.if n .ad l
@@ -134,7 +143,7 @@ grinder \- A versatile omics shotgun and amplicon sequencing read simulator
.SH "DESCRIPTION"
.IX Header "DESCRIPTION"
Grinder is a versatile program to create random shotgun and amplicon sequence
-libraries based on \s-1DNA\s0, \s-1RNA\s0 or proteic reference sequences provided in a \s-1FASTA\s0
+libraries based on \s-1DNA, RNA\s0 or proteic reference sequences provided in a \s-1FASTA\s0
file.
.PP
Grinder can produce genomic, metagenomic, transcriptomic, metatranscriptomic,
@@ -169,7 +178,7 @@ modeling of the bias created by varying genome lengths or gene copy number
.IP "\(bu" 4
profile mechanism to store preferred options
.IP "\(bu" 4
-available to biologists or power users through multiple interfaces: \s-1GUI\s0, \s-1CLI\s0 and \s-1API\s0
+available to biologists or power users through multiple interfaces: \s-1GUI, CLI\s0 and \s-1API\s0
.PP
Briefly, given a \s-1FASTA\s0 file containing reference sequence (genomes, genes,
transcripts or proteins), Grinder performs the following steps:
@@ -194,7 +203,7 @@ of reads to take.
Alter reads by inserting sequencing errors (indels, substitutions and homopolymer
errors) following a position-specific model to simulate reads created by current
sequencing technologies (Sanger, 454, Illumina). Write the reads and their
-quality scores in \s-1FASTA\s0, \s-1QUAL\s0 and \s-1FASTQ\s0 files.
+quality scores in \s-1FASTA, QUAL\s0 and \s-1FASTQ\s0 files.
.SH "CITATION"
.IX Header "CITATION"
If you use Grinder in your research, please cite:
@@ -207,7 +216,7 @@ If you use Grinder in your research, please cite:
Available from <http://dx.doi.org/10.1093/nar/gks251>.
.SH "VERSION"
.IX Header "VERSION"
-This document refers to grinder version 0.5.2
+This document refers to grinder version 0.5.3
.SH "AUTHOR"
.IX Header "AUTHOR"
Florent Angly <florent.angly at gmail.com>
@@ -269,7 +278,7 @@ If you do not have administrator privileges, Grinder needs to be installed in
your home directory.
.PP
First, follow the instructions to install local::lib
-at http://search.cpan.org/~apeiron/local\-lib\-1.008004/lib/local/lib.pm#The_bootstrapping_technique <http://search.cpan.org/~apeiron/local-lib-1.008004/lib/local/lib.pm#The_bootstrapping_technique>. After local::lib is installed, every Perl
+at <http://search.cpan.org/~apeiron/local\-lib\-1.008004/lib/local/lib.pm#The_bootstrapping_technique>. After local::lib is installed, every Perl
module that you install manually or through the \s-1CPAN\s0 command-line application
will be installed in your home directory.
.PP
@@ -281,20 +290,20 @@ After installation, you can run Grinder using a command-line interface (\s-1CLI\
an application programming interface (\s-1API\s0) or a graphical user interface (\s-1GUI\s0)
in Galaxy.
.PP
-To get the usage of the \s-1CLI\s0, type:
+To get the usage of the \s-1CLI,\s0 type:
.PP
.Vb 1
\& grinder \-\-help
.Ve
.PP
-More information, including the documentation of the Grinder \s-1API\s0, which allows
+More information, including the documentation of the Grinder \s-1API,\s0 which allows
you to run Grinder from within other Perl programs, is available by typing:
.PP
.Vb 1
\& perldoc Grinder
.Ve
.PP
-To run the \s-1GUI\s0, refer to the Galaxy documentation at <http://wiki.g2.bx.psu.edu/FrontPage>.
+To run the \s-1GUI,\s0 refer to the Galaxy documentation at <http://wiki.g2.bx.psu.edu/FrontPage>.
.PP
The 'utils' folder included in the Grinder package contains some utilities:
.IP "average genome size:" 4
@@ -314,8 +323,8 @@ be appropriate to produce a 16S rRNA amplicon dataset. A set of over 41,000 \s-1
representative sequences and their affiliation in seven different taxonomic
sytems can also be used for the same purpose (<http://greengenes.lbl.gov/Download/OTUs/gg_otus_6oct2010/rep_set/gg_97_otus_6oct2010.fasta>
and <http://greengenes.lbl.gov/Download/OTUs/gg_otus_6oct2010/taxonomies/>). The
-\&\s-1RDP\s0 (<http://rdp.cme.msu.edu/download/release10_27_unaligned.fa.gz>) and Silva
-(http://www.arb\-silva.de/no_cache/download/archive/release_108/Exports/ <http://www.arb-silva.de/no_cache/download/archive/release_108/Exports/>)
+\&\s-1RDP \s0(<http://rdp.cme.msu.edu/download/release10_27_unaligned.fa.gz>) and Silva
+(<http://www.arb\-silva.de/no_cache/download/archive/release_108/Exports/>)
databases also provide many 16S rRNA sequences and Silva includes eukaryotic
sequences. While 16S rRNA is a popular gene, datasets containing any type of gene
could be used in the same fashion to generate simulated amplicon datasets, provided
@@ -587,7 +596,7 @@ Sanger reads. Note that this parameter has no effect unless you specify the
.IP "\-hd <homopolymer_dist> | \-homopolymer_dist <homopolymer_dist>" 4
.IX Item "-hd <homopolymer_dist> | -homopolymer_dist <homopolymer_dist>"
Introduce sequencing errors in the reads under the form of homopolymeric
-stretches (e.g. \s-1AAA\s0, \s-1CCCCC\s0) using a specified model where the homopolymer length
+stretches (e.g. \s-1AAA, CCCCC\s0) using a specified model where the homopolymer length
follows a normal distribution N(mean, standard deviation) that is function of
the homopolymer length n:
.Sp
@@ -619,7 +628,7 @@ are shared between sequences. <chimera_kmer> represents k, the length of the
k\-mers (in bp). The longer the kmer, the more similar the sequences have to be
to be eligible to form chimeras. The more frequent a k\-mer is in the pool of
reference sequences (taking into account their relative abundance), the more
-often this k\-mer will be chosen. For example, \s-1CHSIM\s0 (Edgar et al. 2011) uses this
+often this k\-mer will be chosen. For example, \s-1CHSIM \s0(Edgar et al. 2011) uses this
method with a k\-mer length of 10 bp. If you do not want to use k\-mer information
to form chimeras, use 0, which will result in the reference sequences and
breakpoints to be taken randomly on the \*(L"aligned\*(R" reference sequences. Note that
@@ -655,10 +664,10 @@ different \s-1MID\s0 tags with <multiplex_mids>. Default: 1
.IP "\-mi <multiplex_ids> | \-multiplex_ids <multiplex_ids>" 4
.IX Item "-mi <multiplex_ids> | -multiplex_ids <multiplex_ids>"
Specify an optional \s-1FASTA\s0 file that contains multiplex sequence identifiers
-(a.k.a MIDs or barcodes) to add to the sequences (one sequence per library). The
-MIDs are included in the length specified with the \-read_dist option and can be
-altered by sequencing errors. See the MIDesigner or BarCrawl programs to
-generate \s-1MID\s0 sequences.
+(a.k.a MIDs or barcodes) to add to the sequences (one sequence per library, in
+the order given). The MIDs are included in the length specified with the
+\&\-read_dist option and can be altered by sequencing errors. See the MIDesigner or
+BarCrawl programs to generate \s-1MID\s0 sequences.
.IP "\-di <diversity>... | \-diversity <diversity>..." 4
.IX Item "-di <diversity>... | -diversity <diversity>..."
This option specifies alpha diversity, specifically the richness, i.e. number of
@@ -675,7 +684,7 @@ libraries, specify the percent of reference sequences they should have in common
.IX Item "-pp <permuted_perc> | -permuted_perc <permuted_perc>"
This option controls another aspect of beta-diversity. For multiple libraries,
choose the percent of the most-abundant reference sequences to permute (randomly
-shuffle) the rank-abundance of. Default: 0 %
+shuffle) the rank-abundance of. Default: 100 %
.IP "\-rs <random_seed> | \-random_seed <random_seed>" 4
.IX Item "-rs <random_seed> | -random_seed <random_seed>"
Seed number to use for the pseudo-random number generator.
@@ -845,15 +854,15 @@ Usage : my \f(CW$seed\fR = \f(CW$factory\fR\->get_random_seed;
Returns : seed number
.SH "COPYRIGHT"
.IX Header "COPYRIGHT"
-Copyright 2009\-2012 Florent \s-1ANGLY\s0 <florent.angly at gmail.com>
+Copyright 2009\-2013 Florent \s-1ANGLY\s0 <florent.angly at gmail.com>
.PP
Grinder is free software: you can redistribute it and/or modify
it under the terms of the \s-1GNU\s0 General Public License (\s-1GPL\s0) as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
Grinder is distributed in the hope that it will be useful,
-but \s-1WITHOUT\s0 \s-1ANY\s0 \s-1WARRANTY\s0; without even the implied warranty of
-\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS\s0 \s-1FOR\s0 A \s-1PARTICULAR\s0 \s-1PURPOSE\s0. See the
+but \s-1WITHOUT ANY WARRANTY\s0; without even the implied warranty of
+\&\s-1MERCHANTABILITY\s0 or \s-1FITNESS FOR A PARTICULAR PURPOSE. \s0 See the
\&\s-1GNU\s0 General Public License for more details.
You should have received a copy of the \s-1GNU\s0 General Public License
along with Grinder. If not, see <http://www.gnu.org/licenses/>.
diff --git a/t/01-shotgun.t b/t/01-shotgun.t
index b7eae16..f9e9bed 100644
--- a/t/01-shotgun.t
+++ b/t/01-shotgun.t
@@ -19,7 +19,7 @@ ok $factory = Grinder->new(
ok $factory->next_read;
-# Long argument
+# Total reads (long argument)
ok $factory = Grinder->new(
-reference_file => data('shotgun_database_extended.fa'),
@@ -36,6 +36,24 @@ while ( $read = $factory->next_read ) {
};
is $nof_reads, 100;
+
+# Coverage fold
+
+ok $factory = Grinder->new(
+ -reference_file => data('shotgun_database_extended.fa'),
+ -read_dist => 48 ,
+ -coverage_fold => 6.04 ,
+), 'Coverage fold';
+
+ok $factory->next_lib;
+
+$nof_reads = 0;
+while ( $read = $factory->next_read ) {
+ $nof_reads++;
+};
+is $nof_reads, 111;
+
+
done_testing();
diff --git a/t/02-mates.t b/t/02-mates.t
index f7b24ff..c402489 100644
--- a/t/02-mates.t
+++ b/t/02-mates.t
@@ -3,6 +3,7 @@
use strict;
use warnings;
use Test::More;
+use Test::Warn;
use t::TestUtils;
use Grinder;
@@ -12,19 +13,38 @@ my ($factory, $read, $nof_reads);
ok $factory = Grinder->new(
-reference_file => data('shotgun_database_extended.fa'),
- -total_reads => 100 ,
+ -total_reads => 101 ,
-read_dist => 48 ,
-insert_dist => 250 ,
), 'Mate pairs';
+warning_like { $factory->next_lib } qr{.*added a read.*}i;
+
+$nof_reads = 0;
+while ( $read = $factory->next_read ) {
+ $nof_reads++;
+ ok_mate($read, undef, $nof_reads);
+};
+is $nof_reads, 102;
+
+
+
+# Coverage fold
+
+ok $factory = Grinder->new(
+ -reference_file => data('shotgun_database_extended.fa'),
+ -read_dist => 48 ,
+ -coverage_fold => 6.04 ,
+ -insert_dist => 250 ,
+), 'Coverage fold';
ok $factory->next_lib;
$nof_reads = 0;
while ( $read = $factory->next_read ) {
$nof_reads++;
- ok_mate($read, undef, $nof_reads);
};
-is $nof_reads, 100;
+is $nof_reads, 112;
+
done_testing();
diff --git a/t/05-forbidden.t b/t/05-forbidden.t
index 4eff1a4..5ccf188 100644
--- a/t/05-forbidden.t
+++ b/t/05-forbidden.t
@@ -28,7 +28,7 @@ ok $factory = Grinder->new(
-reference_file => data('dirty_database.fa'),
-exclude_chars => 'n-' , # case independent
-read_dist => 30 ,
- -random_seed => 1233567890 ,
+ -random_seed => 1233756782 ,
-total_reads => 10 ,
), 'Exclude chars';
diff --git a/t/15-multiplex.t b/t/15-multiplex.t
index 57f878a..9d9e2b4 100644
--- a/t/15-multiplex.t
+++ b/t/15-multiplex.t
@@ -3,6 +3,7 @@
use strict;
use warnings;
use Test::More;
+use Test::Warn;
use t::TestUtils;
use Grinder;
@@ -13,13 +14,14 @@ my ($factory, $nof_reads, $read);
# Prepend a single multiplex identifier (MID), ACGT, to shotgun reads
-ok $factory = Grinder->new(
+warning_like { $factory = Grinder->new(
-reference_file => data('shotgun_database.fa'),
-multiplex_ids => data('mids.fa') ,
-num_libraries => 1 ,
-read_dist => 52 ,
-total_reads => 9 ,
-), 'Single MID - shotgun';
+) } qr{.*Ignoring extraneous MIDs.*}i,
+'Single MID - shotgun';
while ( $read = $factory->next_read ) {
is $read->length, 52;
@@ -54,7 +56,7 @@ while ( $read = $factory->next_read ) {
# Prepend a single multiplex identifier to amplicon reads
-ok $factory = Grinder->new(
+warning_like { $factory = Grinder->new(
-reference_file => data('single_amplicon_database.fa'),
-multiplex_ids => data('mids.fa') ,
-num_libraries => 1 ,
@@ -62,7 +64,7 @@ ok $factory = Grinder->new(
-total_reads => 10 ,
-forward_reverse => data('forward_reverse_primers.fa') ,
-unidirectional => 1 ,
-), 'Single MID - amplicon';
+) } qr{.*Ignoring extraneous MIDs.*}i, 'Single MID - amplicon';
while ( $read = $factory->next_read ) {
is $read->seq, 'ACGTAAACTUAAAGGAATTGACGGaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaGTACACACCGC';
@@ -71,7 +73,7 @@ while ( $read = $factory->next_read ) {
# Request too long of a read
-ok $factory = Grinder->new(
+warning_like { $factory = Grinder->new(
-reference_file => data('single_amplicon_database.fa'),
-multiplex_ids => data('mids.fa') ,
-num_libraries => 1 ,
@@ -79,7 +81,7 @@ ok $factory = Grinder->new(
-total_reads => 10 ,
-forward_reverse => data('forward_reverse_primers.fa') ,
-unidirectional => 1 ,
-), 'Single MID - amplicon too long';
+) } qr{.*Ignoring extraneous MIDs.*}i, 'Single MID - amplicon too long';
while ( $read = $factory->next_read ) {
is $read->seq, 'ACGTAAACTUAAAGGAATTGACGGaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaGTACACACCGCCCGT';
diff --git a/t/17-libraries.t b/t/17-libraries.t
index d3ac296..c014abe 100644
--- a/t/17-libraries.t
+++ b/t/17-libraries.t
@@ -17,7 +17,7 @@ ok $factory = Grinder->new(
-reference_file => data('shotgun_database.fa'),
-read_dist => 48 ,
-num_libraries => 4 ,
- -total_reads => 99 ,
+ -total_reads => 100 ,
), 'Multiple shotgun libraries';
$nof_libs = 0;
@@ -28,7 +28,7 @@ while ( $lib = $factory->next_lib ) {
$nof_reads++;
ok_read($read, undef, $nof_reads, $nof_libs);
};
- is $nof_reads, 99;
+ is $nof_reads, 100;
}
is $nof_libs, 4;
@@ -37,7 +37,7 @@ is $nof_libs, 4;
ok $factory = Grinder->new(
-reference_file => data('shotgun_database.fa'),
- -total_reads => 99 ,
+ -total_reads => 100 ,
-read_dist => 48 ,
-num_libraries => 4 ,
-insert_dist => 250 ,
@@ -51,7 +51,7 @@ while ( $lib = $factory->next_lib ) {
$nof_reads++;
ok_mate($read, undef, $nof_reads, $nof_libs);
};
- is $nof_reads, 99;
+ is $nof_reads, 100;
}
is $nof_libs, 4;
diff --git a/t/20-community-structure.t b/t/20-community-structure.t
index 1e71428..2e28753 100644
--- a/t/20-community-structure.t
+++ b/t/20-community-structure.t
@@ -23,6 +23,7 @@ ok $factory = Grinder->new(
-length_bias => 0 ,
-abundance_model => ('uniform', 0) ,
-total_reads => 1000 ,
+ -random_seed => 1234567890 ,
), 'Uniform community structure';
while ( $read = $factory->next_read ) {
@@ -46,6 +47,7 @@ ok $factory = Grinder->new(
-length_bias => 0 ,
-abundance_model => ('linear', 0) ,
-total_reads => 1000 ,
+ -random_seed => 1234567890 ,
), 'Linear community structure';
while ( $read = $factory->next_read ) {
@@ -69,6 +71,7 @@ ok $factory = Grinder->new(
-length_bias => 0 ,
-abundance_model => ('powerlaw', 0.5) ,
-total_reads => 1000 ,
+ -random_seed => 1234567890 ,
), 'Power law community structure';
while ( $read = $factory->next_read ) {
@@ -92,6 +95,7 @@ ok $factory = Grinder->new(
-length_bias => 0 ,
-abundance_model => ('logarithmic', 0.5) ,
-total_reads => 1000 ,
+ -random_seed => 1234567890 ,
), 'Logarithmic community structure';
while ( $read = $factory->next_read ) {
@@ -115,6 +119,7 @@ ok $factory = Grinder->new(
-length_bias => 0 ,
-abundance_model => ('exponential', 0.5) ,
-total_reads => 1000 ,
+ -random_seed => 1234567890 ,
), 'Exponential community structure';
$struct = $factory->next_lib;
@@ -141,6 +146,7 @@ ok $factory = Grinder->new(
-shared_perc => 100 ,
-abundance_model => ('exponential') ,
-total_reads => 1000 ,
+ -random_seed => 1234567890 ,
), 'Communities with random structure parameter value';
$struct = $factory->next_lib;
diff --git a/t/21-errors.t b/t/21-errors.t
index 167c333..45f4b1b 100644
--- a/t/21-errors.t
+++ b/t/21-errors.t
@@ -106,6 +106,7 @@ ok $factory = Grinder->new(
-total_reads => 1000 ,
-mutation_ratio => (50, 50) ,
-mutation_dist => ('uniform', 10) ,
+ -random_seed => 1233567880 ,
), 'Uniform (frequent errors)';
while ( $read = $factory->next_read ) {
@@ -136,7 +137,8 @@ ok $factory = Grinder->new(
-read_dist => 50 ,
-total_reads => 10000 ,
-mutation_ratio => (50, 50) ,
- -mutation_dist => ('uniform', 0.1) ,
+ -mutation_dist => ('uniform', 0.1) ,
+ -random_seed => 1233567880 ,
), 'Uniform (rare errors)';
while ( $read = $factory->next_read ) {
@@ -146,10 +148,10 @@ while ( $read = $factory->next_read ) {
$prof = hist(\@epositions, 1, 50);
($min, $max, $mean, $stddev) = stats($prof);
-between_ok( $$prof[0] , 7, 13 ); # exp. number of errors at 1st pos is 100 (10%)
-between_ok( $$prof[24], 7, 13 ); # exp. number of errors at 25th pos is 100 (10%)
-between_ok( $$prof[-1], 7, 13 ); # exp. number of errors at last pos is 100 (10%)
-between_ok( $mean , 9, 11 ); # exp. mean number is 100 (10%)
+between_ok( $$prof[0] , 4, 16 ); # exp. number of errors at 1st pos is 100 (10%)
+between_ok( $$prof[24], 4, 16 ); # exp. number of errors at 25th pos is 100 (10%)
+between_ok( $$prof[-1], 4, 16 ); # exp. number of errors at last pos is 100 (10%)
+between_ok( $mean , 8, 12 ); # exp. mean number is 100 (10%)
SKIP: {
skip rfit_msg() if not can_rfit();
@@ -168,6 +170,7 @@ ok $factory = Grinder->new(
-total_reads => 1000 ,
-mutation_ratio => (50, 50) ,
-mutation_dist => ('linear', 5, 15) ,
+ -random_seed => 1233567880 ,
), 'Linear';
while ( $read = $factory->next_read ) {
@@ -178,18 +181,18 @@ while ( $read = $factory->next_read ) {
$prof = hist(\@epositions, 1, 50);
($min, $max, $mean, $stddev) = stats($prof);
between_ok( $$prof[0] , 30, 70 ); # exp. number of errors at 1st pos is 50 (5%)
-between_ok( $$prof[24], 70, 130 ); # exp. number of errors at 25th pos is 100 (10%)
-between_ok( $$prof[-1], 120, 180 ); # exp. number of errors at last pos is 150 (15%)
+between_ok( $$prof[24], 65, 135 ); # exp. number of errors at 25th pos is 100 (10%)
+between_ok( $$prof[-1], 115, 185 ); # exp. number of errors at last pos is 150 (15%)
between_ok( $mean , 97, 103 ); # exp. mean number of errors is 100
-SKIP: {
- skip rfit_msg() if not can_rfit();
+#SKIP: {
+ #skip rfit_msg() if not can_rfit();
#### TODO
#TODO: {
# $TODO = "Need to implement a linear density distribution in R";
# test_linear_dist(\@epositions, 1, 50, 0.0000000001);
#}
-}
+#}
@epositions = ();
@@ -203,6 +206,7 @@ ok $factory = Grinder->new(
-total_reads => 1000 ,
-mutation_ratio => (50, 50) ,
-mutation_dist => ('poly4', 1, 4.4e-7) ,
+ -random_seed => 1233567880 ,
), 'Polynomial';
while ( $read = $factory->next_read ) {
@@ -214,17 +218,17 @@ $prof = hist(\@epositions, 1, 100);
($min, $max, $mean, $stddev) = stats($prof);
between_ok( $$prof[0] , 1, 27 ); # exp. number of errors at 1st is 10 (1%)
between_ok( $$prof[49], 7, 67 ); # exp. number of errors at 50th is 37.4 (3.74%)
-between_ok( $$prof[-1], 410, 488 ); # exp. number of errors at last is 449 (44.9%)
+between_ok( $$prof[-1], 405, 492 ); # exp. number of errors at last is 449 (44.9%)
between_ok( $mean , 97, 103 ); # exp. mean number of errors is 100 (10.02%)
-SKIP: {
- skip rfit_msg() if not can_rfit();
+#SKIP: {
+ #skip rfit_msg() if not can_rfit();
#### TODO
#TODO: {
# $TODO = "Need to implement a polynomial distribution in R";
# test_polynomial_dist(\@epositions, 1, 50, 0.0000000001);
#}
-}
+#}
@epositions = ();
diff --git a/t/29-kmer-collection.t b/t/29-kmer-collection.t
index 51f6ef5..463086a 100644
--- a/t/29-kmer-collection.t
+++ b/t/29-kmer-collection.t
@@ -69,7 +69,7 @@ ok $col = Grinder::KmerCollection->new( -k => 8, -seqs => [$seq1, $seq2] );
# Count of all kmers
($kmers, $counts) = $col->counts();
-$kmers = [sort @$kmers];
+$kmers = [sort {$a cmp $b} @$kmers];
$counts = [sort {$a <=> $b} @$counts];
is_deeply $kmers , [
'AAAAAAAA',
@@ -140,7 +140,7 @@ is_deeply $counts, [
# Frequency of kmers from position >= 40
($kmers, $freqs) = $col->counts(undef, 40, 1);
-$kmers = [sort @$kmers];
+$kmers = [sort {$a cmp $b} @$kmers];
$freqs = [sort {$a <=> $b} @$freqs];
is_deeply $kmers , [
'AAAAAAAA',
@@ -190,7 +190,7 @@ is_deeply $kmers, [ 'AAAAAAAA' ];
is_deeply $freqs, [ 1 ];
($kmers, $freqs) = $col->counts('seq4', 40, 1);
-$kmers = [sort @$kmers];
+$kmers = [sort {$a cmp $b} @$kmers];
$freqs = [sort {$a <=> $b} @$freqs];
is_deeply $kmers , [
'AACCCCCC',
@@ -255,12 +255,13 @@ ok not exists $by_kmer->{'seq4'}->{'CCCCGGGG'};
ok not exists $by_kmer->{'seq4'}->{'ACCCCCCC'};
($sources, $counts) = $col->sources('AAAAAAAA');
-is_deeply $sources, ['seq4', 'seq1'];
-is_deeply $counts , [ 1 , 73 ];
+my %values = ('seq1' => 73,
+ 'seq4' => 1);
+is $values{$sources->[0]}, $counts->[0];
+is $values{$sources->[1]}, $counts->[1];
($sources, $counts) = $col->sources('AAAAAAAA', 'seq1');
-is_deeply $sources, ['seq4'];
-is_deeply $counts , [ 1 ];
+is $values{$sources->[0]}, $counts->[0];
($sources, $counts) = $col->sources('ZZZZZZZZ');
is_deeply $sources, [];
@@ -308,12 +309,13 @@ ok exists $by_kmer->{'abc'}->{'AAAAAAAA'};
ok exists $by_kmer->{'123'}->{'AAAAAAAA'};
($sources, $counts) = $col->sources('AAAAAAAA');
-is_deeply $sources, ['123', 'abc'];
-is_deeply $counts , [ 1 , 73 ];
+%values = ('123' => 1,
+ 'abc' => 73);
+is $values{$sources->[0]}, $counts->[0];
+is $values{$sources->[1]}, $counts->[1];
($sources, $counts) = $col->sources('AAAAAAAA', 'abc');
-is_deeply $sources, ['123'];
-is_deeply $counts , [ 1 ];
+is $values{$sources->[0]}, $counts->[0];
# Using weights
@@ -326,8 +328,10 @@ $weights = { 'seq1' => 10, 'seq4' => 0.1 };
ok $col->weights($weights);
($sources, $counts) = $col->sources('AAAAAAAA');
-is_deeply $sources, ['seq4', 'seq1'];
-is_deeply $counts , [ 0.1 , 730 ];
+%values = ('seq1' => 730,
+ 'seq4' => 0.1);
+is $values{$sources->[0]}, $counts->[0];
+is $values{$sources->[1]}, $counts->[1];
($kmers, $counts) = $col->counts();
is_deeply $kmers , ['AAAAAAAA'];
diff --git a/t/32-database.t b/t/32-database.t
index 5cce163..6be4a9b 100644
--- a/t/32-database.t
+++ b/t/32-database.t
@@ -18,6 +18,7 @@ isa_ok $db, 'Grinder::Database';
is $db->get_minimum_length, 1;
is $db->get_delete_chars, '';
is_deeply [sort @{$db->get_ids}], ['seq1', 'seq2', 'seq3', 'seq4', 'seq5'];
+$db->get_database->DESTROY;
ok $db = Grinder::Database->new(
@@ -25,7 +26,8 @@ ok $db = Grinder::Database->new(
-minimum_length => 200,
);
is $db->get_minimum_length, 200;
-is_deeply $db->get_ids, ['seq1', 'seq2'];
+is_deeply [sort @{$db->get_ids}], ['seq1', 'seq2'];
+$db->get_database->DESTROY;
ok $db = Grinder::Database->new(
@@ -60,32 +62,37 @@ is $seq->seq, 'attttttttt';
# Test alphabet
is $db->get_alphabet, 'dna';
+$db->get_database->DESTROY;
ok $db = Grinder::Database->new(
-fasta_file => data('database_dna.fa'),
);
is $db->get_alphabet, 'dna';
+$db->get_database->DESTROY;
ok $db = Grinder::Database->new(
-fasta_file => data('database_rna.fa'),
);
is $db->get_alphabet, 'rna';
+$db->get_database->DESTROY;
ok $db = Grinder::Database->new(
-fasta_file => data('database_protein.fa'),
-unidirectional => 1,
);
is $db->get_alphabet, 'protein';
+$db->get_database->DESTROY;
ok $db = Grinder::Database->new(
-fasta_file => data('database_mixed.fa'),
-unidirectional => 1,
);
is $db->get_alphabet, 'protein';
+$db->get_database->DESTROY;
####ok $db = Grinder::Database->new(
-#### -fasta_file => data('shotgun_database.fa'),
+#### -fasta_file => data('shotgun_database.fa'),
#### -unidirectional => -1,
####);
####is $db->get_unidirectional, -1;
@@ -97,7 +104,7 @@ is $db->get_alphabet, 'protein';
#### -forward_reverse_primers =>
#### -abundance_file =>
#### -delete_chars =>
-#### -min_len => 1;
+#### -min_len => 1
####);
diff --git a/t/TestUtils.pm b/t/TestUtils.pm
index 73c5905..a8b8c71 100644
--- a/t/TestUtils.pm
+++ b/t/TestUtils.pm
@@ -207,10 +207,7 @@ sub can_rfit {
};
if ($@) {
$can_rfit = 0;
- my $msg = "Note: The Statistics::R module for Perl, R (R-Project) ".
- "or the fitdistrplus module for R could not be found on this system.".
- " Some tests will be skipped...\n";
- warn $msg;
+ warn "Skip: ".rfit_msg()."\n";
} else {
$can_rfit = 1;
}
@@ -220,7 +217,7 @@ sub can_rfit {
sub rfit_msg {
- return "fitdistrplus not available...";
+ return "Statistics::R, R or fitdistrplus not found";
}
diff --git a/t/data/database_dna.fa.index b/t/data/database_dna.fa.index
deleted file mode 100644
index ae5ca76..0000000
Binary files a/t/data/database_dna.fa.index and /dev/null differ
diff --git a/t/data/database_mixed.fa.index b/t/data/database_mixed.fa.index
deleted file mode 100644
index 5af472b..0000000
Binary files a/t/data/database_mixed.fa.index and /dev/null differ
diff --git a/t/data/database_protein.fa.index b/t/data/database_protein.fa.index
deleted file mode 100644
index 839e27d..0000000
Binary files a/t/data/database_protein.fa.index and /dev/null differ
diff --git a/t/data/database_rna.fa.index b/t/data/database_rna.fa.index
deleted file mode 100644
index 58676fe..0000000
Binary files a/t/data/database_rna.fa.index and /dev/null differ
diff --git a/t/data/shotgun_database.fa.index b/t/data/shotgun_database.fa.index
deleted file mode 100644
index 048d6fd..0000000
Binary files a/t/data/shotgun_database.fa.index and /dev/null differ
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/grinder.git
More information about the debian-med-commit
mailing list