[med-svn] [roary] 01/03: New upstream version 3.9.1+dfsg
Sascha Steinbiss
satta at debian.org
Tue Aug 22 14:03:08 UTC 2017
This is an automated email from the git hooks/post-receive script.
satta pushed a commit to branch master
in repository roary.
commit c6fdb3ca02cf16028191b815709c50e118f94643
Author: Sascha Steinbiss <satta at debian.org>
Date: Tue Aug 22 15:54:11 2017 +0200
New upstream version 3.9.1+dfsg
---
dist.ini | 2 +-
lib/Bio/Roary.pm | 2 +
lib/Bio/Roary/CommandLine/Roary.pm | 10 +-
lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm | 9 +-
lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm | 7 +-
.../Roary/External/GeneAlignmentFromNucleotides.pm | 2 +
lib/Bio/Roary/External/PostAnalysis.pm | 5 +
lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm | 227 ++++++++++-----------
lib/Bio/Roary/MergeMultifastaAlignments.pm | 2 +-
t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t | 67 ++++--
10 files changed, 194 insertions(+), 139 deletions(-)
diff --git a/dist.ini b/dist.ini
index 39fc6d2..4d8b414 100644
--- a/dist.ini
+++ b/dist.ini
@@ -1,5 +1,5 @@
name = Bio-Roary
-version = 3.9.0
+version = 3.9.1
author = Andrew J. Page <ap13 at sanger.ac.uk>
license = GPL_3
copyright_holder = Wellcome Trust Sanger Institute
diff --git a/lib/Bio/Roary.pm b/lib/Bio/Roary.pm
index feccc74..9ccd79a 100644
--- a/lib/Bio/Roary.pm
+++ b/lib/Bio/Roary.pm
@@ -48,6 +48,7 @@ has 'core_definition' => ( is => 'rw', isa => 'Num', default =
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'inflation_value' => ( is => 'rw', isa => 'Num', default => 1.5 );
+has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
@@ -136,6 +137,7 @@ sub run {
core_definition => $self->core_definition,
verbose => $self->verbose,
mafft => $self->mafft,
+ allow_paralogs => $self->allow_paralogs,
);
$post_analysis->run();
diff --git a/lib/Bio/Roary/CommandLine/Roary.pm b/lib/Bio/Roary/CommandLine/Roary.pm
index 8c8b0c8..3052b6c 100644
--- a/lib/Bio/Roary/CommandLine/Roary.pm
+++ b/lib/Bio/Roary/CommandLine/Roary.pm
@@ -47,6 +47,7 @@ has 'dont_split_groups' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'verbose_stats' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'translation_table' => ( is => 'rw', isa => 'Int', default => 11 );
has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50000 );
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
@@ -71,7 +72,7 @@ sub BUILD {
$job_runner, $makeblastdb_exec, $mcxdeblast_exec, $mcl_exec, $blastp_exec,
$apply_unknowns_filter, $cpus, $output_multifasta_files, $verbose_stats, $translation_table,
$run_qc, $core_definition, $help, $kraken_db, $cmd_version,
- $mafft, $output_directory, $check_dependancies, $inflation_value,
+ $mafft, $output_directory, $check_dependancies, $inflation_value, $allow_paralogs,
);
GetOptionsFromArray(
@@ -98,6 +99,7 @@ sub BUILD {
'cd|core_definition=f' => \$core_definition,
'v|verbose' => \$verbose,
'n|mafft' => \$mafft,
+ 'ap|allow_paralogs' => \$allow_paralogs,
'k|kraken_db=s' => \$kraken_db,
'w|version' => \$cmd_version,
'a|check_dependancies' => \$check_dependancies,
@@ -302,7 +304,8 @@ sub run {
core_definition => $self->core_definition,
verbose => $self->verbose,
mafft => $self->mafft,
- inflation_value => $self->inflation_value,
+ allow_paralogs => $self->allow_paralogs,
+ inflation_value => $self->inflation_value,
);
$pan_genome_obj->run();
@@ -343,11 +346,12 @@ Options: -p INT number of threads [1]
-r create R plots, requires R and ggplot2
-s dont split paralogs
-t INT translation table [11]
+ -ap allow paralogs in core alignment
-z dont delete intermediate files
-v verbose output to STDOUT
-w print version and exit
-y add gene inference information to spreadsheet, doesnt work with -e
- -iv STR Change the MCL inflation value [1.5]
+ -iv STR Change the MCL inflation value [1.5]
-h this help message
Example: Quickly generate a core gene alignment using 8 threads
diff --git a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm
index dc52a60..f586a27 100644
--- a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm
+++ b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm
@@ -27,13 +27,14 @@ has 'spreadsheet_filename' => ( is => 'rw', isa => 'Str', default => 'gene_
has 'output_filename' => ( is => 'rw', isa => 'Str', default => 'core_gene_alignment.aln' );
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
has '_error_message' => ( is => 'rw', isa => 'Str' );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
sub BUILD {
my ($self) = @_;
- my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose, $help, $mafft, $dont_delete_files );
+ my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose, $help, $mafft, $allow_paralogs, $dont_delete_files );
GetOptionsFromArray(
$self->args,
@@ -42,6 +43,7 @@ sub BUILD {
'o|output_filename=s' => \$output_filename,
'cd|core_definition=f' => \$core_definition,
'z|dont_delete_files' => \$dont_delete_files,
+ 'p|allow_paralogs' => \$allow_paralogs,
'v|verbose' => \$verbose,
'h|help' => \$help,
);
@@ -51,6 +53,7 @@ sub BUILD {
$self->logger->level(10000);
}
$self->help($help) if(defined($help));
+ $self->allow_paralogs($allow_paralogs) if(defined($allow_paralogs));
if ( defined($multifasta_base_directory) && ( -d $multifasta_base_directory ) ) {
$self->multifasta_base_directory( abs_path($multifasta_base_directory));
@@ -95,7 +98,8 @@ sub run {
$self->logger->info("Extract core genes from spreadsheet");
my $core_genes_obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
spreadsheet => $self->spreadsheet_filename,
- core_definition => $self->core_definition
+ core_definition => $self->core_definition,
+ allow_paralogs => $self->allow_paralogs
);
$self->logger->info("Looking up genes in files");
@@ -130,6 +134,7 @@ Options: -o STR output filename [core_gene_alignment.aln]
-cd FLOAT percentage of isolates a gene must be in to be core [99]
-m STR directory containing gene multi-FASTAs [pan_genome_sequences]
-s STR gene presence and absence spreadsheet [gene_presence_absence.csv]
+ -p allow paralogs
-z dont delete intermediate files
-v verbose output to STDOUT
-h this help message
diff --git a/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm b/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm
index f6a59d0..fdbfbd6 100644
--- a/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm
+++ b/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm
@@ -41,6 +41,7 @@ has 'group_limit' => ( is => 'rw', isa => 'Num', default => 500
has 'core_definition' => ( is => 'rw', isa => 'Num', default => 0.99 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'mafft' => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
sub BUILD {
my ($self) = @_;
@@ -48,7 +49,7 @@ sub BUILD {
my (
$output_filename, $dont_create_rplots, $dont_delete_files, $dont_split_groups, $output_pan_geneome_filename,
$job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $core_definition,
- $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft
+ $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft, $allow_paralogs
);
@@ -72,6 +73,7 @@ sub BUILD {
'cd|core_definition=f' => \$core_definition,
'v|verbose' => \$verbose,
'n|mafft' => \$mafft,
+ 'q|allow_paralogs' => \$allow_paralogs,
'h|help' => \$help,
);
@@ -93,6 +95,7 @@ sub BUILD {
$self->group_limit($group_limit) if ( defined($group_limit) );
$self->core_definition( $core_definition/100 ) if ( defined($core_definition) );
$self->mafft($mafft) if ( defined($mafft) );
+ $self->allow_paralogs($allow_paralogs) if ( defined($allow_paralogs) );
if ( defined($verbose) ) {
$self->verbose($verbose);
$self->logger->level(10000);
@@ -158,6 +161,7 @@ sub run {
cpus => $self->cpus,
verbose => $self->verbose,
mafft => $self->mafft,
+ allow_paralogs => $self->allow_paralogs,
dont_delete_files => $self->dont_delete_files,
num_input_files => $#{$input_files},
);
@@ -222,6 +226,7 @@ Options: -a dont delete intermediate files
-n fast core gene alignement with MAFFT instead of PRANK
-o STR clusters output filename [clustered_proteins]
-p STR output pan genome filename [pan_genome.fa]
+ -q allow paralogs in core alignment
-s STR output gene presence and absence filename [gene_presence_absence.csv]
-t INT translation table [11]
-z INT number of threads [1]
diff --git a/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm b/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm
index 122628a..3f65daa 100644
--- a/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm
+++ b/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm
@@ -29,6 +29,7 @@ has 'translation_table' => ( is => 'rw', isa => 'Int', default =>
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 );
has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 );
has 'dont_delete_files' => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'num_input_files' => ( is => 'ro', isa => 'Int', required => 1);
# Overload Role`
@@ -85,6 +86,7 @@ sub _build__core_alignment_cmd {
my $core_cmd = "pan_genome_core_alignment";
$core_cmd .= " -cd " . ($self->core_definition*100) if ( defined $self->core_definition );
$core_cmd .= " --dont_delete_files " if ( defined $self->dont_delete_files && $self->dont_delete_files == 1 );
+ $core_cmd .= " --allow_paralogs " if ( defined $self->allow_paralogs && $self->allow_paralogs == 1 );
return $core_cmd;
}
diff --git a/lib/Bio/Roary/External/PostAnalysis.pm b/lib/Bio/Roary/External/PostAnalysis.pm
index 211b746..1de663a 100644
--- a/lib/Bio/Roary/External/PostAnalysis.pm
+++ b/lib/Bio/Roary/External/PostAnalysis.pm
@@ -37,6 +37,7 @@ has 'group_limit' => ( is => 'rw', isa => 'Num', default => 50
has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1.0 );
has 'verbose' => ( is => 'rw', isa => 'Bool', default => 0 );
has 'mafft' => ( is => 'ro', isa => 'Bool', default => 0 );
+has 'allow_paralogs' => ( is => 'ro', isa => 'Bool', default => 0 );
has '_working_directory' => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
has '_gff_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__gff_fofn' );
has '_fasta_fofn' => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__fasta_fofn' );
@@ -137,6 +138,9 @@ sub _command_to_run {
my $verbose_flag = '';
$verbose_flag = '-v' if ( defined($self->verbose) && $self->verbose == 1 );
+
+ my $allow_paralogs_flag = '';
+ $allow_paralogs_flag = '--allow_paralogs' if ( defined($self->allow_paralogs) && $self->allow_paralogs == 1 );
return join(
" ",
@@ -156,6 +160,7 @@ sub _command_to_run {
$verbose_stats_flag,
$verbose_flag,
$mafft_flag,
+ $allow_paralogs_flag,
'-j', $self->job_runner,
'--processors', $self->cpus,
'--group_limit', $self->group_limit,
diff --git a/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm b/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
index 9fa6fed..7c898fc 100644
--- a/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
+++ b/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
@@ -19,33 +19,33 @@ use Text::CSV;
use Bio::Roary::GroupStatistics;
use POSIX;
-has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 );
-has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV',lazy => 1, builder => '_build__csv_parser' );
-has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' );
-has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' );
-has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 );
-has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
-has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub {{}} );
-
-has '_number_of_isolates' => ( is => 'rw', isa => 'Int');
-has '_gene_column' => ( is => 'rw', isa => 'Int');
-has '_num_isolates_column' => ( is => 'rw', isa => 'Int');
-has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int');
-has '_genome_fragement_column' => ( is => 'rw', isa => 'Int');
-has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int');
-has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );
+has 'spreadsheet' => ( is => 'ro', isa => 'Str', required => 1 );
+has '_csv_parser' => ( is => 'ro', isa => 'Text::CSV', lazy => 1, builder => '_build__csv_parser' );
+has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1, builder => '_build__input_spreadsheet_fh' );
+has 'ordered_core_genes' => ( is => 'ro', isa => 'ArrayRef', lazy => 1, builder => '_build_ordered_core_genes' );
+has 'core_definition' => ( is => 'ro', isa => 'Num', default => 1 );
+has 'sample_names' => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
+has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef', default => sub { {} } );
+has 'allow_paralogs' => ( is => 'rw', isa => 'Bool', default => 0 );
+
+has '_number_of_isolates' => ( is => 'rw', isa => 'Int' );
+has '_gene_column' => ( is => 'rw', isa => 'Int' );
+has '_num_isolates_column' => ( is => 'rw', isa => 'Int' );
+has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' );
+has '_genome_fragement_column' => ( is => 'rw', isa => 'Int' );
+has '_order_within_fragement_column' => ( is => 'rw', isa => 'Int' );
+has '_min_no_isolates_for_core' => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );
sub _build__min_no_isolates_for_core {
- my ($self) = @_;
- my $threshold = $self->_number_of_isolates * $self->core_definition;
+ my ($self) = @_;
+ my $threshold = $self->_number_of_isolates * $self->core_definition;
- return $threshold;
+ return $threshold;
}
-sub _build__csv_parser
-{
- my ($self) = @_;
- return Text::CSV->new( { binary => 1, always_quote => 1} );
+sub _build__csv_parser {
+ my ($self) = @_;
+ return Text::CSV->new( { binary => 1, always_quote => 1 } );
}
sub _build__input_spreadsheet_fh {
@@ -54,115 +54,112 @@ sub _build__input_spreadsheet_fh {
return $fh;
}
-sub _update_number_of_isolates
-{
- my ($self, $header_row) = @_;
- my $number_of_isolates = @{$header_row} - @{Bio::Roary::GroupStatistics->fixed_headers};
- $self->_number_of_isolates($number_of_isolates);
+sub _update_number_of_isolates {
+ my ( $self, $header_row ) = @_;
+ my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers };
+ $self->_number_of_isolates($number_of_isolates);
}
-sub _setup_column_mappings
-{
- my ($self, $header_row) = @_;
- # current ordering
- my %columns_of_interest_mappings = (
- 'Gene' => 0,
- 'No. isolates' => 3,
- 'Avg sequences per isolate' => 5,
- 'Genome Fragment' => 6,
- 'Order within Fragment' => 7,
- 'QC' => 10,
+sub _setup_column_mappings {
+ my ( $self, $header_row ) = @_;
+
+ # current ordering
+ my %columns_of_interest_mappings = (
+ 'Gene' => 0,
+ 'No. isolates' => 3,
+ 'Avg sequences per isolate' => 5,
+ 'Genome Fragment' => 6,
+ 'Order within Fragment' => 7,
+ 'QC' => 10,
);
-
- # Dynamically overwrite the default ordering
- for(my $i = 0; $i < @{$header_row}; $i++)
- {
- for my $col_name (%columns_of_interest_mappings)
- {
- if($header_row->[$i] eq $col_name)
- {
- $columns_of_interest_mappings{$col_name} = $i;
- last;
- }
+
+ # Dynamically overwrite the default ordering
+ for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) {
+ for my $col_name (%columns_of_interest_mappings) {
+ if ( $header_row->[$i] eq $col_name ) {
+ $columns_of_interest_mappings{$col_name} = $i;
+ last;
+ }
+ }
}
- }
- $self->_gene_column($columns_of_interest_mappings{'Gene'});
- $self->_num_isolates_column($columns_of_interest_mappings{'No. isolates'});
- $self->_avg_sequences_per_isolate_column($columns_of_interest_mappings{'Avg sequences per isolate'});
- $self->_genome_fragement_column($columns_of_interest_mappings{'Genome Fragment'});
- $self->_order_within_fragement_column($columns_of_interest_mappings{'Order within Fragment'});
- $self->_update_number_of_isolates($header_row);
-
- # Get the sample_names
- my @sample_names;
- for(my $i = $self->_length_of_fixed_headers(); $i < @{$header_row}; $i++)
- {
- push(@sample_names,$header_row->[$i]);
- }
- $self->sample_names(\@sample_names);
+ $self->_gene_column( $columns_of_interest_mappings{'Gene'} );
+ $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} );
+ $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} );
+ $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} );
+ $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} );
+ $self->_update_number_of_isolates($header_row);
+
+ # Get the sample_names
+ my @sample_names;
+ for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) {
+ push( @sample_names, $header_row->[$i] );
+ }
+ $self->sample_names( \@sample_names );
}
-sub _length_of_fixed_headers
-{
- my ($self) = @_;
- return @{Bio::Roary::GroupStatistics->fixed_headers()};
+sub _length_of_fixed_headers {
+ my ($self) = @_;
+ return @{ Bio::Roary::GroupStatistics->fixed_headers() };
}
-sub _populate_sample_to_gene_lookup_with_row
-{
- my ($self, $row) = @_;
-
- for(my $i = $self->_length_of_fixed_headers(); $i < @{$row}; $i++ )
- {
- if(defined($row->[$i]) && $row->[$i] ne "" )
- {
- my $sample_name = $self->sample_names->[$i - $self->_length_of_fixed_headers()];
-
- $self->sample_names_to_genes->{$sample_name}->{$row->[$i]} = 1;
- }
- }
- return 1;
-}
+sub _populate_sample_to_gene_lookup_with_row {
+ my ( $self, $row ) = @_;
+ for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) {
+ if ( defined( $row->[$i] ) && $row->[$i] ne "" ) {
+ my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ];
-sub _ordered_core_genes
-{
- my ($self) = @_;
- my %ordered_genes;
- while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) )
- {
- next if(@{$row} < 12); # no genes in group
- next if(!defined($row->[$self->_gene_column]) || $row->[$self->_gene_column] eq '' ); # no gene name
- next if(!defined($row->[$self->_avg_sequences_per_isolate_column]) || $row->[$self->_avg_sequences_per_isolate_column] eq '' ); # no average
- next if(!defined($row->[$self->_genome_fragement_column]) || $row->[$self->_genome_fragement_column] eq '' ); # fragment not defined
-
- # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates
- next if ( $row->[$self->_num_isolates_column] < $self->_min_no_isolates_for_core );
- next if($row->[$self->_avg_sequences_per_isolate_column] != 1);
- $ordered_genes{$row->[$self->_genome_fragement_column]}{$row->[$self->_order_within_fragement_column]} = $row->[$self->_gene_column];
- $self->_populate_sample_to_gene_lookup_with_row($row);
- }
-
- my @ordered_core_genes ;
- for my $fragment_key(sort {$a <=> $b } keys %ordered_genes)
- {
- for my $order_within_fragement(sort {$a <=> $b } keys %{$ordered_genes{$fragment_key}})
- {
- push(@ordered_core_genes,$ordered_genes{$fragment_key}{$order_within_fragement});
+ $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1;
+ }
}
- }
- return \@ordered_core_genes;
+ return 1;
}
-sub _build_ordered_core_genes
-{
- my ($self) = @_;
- my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
- $self->_setup_column_mappings($header_row);
+sub _ordered_core_genes {
+ my ($self) = @_;
+ my %ordered_genes;
+ while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {
+ next if ( @{$row} < 12 ); # no genes in group
+ next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' ); # no gene name
+ next
+ if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' )
+ ; # no average
+ next
+ if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' )
+ ; # fragment not defined
+
+ # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates
+ next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core );
+
+ if ( $self->allow_paralogs ) {
+ # should never happen
+ next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 );
+ }
+ else {
+ next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 );
+ }
+
+ $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } =
+ $row->[ $self->_gene_column ];
+ $self->_populate_sample_to_gene_lookup_with_row($row);
+ }
- return $self->_ordered_core_genes();
+ my @ordered_core_genes;
+ for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) {
+ for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) {
+ push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} );
+ }
+ }
+ return \@ordered_core_genes;
}
+sub _build_ordered_core_genes {
+ my ($self) = @_;
+ my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
+ $self->_setup_column_mappings($header_row);
+
+ return $self->_ordered_core_genes();
+}
no Moose;
__PACKAGE__->meta->make_immutable;
diff --git a/lib/Bio/Roary/MergeMultifastaAlignments.pm b/lib/Bio/Roary/MergeMultifastaAlignments.pm
index d2dd55e..f74d4ac 100644
--- a/lib/Bio/Roary/MergeMultifastaAlignments.pm
+++ b/lib/Bio/Roary/MergeMultifastaAlignments.pm
@@ -72,7 +72,7 @@ sub _sequence_for_sample_from_gene_file {
my ( $self, $sample_name, $gene_file ) = @_;
# loop over this to get the geneIDs
- for my $gene_id ( keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
+ for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
}
diff --git a/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t b/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t
index 92c747d..edfc2b3 100755
--- a/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t
+++ b/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t
@@ -13,21 +13,56 @@ BEGIN {
my $obj;
-ok($obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
- spreadsheet => 't/data/core_group_statistics.csv',
-),'initalise obj');
-is_deeply($obj->ordered_core_genes, ['argF','speH','group_5'], 'Correct ordering');
-is_deeply($obj->sample_names_to_genes, {
- 'query_2' => {
- '2_3' => 1,
- '2_7' => 1,
- '2_2' => 1
- },
- 'query_1' => {
- '1_6' => 1,
- '1_3' => 1,
- '1_2' => 1
- }
- }, 'Correct of sample names to genes is correct');
+ok(
+ $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
+ spreadsheet => 't/data/core_group_statistics.csv',
+ ),
+ 'initalise obj'
+);
+is_deeply( $obj->ordered_core_genes, [ 'argF', 'speH', 'group_5' ], 'Correct ordering' );
+is_deeply(
+ $obj->sample_names_to_genes,
+ {
+ 'query_2' => {
+ '2_3' => 1,
+ '2_7' => 1,
+ '2_2' => 1
+ },
+ 'query_1' => {
+ '1_6' => 1,
+ '1_3' => 1,
+ '1_2' => 1
+ }
+ },
+ 'Correct of sample names to genes is correct'
+);
+
+ok(
+ $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
+ spreadsheet => 't/data/core_group_statistics.csv',
+ allow_paralogs => 1,
+ ),
+ 'initalise obj where paralogs allowed'
+);
+is_deeply( $obj->ordered_core_genes, [ 'argF', 'hly', 'speH', 'group_5' ], 'Correct ordering where paralogs allowed' );
+
+is_deeply(
+ $obj->sample_names_to_genes,
+ {
+ 'query_2' => {
+ '2_3' => 1,
+ '2_7' => 1,
+ '2_1' => 1,
+ '2_2' => 1
+ },
+ 'query_1' => {
+ '1_6' => 1,
+ '1_3' => 1,
+ '1_1' => 1,
+ '1_2' => 1
+ }
+ },
+ 'Correct of sample names to genes is correct where paralogs allowed'
+);
done_testing();
--
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/roary.git
More information about the debian-med-commit
mailing list