[med-svn] [roary] 01/03: New upstream version 3.9.1+dfsg

Tue Aug 22 14:03:08 UTC 2017

This is an automated email from the git hooks/post-receive script.

satta pushed a commit to branch master
in repository roary.

commit c6fdb3ca02cf16028191b815709c50e118f94643
Author: Sascha Steinbiss <satta at debian.org>
Date:   Tue Aug 22 15:54:11 2017 +0200

    New upstream version 3.9.1+dfsg
---
 dist.ini                                           |   2 +-
 lib/Bio/Roary.pm                                   |   2 +
 lib/Bio/Roary/CommandLine/Roary.pm                 |  10 +-
 lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm    |   9 +-
 lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm     |   7 +-
 .../Roary/External/GeneAlignmentFromNucleotides.pm |   2 +
 lib/Bio/Roary/External/PostAnalysis.pm             |   5 +
 lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm   | 227 ++++++++++-----------
 lib/Bio/Roary/MergeMultifastaAlignments.pm         |   2 +-
 t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t      |  67 ++++--
 10 files changed, 194 insertions(+), 139 deletions(-)

diff --git a/dist.ini b/dist.ini
index 39fc6d2..4d8b414 100644
--- a/dist.ini
+++ b/dist.ini
@@ -1,5 +1,5 @@
 name    = Bio-Roary
-version = 3.9.0
+version = 3.9.1
 author  = Andrew J. Page <ap13 at sanger.ac.uk>
 license = GPL_3
 copyright_holder = Wellcome Trust Sanger Institute
diff --git a/lib/Bio/Roary.pm b/lib/Bio/Roary.pm
index feccc74..9ccd79a 100644
--- a/lib/Bio/Roary.pm
+++ b/lib/Bio/Roary.pm
@@ -48,6 +48,7 @@ has 'core_definition'             => ( is => 'rw', isa => 'Num',      default  =
 has 'verbose'                     => ( is => 'rw', isa => 'Bool',     default  => 0 );
 has 'mafft'                       => ( is => 'ro', isa => 'Bool',     default  => 0 );
 has 'inflation_value'             => ( is => 'rw', isa => 'Num',      default  => 1.5 );
+has 'allow_paralogs'              => ( is => 'rw', isa => 'Bool',     default => 0 );
 
 has 'output_multifasta_files' => ( is => 'ro', isa => 'Bool', default => 0 );
 
@@ -136,6 +137,7 @@ sub run {
         core_definition             => $self->core_definition,
 		verbose                     => $self->verbose,
 		mafft                       => $self->mafft,
+		allow_paralogs              => $self->allow_paralogs,
     );
     $post_analysis->run();
 
diff --git a/lib/Bio/Roary/CommandLine/Roary.pm b/lib/Bio/Roary/CommandLine/Roary.pm
index 8c8b0c8..3052b6c 100644
--- a/lib/Bio/Roary/CommandLine/Roary.pm
+++ b/lib/Bio/Roary/CommandLine/Roary.pm
@@ -47,6 +47,7 @@ has 'dont_split_groups'       => ( is => 'rw', isa => 'Bool', default => 0 );
 has 'verbose_stats'           => ( is => 'rw', isa => 'Bool', default => 0 );
 has 'translation_table'       => ( is => 'rw', isa => 'Int',  default => 11 );
 has 'mafft'                   => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs'          => ( is => 'rw', isa => 'Bool', default => 0 );
 has 'group_limit'             => ( is => 'rw', isa => 'Num',  default => 50000 );
 has 'core_definition'         => ( is => 'rw', isa => 'Num',  default => 0.99 );
 has 'verbose'                 => ( is => 'rw', isa => 'Bool', default => 0 );
@@ -71,7 +72,7 @@ sub BUILD {
         $job_runner,            $makeblastdb_exec,  $mcxdeblast_exec,         $mcl_exec,      $blastp_exec,
         $apply_unknowns_filter, $cpus,              $output_multifasta_files, $verbose_stats, $translation_table,
         $run_qc,                $core_definition,   $help,                    $kraken_db,     $cmd_version,
-        $mafft,                 $output_directory,  $check_dependancies, $inflation_value,
+        $mafft,                 $output_directory,  $check_dependancies, $inflation_value, $allow_paralogs,
     );
 
     GetOptionsFromArray(
@@ -98,6 +99,7 @@ sub BUILD {
         'cd|core_definition=f'      => \$core_definition,
         'v|verbose'                 => \$verbose,
         'n|mafft'                   => \$mafft,
+		'ap|allow_paralogs'         => \$allow_paralogs,
         'k|kraken_db=s'             => \$kraken_db,
         'w|version'                 => \$cmd_version,
         'a|check_dependancies'      => \$check_dependancies,
@@ -302,7 +304,8 @@ sub run {
         core_definition         => $self->core_definition,
         verbose                 => $self->verbose,
         mafft                   => $self->mafft,
-	inflation_value         => $self->inflation_value,
+        allow_paralogs          => $self->allow_paralogs,
+	    inflation_value         => $self->inflation_value,
     );
     $pan_genome_obj->run();
 
@@ -343,11 +346,12 @@ Options: -p INT    number of threads [1]
          -r        create R plots, requires R and ggplot2
          -s        dont split paralogs
          -t INT    translation table [11]
+         -ap       allow paralogs in core alignment
          -z        dont delete intermediate files
          -v        verbose output to STDOUT
          -w        print version and exit
          -y        add gene inference information to spreadsheet, doesnt work with -e
-	 -iv STR   Change the MCL inflation value [1.5]
+         -iv STR   Change the MCL inflation value [1.5]
          -h        this help message
 
 Example: Quickly generate a core gene alignment using 8 threads
diff --git a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm
index dc52a60..f586a27 100644
--- a/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm
+++ b/lib/Bio/Roary/CommandLine/RoaryCoreAlignment.pm
@@ -27,13 +27,14 @@ has 'spreadsheet_filename'      => ( is => 'rw', isa => 'Str', default => 'gene_
 has 'output_filename'           => ( is => 'rw', isa => 'Str', default => 'core_gene_alignment.aln' );
 has 'core_definition'           => ( is => 'rw', isa => 'Num', default => 0.99 );
 has 'dont_delete_files'         => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs'            => ( is => 'rw', isa => 'Bool', default => 0 );
 has '_error_message'            => ( is => 'rw', isa => 'Str' );
 has 'verbose'                   => ( is => 'rw', isa => 'Bool', default => 0 );
 
 sub BUILD {
     my ($self) = @_;
 
-    my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose,  $help, $mafft, $dont_delete_files );
+    my ( $multifasta_base_directory, $spreadsheet_filename, $output_filename, $core_definition,$verbose,  $help, $mafft, $allow_paralogs, $dont_delete_files );
 
     GetOptionsFromArray(
         $self->args,
@@ -42,6 +43,7 @@ sub BUILD {
         'o|output_filename=s'           => \$output_filename,
         'cd|core_definition=f'          => \$core_definition,
         'z|dont_delete_files'           => \$dont_delete_files,
+		'p|allow_paralogs'              => \$allow_paralogs,
 		'v|verbose'                     => \$verbose,
         'h|help'                        => \$help,
     );
@@ -51,6 +53,7 @@ sub BUILD {
         $self->logger->level(10000);
     }
     $self->help($help) if(defined($help));
+	$self->allow_paralogs($allow_paralogs) if(defined($allow_paralogs));
 
     if ( defined($multifasta_base_directory) && ( -d $multifasta_base_directory ) ) {
         $self->multifasta_base_directory( abs_path($multifasta_base_directory));
@@ -95,7 +98,8 @@ sub run {
 	$self->logger->info("Extract core genes from spreadsheet");
     my $core_genes_obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new( 
         spreadsheet     => $self->spreadsheet_filename,
-        core_definition => $self->core_definition
+        core_definition => $self->core_definition,
+		allow_paralogs => $self->allow_paralogs
     );
 	
 	$self->logger->info("Looking up genes in files");
@@ -130,6 +134,7 @@ Options: -o STR    output filename [core_gene_alignment.aln]
          -cd FLOAT percentage of isolates a gene must be in to be core [99]
          -m STR    directory containing gene multi-FASTAs [pan_genome_sequences]
          -s STR    gene presence and absence spreadsheet [gene_presence_absence.csv]
+         -p        allow paralogs
          -z        dont delete intermediate files
          -v        verbose output to STDOUT
          -h        this help message
diff --git a/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm b/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm
index f6a59d0..fdbfbd6 100644
--- a/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm
+++ b/lib/Bio/Roary/CommandLine/RoaryPostAnalysis.pm
@@ -41,6 +41,7 @@ has 'group_limit'                 => ( is => 'rw', isa => 'Num',  default => 500
 has 'core_definition'             => ( is => 'rw', isa => 'Num',  default => 0.99 );
 has 'verbose'                     => ( is => 'rw', isa => 'Bool', default => 0 );
 has 'mafft'                       => ( is => 'rw', isa => 'Bool', default => 0 );
+has 'allow_paralogs'              => ( is => 'rw', isa => 'Bool', default => 0 );
 
 sub BUILD {
     my ($self) = @_;
@@ -48,7 +49,7 @@ sub BUILD {
     my ( 
       $output_filename, $dont_create_rplots, $dont_delete_files, $dont_split_groups, $output_pan_geneome_filename, 
       $job_runner, $output_statistics_filename, $output_multifasta_files, $clusters_filename, $core_definition,
-      $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft
+      $fasta_files, $input_files, $verbose_stats, $translation_table, $help, $cpus,$group_limit,$verbose,$mafft, $allow_paralogs
     );
 
 
@@ -72,6 +73,7 @@ sub BUILD {
         'cd|core_definition=f'      => \$core_definition,
 		'v|verbose'                 => \$verbose,
 		'n|mafft'                   => \$mafft,
+		'q|allow_paralogs'          => \$allow_paralogs,
         'h|help'                    => \$help,
     );
     
@@ -93,6 +95,7 @@ sub BUILD {
     $self->group_limit($group_limit)                                 if ( defined($group_limit) );
     $self->core_definition( $core_definition/100 )                   if ( defined($core_definition) );
 	$self->mafft($mafft)                                             if ( defined($mafft) );
+	$self->allow_paralogs($allow_paralogs)                           if ( defined($allow_paralogs) );
     if ( defined($verbose) ) {
         $self->verbose($verbose);
         $self->logger->level(10000);
@@ -158,6 +161,7 @@ sub run {
         cpus                => $self->cpus,
 		verbose             => $self->verbose,
 		mafft               => $self->mafft,
+		allow_paralogs      => $self->allow_paralogs,
         dont_delete_files   => $self->dont_delete_files,
         num_input_files     => $#{$input_files},
       );
@@ -222,6 +226,7 @@ Options: -a        dont delete intermediate files
          -n        fast core gene alignement with MAFFT instead of PRANK
          -o STR    clusters output filename [clustered_proteins]
          -p STR    output pan genome filename [pan_genome.fa]
+         -q        allow paralogs in core alignment
          -s STR    output gene presence and absence filename [gene_presence_absence.csv]
          -t INT    translation table [11]
          -z INT    number of threads [1]
diff --git a/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm b/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm
index 122628a..3f65daa 100644
--- a/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm
+++ b/lib/Bio/Roary/External/GeneAlignmentFromNucleotides.pm
@@ -29,6 +29,7 @@ has 'translation_table'           => ( is => 'rw', isa => 'Int',      default =>
 has 'core_definition'             => ( is => 'ro', isa => 'Num',      default => 1 );
 has 'mafft'                       => ( is => 'ro', isa => 'Bool',     default => 0 );
 has 'dont_delete_files'           => ( is => 'rw', isa => 'Bool',     default  => 0 );
+has 'allow_paralogs'              => ( is => 'rw', isa => 'Bool',     default  => 0 );
 has 'num_input_files'             => ( is => 'ro', isa => 'Int',      required => 1);
 
 # Overload Role`
@@ -85,6 +86,7 @@ sub _build__core_alignment_cmd {
     my $core_cmd = "pan_genome_core_alignment";
     $core_cmd .= " -cd " . ($self->core_definition*100) if ( defined $self->core_definition );
     $core_cmd .= " --dont_delete_files " if ( defined $self->dont_delete_files  && $self->dont_delete_files == 1 );
+	$core_cmd .= " --allow_paralogs " if ( defined $self->allow_paralogs  && $self->allow_paralogs == 1 );
 
     return $core_cmd;
 }
diff --git a/lib/Bio/Roary/External/PostAnalysis.pm b/lib/Bio/Roary/External/PostAnalysis.pm
index 211b746..1de663a 100644
--- a/lib/Bio/Roary/External/PostAnalysis.pm
+++ b/lib/Bio/Roary/External/PostAnalysis.pm
@@ -37,6 +37,7 @@ has 'group_limit'                 => ( is => 'rw', isa => 'Num',  default  => 50
 has 'core_definition'             => ( is => 'ro', isa => 'Num',  default  => 1.0 );
 has 'verbose'                     => ( is => 'rw', isa => 'Bool', default  => 0 );
 has 'mafft'                       => ( is => 'ro', isa => 'Bool', default  => 0 );
+has 'allow_paralogs'              => ( is => 'ro', isa => 'Bool', default  => 0 );
 has '_working_directory'          => ( is => 'ro', isa => 'File::Temp::Dir', default => sub { File::Temp->newdir( DIR => getcwd, CLEANUP => 1 ); } );
 has '_gff_fofn'                   => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__gff_fofn' );
 has '_fasta_fofn'                 => ( is => 'ro', isa => 'Str', lazy => 1, builder => '_build__fasta_fofn'  );
@@ -137,6 +138,9 @@ sub _command_to_run {
 	
     my $verbose_flag = '';
     $verbose_flag = '-v' if ( defined($self->verbose) && $self->verbose == 1 );
+	
+	my $allow_paralogs_flag = '';
+	$allow_paralogs_flag = '--allow_paralogs'  if ( defined($self->allow_paralogs) && $self->allow_paralogs == 1 );
     
     return join(
         " ",
@@ -156,6 +160,7 @@ sub _command_to_run {
             $verbose_stats_flag,
 			$verbose_flag,
 			$mafft_flag,
+			$allow_paralogs_flag,
             '-j', $self->job_runner,
             '--processors', $self->cpus,
             '--group_limit', $self->group_limit,
diff --git a/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm b/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
index 9fa6fed..7c898fc 100644
--- a/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
+++ b/lib/Bio/Roary/ExtractCoreGenesFromSpreadsheet.pm
@@ -19,33 +19,33 @@ use Text::CSV;
 use Bio::Roary::GroupStatistics;
 use POSIX;
 
-has 'spreadsheet'            => ( is => 'ro', isa  => 'Str',      required => 1 );
-has '_csv_parser'            => ( is => 'ro', isa  => 'Text::CSV',lazy     => 1, builder => '_build__csv_parser' );
-has '_input_spreadsheet_fh'  => ( is => 'ro', lazy => 1,          builder  => '_build__input_spreadsheet_fh' );
-has 'ordered_core_genes'     => ( is => 'ro', isa  => 'ArrayRef', lazy     => 1, builder  => '_build_ordered_core_genes' );
-has 'core_definition'        => ( is => 'ro', isa => 'Num', default => 1 );
-has 'sample_names'           => ( is => 'rw', isa => 'ArrayRef', default => sub {[]} );
-has 'sample_names_to_genes'  => ( is => 'rw', isa => 'HashRef',  default => sub {{}} );
-
-has '_number_of_isolates'                 => ( is => 'rw', isa  => 'Int');
-has '_gene_column'                        => ( is => 'rw', isa  => 'Int');
-has '_num_isolates_column'                => ( is => 'rw', isa  => 'Int');
-has '_avg_sequences_per_isolate_column'   => ( is => 'rw', isa  => 'Int');
-has '_genome_fragement_column'            => ( is => 'rw', isa  => 'Int');
-has '_order_within_fragement_column'      => ( is => 'rw', isa  => 'Int');
-has '_min_no_isolates_for_core'           => ( is => 'rw', isa  => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );
+has 'spreadsheet'           => ( is => 'ro', isa  => 'Str',       required => 1 );
+has '_csv_parser'           => ( is => 'ro', isa  => 'Text::CSV', lazy     => 1, builder => '_build__csv_parser' );
+has '_input_spreadsheet_fh' => ( is => 'ro', lazy => 1,           builder  => '_build__input_spreadsheet_fh' );
+has 'ordered_core_genes'    => ( is => 'ro', isa  => 'ArrayRef',  lazy     => 1, builder => '_build_ordered_core_genes' );
+has 'core_definition'       => ( is => 'ro', isa  => 'Num',       default  => 1 );
+has 'sample_names'          => ( is => 'rw', isa => 'ArrayRef', default => sub { [] } );
+has 'sample_names_to_genes' => ( is => 'rw', isa => 'HashRef',  default => sub { {} } );
+has 'allow_paralogs'        => ( is => 'rw', isa => 'Bool',     default => 0 );
+
+has '_number_of_isolates'               => ( is => 'rw', isa => 'Int' );
+has '_gene_column'                      => ( is => 'rw', isa => 'Int' );
+has '_num_isolates_column'              => ( is => 'rw', isa => 'Int' );
+has '_avg_sequences_per_isolate_column' => ( is => 'rw', isa => 'Int' );
+has '_genome_fragement_column'          => ( is => 'rw', isa => 'Int' );
+has '_order_within_fragement_column'    => ( is => 'rw', isa => 'Int' );
+has '_min_no_isolates_for_core'         => ( is => 'rw', isa => 'Num', lazy => 1, builder => '_build__min_no_isolates_for_core' );
 
 sub _build__min_no_isolates_for_core {
-  my ($self) = @_;
-  my $threshold =  $self->_number_of_isolates * $self->core_definition;
+    my ($self) = @_;
+    my $threshold = $self->_number_of_isolates * $self->core_definition;
 
-  return $threshold;
+    return $threshold;
 }
 
-sub _build__csv_parser
-{
-  my ($self) = @_;
-  return Text::CSV->new( { binary => 1, always_quote => 1} );
+sub _build__csv_parser {
+    my ($self) = @_;
+    return Text::CSV->new( { binary => 1, always_quote => 1 } );
 }
 
 sub _build__input_spreadsheet_fh {
@@ -54,115 +54,112 @@ sub _build__input_spreadsheet_fh {
     return $fh;
 }
 
-sub _update_number_of_isolates
-{
-  my ($self, $header_row) = @_;
-  my $number_of_isolates = @{$header_row} - @{Bio::Roary::GroupStatistics->fixed_headers};
-  $self->_number_of_isolates($number_of_isolates);
+sub _update_number_of_isolates {
+    my ( $self, $header_row ) = @_;
+    my $number_of_isolates = @{$header_row} - @{ Bio::Roary::GroupStatistics->fixed_headers };
+    $self->_number_of_isolates($number_of_isolates);
 }
 
-sub _setup_column_mappings
-{
-  my ($self, $header_row) = @_;
-  # current ordering
-  my %columns_of_interest_mappings = (
-    'Gene'                         => 0,
-    'No. isolates'                 => 3,
-    'Avg sequences per isolate'    => 5,
-    'Genome Fragment'              => 6,
-    'Order within Fragment'        => 7,
-	'QC'                           => 10,
+sub _setup_column_mappings {
+    my ( $self, $header_row ) = @_;
+
+    # current ordering
+    my %columns_of_interest_mappings = (
+        'Gene'                      => 0,
+        'No. isolates'              => 3,
+        'Avg sequences per isolate' => 5,
+        'Genome Fragment'           => 6,
+        'Order within Fragment'     => 7,
+        'QC'                        => 10,
     );
-  
-  # Dynamically overwrite the default ordering
-  for(my $i = 0; $i < @{$header_row}; $i++)
-  {
-    for my $col_name (%columns_of_interest_mappings)
-    {
-      if($header_row->[$i] eq $col_name)
-      {
-        $columns_of_interest_mappings{$col_name} = $i;
-        last;
-      }
+
+    # Dynamically overwrite the default ordering
+    for ( my $i = 0 ; $i < @{$header_row} ; $i++ ) {
+        for my $col_name (%columns_of_interest_mappings) {
+            if ( $header_row->[$i] eq $col_name ) {
+                $columns_of_interest_mappings{$col_name} = $i;
+                last;
+            }
+        }
     }
-  }
-  $self->_gene_column($columns_of_interest_mappings{'Gene'});
-  $self->_num_isolates_column($columns_of_interest_mappings{'No. isolates'});
-  $self->_avg_sequences_per_isolate_column($columns_of_interest_mappings{'Avg sequences per isolate'});
-  $self->_genome_fragement_column($columns_of_interest_mappings{'Genome Fragment'});
-  $self->_order_within_fragement_column($columns_of_interest_mappings{'Order within Fragment'});
-  $self->_update_number_of_isolates($header_row);
-  
-  # Get the sample_names
-  my @sample_names;
-  for(my $i = $self->_length_of_fixed_headers(); $i < @{$header_row}; $i++)
-  {
-	  push(@sample_names,$header_row->[$i]);
-  }
-  $self->sample_names(\@sample_names);
+    $self->_gene_column( $columns_of_interest_mappings{'Gene'} );
+    $self->_num_isolates_column( $columns_of_interest_mappings{'No. isolates'} );
+    $self->_avg_sequences_per_isolate_column( $columns_of_interest_mappings{'Avg sequences per isolate'} );
+    $self->_genome_fragement_column( $columns_of_interest_mappings{'Genome Fragment'} );
+    $self->_order_within_fragement_column( $columns_of_interest_mappings{'Order within Fragment'} );
+    $self->_update_number_of_isolates($header_row);
+
+    # Get the sample_names
+    my @sample_names;
+    for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$header_row} ; $i++ ) {
+        push( @sample_names, $header_row->[$i] );
+    }
+    $self->sample_names( \@sample_names );
 }
 
-sub _length_of_fixed_headers
-{
-	my ($self) = @_;
-    return @{Bio::Roary::GroupStatistics->fixed_headers()};
+sub _length_of_fixed_headers {
+    my ($self) = @_;
+    return @{ Bio::Roary::GroupStatistics->fixed_headers() };
 }
 
-sub _populate_sample_to_gene_lookup_with_row
-{
-	 my ($self, $row) = @_;
-	 
-	 for(my $i = $self->_length_of_fixed_headers(); $i < @{$row}; $i++ )
-	 {
-		 if(defined($row->[$i]) && $row->[$i] ne "" )
-		 {
-		 	my $sample_name = $self->sample_names->[$i - $self->_length_of_fixed_headers()];
-			
-			$self->sample_names_to_genes->{$sample_name}->{$row->[$i]} = 1;
-		 }
-	 }
-	 return 1;
-}
+sub _populate_sample_to_gene_lookup_with_row {
+    my ( $self, $row ) = @_;
 
+    for ( my $i = $self->_length_of_fixed_headers() ; $i < @{$row} ; $i++ ) {
+        if ( defined( $row->[$i] ) && $row->[$i] ne "" ) {
+            my $sample_name = $self->sample_names->[ $i - $self->_length_of_fixed_headers() ];
 
-sub _ordered_core_genes
-{
-  my ($self) = @_;
-  my %ordered_genes;
-  while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) 
-  {
-    next if(@{$row} < 12); # no genes in group
-    next if(!defined($row->[$self->_gene_column]) || $row->[$self->_gene_column] eq '' ); # no gene name
-    next if(!defined($row->[$self->_avg_sequences_per_isolate_column]) || $row->[$self->_avg_sequences_per_isolate_column] eq '' ); # no average
-    next if(!defined($row->[$self->_genome_fragement_column]) || $row->[$self->_genome_fragement_column] eq '' ); # fragment not defined
-    
-    # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates
-    next if ( $row->[$self->_num_isolates_column] < $self->_min_no_isolates_for_core );
-    next if($row->[$self->_avg_sequences_per_isolate_column] != 1);
-    $ordered_genes{$row->[$self->_genome_fragement_column]}{$row->[$self->_order_within_fragement_column]} = $row->[$self->_gene_column];
-	$self->_populate_sample_to_gene_lookup_with_row($row);
-  }
-  
-  my @ordered_core_genes ;
-  for my $fragment_key(sort {$a <=> $b } keys %ordered_genes)
-  {
-    for my $order_within_fragement(sort {$a <=> $b } keys %{$ordered_genes{$fragment_key}})
-    {
-      push(@ordered_core_genes,$ordered_genes{$fragment_key}{$order_within_fragement});
+            $self->sample_names_to_genes->{$sample_name}->{ $row->[$i] } = 1;
+        }
     }
-  }
-  return \@ordered_core_genes;
+    return 1;
 }
 
-sub _build_ordered_core_genes
-{
-  my ($self) = @_;
-  my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
-  $self->_setup_column_mappings($header_row);
+sub _ordered_core_genes {
+    my ($self) = @_;
+    my %ordered_genes;
+    while ( my $row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh ) ) {
+        next if ( @{$row} < 12 );                                                                        # no genes in group
+        next if ( !defined( $row->[ $self->_gene_column ] ) || $row->[ $self->_gene_column ] eq '' );    # no gene name
+        next
+          if ( !defined( $row->[ $self->_avg_sequences_per_isolate_column ] ) || $row->[ $self->_avg_sequences_per_isolate_column ] eq '' )
+          ;                                                                                              # no average
+        next
+          if ( !defined( $row->[ $self->_genome_fragement_column ] ) || $row->[ $self->_genome_fragement_column ] eq '' )
+          ;                                                                                              # fragment not defined
+
+        # next if($self->_number_of_isolates != $row->[$self->_num_isolates_column]); # if gene is not in all isolates
+        next if ( $row->[ $self->_num_isolates_column ] < $self->_min_no_isolates_for_core );
+
+        if ( $self->allow_paralogs ) {
+            # should never happen
+            next if ( $row->[ $self->_avg_sequences_per_isolate_column ] < 1 );
+        }
+        else {
+            next if ( $row->[ $self->_avg_sequences_per_isolate_column ] != 1 );
+        }
+
+        $ordered_genes{ $row->[ $self->_genome_fragement_column ] }{ $row->[ $self->_order_within_fragement_column ] } =
+          $row->[ $self->_gene_column ];
+        $self->_populate_sample_to_gene_lookup_with_row($row);
+    }
 
-  return $self->_ordered_core_genes();
+    my @ordered_core_genes;
+    for my $fragment_key ( sort { $a <=> $b } keys %ordered_genes ) {
+        for my $order_within_fragement ( sort { $a <=> $b } keys %{ $ordered_genes{$fragment_key} } ) {
+            push( @ordered_core_genes, $ordered_genes{$fragment_key}{$order_within_fragement} );
+        }
+    }
+    return \@ordered_core_genes;
 }
 
+sub _build_ordered_core_genes {
+    my ($self) = @_;
+    my $header_row = $self->_csv_parser->getline( $self->_input_spreadsheet_fh );
+    $self->_setup_column_mappings($header_row);
+
+    return $self->_ordered_core_genes();
+}
 
 no Moose;
 __PACKAGE__->meta->make_immutable;
diff --git a/lib/Bio/Roary/MergeMultifastaAlignments.pm b/lib/Bio/Roary/MergeMultifastaAlignments.pm
index d2dd55e..f74d4ac 100644
--- a/lib/Bio/Roary/MergeMultifastaAlignments.pm
+++ b/lib/Bio/Roary/MergeMultifastaAlignments.pm
@@ -72,7 +72,7 @@ sub _sequence_for_sample_from_gene_file {
     my ( $self, $sample_name, $gene_file ) = @_;
 
     # loop over this to get the geneIDs
-    for my $gene_id ( keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
+    for my $gene_id ( sort keys %{ $self->_gene_to_sequence->{$gene_file} } ) {
         if ( defined( $self->sample_names_to_genes->{$sample_name}->{$gene_id} ) ) {
             return $self->_gene_to_sequence->{$gene_file}->{$gene_id};
         }
diff --git a/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t b/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t
index 92c747d..edfc2b3 100755
--- a/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t
+++ b/t/Bio/Roary/ExtractCoreGenesFromSpreadsheet.t
@@ -13,21 +13,56 @@ BEGIN {
 
 my $obj;
 
-ok($obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
-  spreadsheet  => 't/data/core_group_statistics.csv',
-),'initalise obj');
-is_deeply($obj->ordered_core_genes, ['argF','speH','group_5'], 'Correct ordering');
-is_deeply($obj->sample_names_to_genes, {
-          'query_2' => {
-                         '2_3' => 1,
-                         '2_7' => 1,
-                         '2_2' => 1
-                       },
-          'query_1' => {
-                         '1_6' => 1,
-                         '1_3' => 1,
-                         '1_2' => 1
-                       }
-        }, 'Correct of sample names to genes is correct');
+ok(
+    $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
+        spreadsheet => 't/data/core_group_statistics.csv',
+    ),
+    'initalise obj'
+);
+is_deeply( $obj->ordered_core_genes, [ 'argF', 'speH', 'group_5' ], 'Correct ordering' );
+is_deeply(
+    $obj->sample_names_to_genes,
+    {
+        'query_2' => {
+            '2_3' => 1,
+            '2_7' => 1,
+            '2_2' => 1
+        },
+        'query_1' => {
+            '1_6' => 1,
+            '1_3' => 1,
+            '1_2' => 1
+        }
+    },
+    'Correct of sample names to genes is correct'
+);
+
+ok(
+    $obj = Bio::Roary::ExtractCoreGenesFromSpreadsheet->new(
+        spreadsheet    => 't/data/core_group_statistics.csv',
+        allow_paralogs => 1,
+    ),
+    'initalise obj where paralogs allowed'
+);
+is_deeply( $obj->ordered_core_genes, [ 'argF', 'hly', 'speH', 'group_5' ], 'Correct ordering where paralogs allowed' );
+
+is_deeply(
+    $obj->sample_names_to_genes,
+    {
+        'query_2' => {
+            '2_3' => 1,
+            '2_7' => 1,
+            '2_1' => 1,
+            '2_2' => 1
+        },
+        'query_1' => {
+            '1_6' => 1,
+            '1_3' => 1,
+            '1_1' => 1,
+            '1_2' => 1
+        }
+    },
+    'Correct of sample names to genes is correct where paralogs allowed'
+);
 
 done_testing();

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/debian-med/roary.git