[Reproducible-builds] [strip-nondeterminism] 01/01: Completely reorganize and overhaul

Andrew Ayer agwa at andrewayer.name
Mon Sep 8 01:15:56 UTC 2014


This is an automated email from the git hooks/post-receive script.

agwa-guest pushed a commit to branch master
in repository strip-nondeterminism.

commit ad9d61af15035e719d7a0223afcaa5fa25f0c815
Author: Andrew Ayer <agwa at andrewayer.name>
Date:   Sun Sep 7 17:40:37 2014 -0700

    Completely reorganize and overhaul
    
    strip-nondeterminism is now a standalone command, and handlers are in
    Perl modules.  debhelper will depend on strip-nondeterminism.
---
 .gitignore                               |  10 ++
 MANIFEST                                 |  11 ++
 MANIFEST.SKIP                            |  21 +++
 Makefile.PL                              |  20 +++
 TODO                                     |   6 +-
 bin/dh_strip_nondeterminism              |  85 +++++++++++
 bin/strip-nondeterminism                 | 110 ++++++++++++++
 dh_strip_nondeterminism                  | 248 -------------------------------
 handlers/README                          |   3 -
 handlers/gzip                            | 104 -------------
 handlers/zip                             |  12 --
 lib/StripNondeterminism.pm               |  76 ++++++++++
 lib/StripNondeterminism/handlers/ar.pm   |  86 +++++++++++
 lib/StripNondeterminism/handlers/gzip.pm | 122 +++++++++++++++
 lib/StripNondeterminism/handlers/jar.pm  |  43 ++++++
 lib/StripNondeterminism/handlers/zip.pm  |  45 ++++++
 16 files changed, 632 insertions(+), 370 deletions(-)

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..61bb845
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,10 @@
+# Makemaker generated files and dirs.
+/Makefile
+/blib
+/MakeMaker-*
+/pm_to_blib
+/MYMETA.yml
+
+# Temp and old files.
+*~
+*.old
diff --git a/MANIFEST b/MANIFEST
new file mode 100644
index 0000000..c0c361b
--- /dev/null
+++ b/MANIFEST
@@ -0,0 +1,11 @@
+COPYING
+MANIFEST			This list of files
+Makefile.PL
+README
+bin/dh_strip_nondeterminism
+bin/strip-nondeterminism
+lib/StripNondeterminism.pm
+lib/StripNondeterminism/handlers/ar.pm
+lib/StripNondeterminism/handlers/gzip.pm
+lib/StripNondeterminism/handlers/jar.pm
+lib/StripNondeterminism/handlers/zip.pm
diff --git a/MANIFEST.SKIP b/MANIFEST.SKIP
new file mode 100644
index 0000000..93dfd07
--- /dev/null
+++ b/MANIFEST.SKIP
@@ -0,0 +1,21 @@
+# Version control files and dirs.
+\B\.git\b
+
+# Makemaker generated files and dirs.
+^MANIFEST\.
+^Makefile$
+^Makefile.old$
+^blib/
+^MakeMaker-\d
+^pm_to_blib
+
+# Temp, old and emacs backup files.
+~$
+\.old$
+\.swp$
+^#.*#$
+^\.#
+
+# Odds and ends
+\bLEFTOFF$
+\bTODO$
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..affb564
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,20 @@
+use strict;
+use warnings;
+use ExtUtils::MakeMaker;
+
+my $MMVER = eval $ExtUtils::MakeMaker::VERSION;
+
+WriteMakefile(
+	NAME		=> 'strip-nondeterminism',
+	AUTHOR		=> 'Andrew Ayer <agwa at andrewayer.name>',
+	VERSION_FROM	=> 'lib/StripNondeterminism.pm',
+	ABSTRACT	=> 'tool for stripping non-determinism from files',
+	EXE_FILES	=> [qw(bin/dh_strip_nondeterminism bin/strip-nondeterminism)],
+	PREREQ_PM => {
+		'Archive::Zip'	=> 0,
+		'Getopt::Long'	=> 0,
+	},
+	LICENSE		=> "gpl",
+	dist		=> { COMPRESS => 'gzip -9nf', SUFFIX => 'gz', },
+	clean		=> { FILES => 'StripNondeterminism-*' },
+);
diff --git a/TODO b/TODO
index 275cb94..8020fc4 100644
--- a/TODO
+++ b/TODO
@@ -1,10 +1,10 @@
 Write handlers for:
-	gzip
+	[DONE] gzip
 	javadocs
-	ar
+	[DONE] ar
 	...
 
-Write main strip-nondeterminism command
+[DONE] Write main strip-nondeterminism command
 	It would just parse command line arguments and then execute the correct handler
 
 Write dh_strip_nondeterminism
diff --git a/bin/dh_strip_nondeterminism b/bin/dh_strip_nondeterminism
new file mode 100755
index 0000000..4adbf93
--- /dev/null
+++ b/bin/dh_strip_nondeterminism
@@ -0,0 +1,85 @@
+#!/usr/bin/perl -w
+
+=head1 NAME
+
+dh_strip_nondeterminism - strip uninteresting, non-deterministic information from files
+
+=cut
+
+use strict;
+use File::Find;
+use Debian::Debhelper::Dh_Lib;
+use StripNondeterminism;
+
+=head1 SYNOPSIS
+
+B<dh_strip_nondeterminism> [S<I<debhelper options>>] [B<-X>I<item>]
+
+=head1 DESCRIPTION
+
+B<dh_strip_nondeterminism> is a debhelper program that is responsible
+for stripping uninteresting, non-deterministic information, such as
+timestamps, from compiled files so that the build is reproducible.
+
+This program examines your package build directories and works out what
+to strip on its own. It uses L<file(1)> and filenames to figure out what
+files should have non-determinism stripped from them.  In general it
+seems to make very good guesses, and will do the right thing in almost
+all cases.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-X>I<item>, B<--exclude=>I<item>
+
+Exclude files that contain I<item> anywhere in their filename from being
+stripped. You may use this option multiple times to build up a list of
+things to exclude.
+
+=back
+
+=cut
+
+init();
+
+my @nondeterministic_files;
+sub testfile {
+	return if -l $_ or -d $_; # Skip directories and symlinks always.
+
+	# See if we were asked to exclude this file.
+	# Note that we have to test on the full filename, including directory.
+	my $fn="$File::Find::dir/$_";
+	foreach my $f (@{$dh{EXCLUDE}}) {
+		return if ($fn=~m/\Q$f\E/);
+	}
+
+	my $normalizer = StripNondeterminism::get_normalizer_for_file($_);
+	if ($normalizer) {
+		push @nondeterministic_files, [$fn, $normalizer];
+	}
+}
+
+foreach my $package (@{$dh{DOPACKAGES}}) {
+	my $tmp=tmpdir($package);
+
+	@nondeterministic_files=();
+	find(\&testfile,$tmp);
+
+	foreach (@nondeterministic_files) {
+		my ($path, $normalize) = @$_;
+		$normalize->($path);
+	}
+}
+
+=head1 SEE ALSO
+
+L<debhelper(7)>
+
+This program is a part of debhelper.
+
+=head1 AUTHOR
+
+Andrew Ayer <agwa at andrewayer.name>
+
+=cut
diff --git a/bin/strip-nondeterminism b/bin/strip-nondeterminism
new file mode 100755
index 0000000..38c68ca
--- /dev/null
+++ b/bin/strip-nondeterminism
@@ -0,0 +1,110 @@
+#!/usr/bin/perl
+
+#
+# Copyright 2014 Andrew Ayer
+#
+# This file is part of strip-nondeterminism.
+#
+# strip-nondeterminism is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# strip-nondeterminism is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with strip-nondeterminism.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+use strict;
+use warnings;
+
+use StripNondeterminism;
+use Getopt::Long;
+Getopt::Long::Configure(qw(no_ignore_case permute bundling));
+
+my $cmd = $0;
+$cmd =~ s/.*\///;
+my $usage = "Usage: $cmd [-t|--type FILETYPE] FILENAME\n";
+
+my @opspec = (
+	'type|t=s', 'help|h', 'version|V',
+);
+my $glop = {};
+GetOptions($glop, @opspec) || die $usage;
+
+if ($glop->{help}) {
+	print $usage;
+	exit 0;
+}
+
+if ($glop->{version}) {
+	print "$cmd version $StripNondeterminism::VERSION\n";
+	exit 0;
+}
+
+die $usage if @ARGV == 0;
+
+for my $filename (@ARGV) {
+	my $normalizer;
+	if ($glop->{type}) {
+		$normalizer = StripNondeterminism::get_normalizer_by_name($glop->{type});
+		die $glop->{type} . ": Unknown file type\n" unless $normalizer;
+	} else {
+		$normalizer = StripNondeterminism::get_normalizer_for_file($filename);
+		next unless $normalizer;
+	}
+
+	$normalizer->($filename);
+}
+
+__END__
+
+=head1 NAME
+
+strip-nondeterminism - strip non-deterministic information from files
+
+=head1 SYNOPSIS
+
+ strip-nondeterminism [-t filetype] filename ...
+
+=head1 DESCRIPTION
+
+B<strip-nondeterminism> is a tool to strip bits of non-deterministic
+information, such as timestamps, from files.  It can be used as
+a post-processing step to make a build reproducible, when the build
+process itself cannot be made deterministic.
+
+=head1 OPTIONS
+
+=over 4
+
+=item B<-t> I<filetype>, B<--type> I<filetype>
+
+Use the normalizer for the given file type (ar, gzip, jar, zip).  If this
+option is not specified, the file type is detected automatically based on
+the file name extension.
+
+=item B<-h>, B<--help>
+
+Display this help message.
+
+=item B<-V>, B<--version>
+
+Print only the version string and then quit.
+
+=back
+
+=head1 AUTHOR
+
+Andrew Ayer
+
+=head1 COPYRIGHT
+
+strip-nondeterminism is free software.  You can redistribute it and/or
+modify it under the terms of the GNU General Public License, version 3.
+
+=cut
diff --git a/dh_strip_nondeterminism b/dh_strip_nondeterminism
deleted file mode 100755
index 501b1a4..0000000
--- a/dh_strip_nondeterminism
+++ /dev/null
@@ -1,248 +0,0 @@
-#!/usr/bin/perl -w
-
-=head1 NAME
-
-dh_strip_nondeterminism - strip uninteresting, non-deterministic information from files
-
-=cut
-
-use strict;
-use File::Find;
-use Debian::Debhelper::Dh_Lib;
-
-=head1 SYNOPSIS
-
-B<dh_strip_nondeterminism> [S<I<debhelper options>>] [B<-X>I<item>]
-
-=head1 DESCRIPTION
-
-B<dh_strip_nondeterminism> is a debhelper program that is responsible
-for stripping uninteresting, non-deterministic information, such as
-timestamps, from compiled files so that the build is reproducible.
-
-This program examines your package build directories and works out what
-to strip on its own. It uses L<file(1)> and filenames to figure out what
-files should have non-determinism stripped from them.  In general it
-seems to make very good guesses, and will do the right thing in almost
-all cases.
-
-=head1 OPTIONS
-
-=over 4
-
-=item B<-X>I<item>, B<--exclude=>I<item>
-
-Exclude files that contain I<item> anywhere in their filename from being
-stripped. You may use this option multiple times to build up a list of
-things to exclude.
-
-=back
-
-=cut
-
-init();
-
-# I could just use `file $_[0]`, but this is safer
-sub get_file_type {
-	my $file=shift;
-	open (FILE, '-|') # handle all filenames safely
-		|| exec('file', $file)
-		|| die "can't exec file: $!";
-	my $type=<FILE>;
-	close FILE;
-	return $type;
-}
-
-my @nondeterministic_files;
-sub testfile {
-	return if -l $_ or -d $_; # Skip directories and symlinks always.
-
-	# See if we were asked to exclude this file.
-	# Note that we have to test on the full filename, including directory.
-	my $fn="$File::Find::dir/$_";
-	foreach my $f (@{$dh{EXCLUDE}}) {
-		return if ($fn=~m/\Q$f\E/);
-	}
-
-	# gzip
-	if (m/\.gz$/) {
-		# No need for get_file_type b/c the gzip normalizer
-		# silently ignores non-gzip files
-		push @nondeterministic_files, [$fn, \&handlers::gzip::normalize];
-	}
-	# zip
-	if (m/\.zip$/ && get_file_type($_) =~ m/Zip archive data/) {
-		push @nondeterministic_files, [$fn, \&handlers::zip::normalize];
-	}
-	# jar
-	if (m/\.jar$/ && get_file_type($_) =~ m/Zip archive data/) {
-		push @nondeterministic_files, [$fn, \&handlers::jar::normalize];
-	}
-}
-
-foreach my $package (@{$dh{DOPACKAGES}}) {
-	my $tmp=tmpdir($package);
-
-	@nondeterministic_files=();
-	find(\&testfile,$tmp);
-
-	foreach (@nondeterministic_files) {
-		my ($path, $normalize) = @$_;
-		$normalize->($path);
-	}
-}
-
-package handlers::gzip;
-
-use Debian::Debhelper::Dh_Lib;
-use File::Temp qw/tempfile/;
-
-use constant {
-	FTEXT    => 1 << 0,
-	FHCRC    => 1 << 1,
-	FEXTRA   => 1 << 2,
-	FNAME    => 1 << 3,
-	FCOMMENT => 1 << 4,
-};
-
-sub normalize {
-	my ($filename) = @_;
-
-	open(my $fh, '<', $filename) or error "Unable to open $filename for reading: $!";
-	my ($out_fh, $out_filename) = tempfile(DIR => dirname($filename), UNLINK => 1);
-
-	# See RFC 1952
-
-	# 0   1   2   3   4   5   6   7   8   9   10
-	# +---+---+---+---+---+---+---+---+---+---+
-	# |ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
-	# +---+---+---+---+---+---+---+---+---+---+
-
-	# Read the current header
-	my $hdr;
-	my $bytes_read = read($fh, $hdr, 10);
-	return unless $bytes_read == 10;
-	my ($id1, $id2, $cm, $flg, $mtime, $xfl, $os) = unpack('CCCCl<CC', $hdr);
-	return unless $id1 == 31 and $id2 == 139;
-
-	my $new_flg = $flg;
-	$new_flg &= ~FNAME;	# Don't include filename
-	$new_flg &= ~FHCRC;	# Don't include header CRC (not all implementations support it)
-	$mtime = 0;		# Zero out mtime (this is what `gzip -n` does)
-	# TODO: question: normalize some of the other fields, such as OS?
-
-	# Write a new header
-	print $out_fh pack('CCCCl<CC', $id1, $id2, $cm, $new_flg, $mtime, $xfl, $os);
-
-	if ($flg & FEXTRA) {	# Copy through
-		# 0   1   2
-		# +---+---+=================================+
-		# | XLEN  |...XLEN bytes of "extra field"...|
-		# +---+---+=================================+
-		my $buf;
-		read($fh, $buf, 2) == 2 or error "$filename: Malformed gzip file";
-		my ($xlen) = unpack('v', $buf);
-		read($fh, $buf, $xlen) == $xlen or error "$filename: Malformed gzip file";
-		print $out_fh pack('vA*', $xlen, $buf);
-	}
-	if ($flg & FNAME) {	# Read but do not copy through
-		# 0
-		# +=========================================+
-		# |...original file name, zero-terminated...|
-		# +=========================================+
-		while (1) {
-			my $buf;
-			read($fh, $buf, 1) == 1 or error "$filename: Malformed gzip file";
-			last if ord($buf) == 0;
-		}
-	}
-	if ($flg & FCOMMENT) {	# Copy through
-		# 0
-		# +===================================+
-		# |...file comment, zero-terminated...|
-		# +===================================+
-		while (1) {
-			my $buf;
-			read($fh, $buf, 1) == 1 or error "$filename: Malformed gzip file";
-			print $out_fh $buf;
-			last if ord($buf) == 0;
-		}
-	}
-	if ($flg & FHCRC) {	# Read but do not copy through
-		# 0   1   2
-		# +---+---+
-		# | CRC16 |
-		# +---+---+
-		my $buf;
-		read($fh, $buf, 2) == 2 or error "$filename: Malformed gzip file";
-	}
-
-	# Copy through the rest of the file.
-	# TODO: also normalize concatenated gzip files.  This will require reading and understanding
-	# each DEFLATE block (see RFC 1951), since gzip doesn't include lengths anywhere.
-	while (1) {
-		my $buf;
-		my $bytes_read = read($fh, $buf, 4096);
-		defined($bytes_read) or error "$filename: read failed: $!";
-		print $out_fh $buf;
-		last if $bytes_read == 0;
-	}
-
-	chmod((stat($fh))[2] & 07777, $out_filename);
-	rename($out_filename, $filename) or error "$filename: unable to overwrite: rename: $!";
-}
-
-package handlers::zip;
-
-use Debian::Debhelper::Dh_Lib;
-use Archive::Zip;
-
-Archive::Zip::setErrorHandler(\&error);
-
-# A magic number from Archive::Zip for the earliest timestamp that
-# can be represented by a Zip file.  From the Archive::Zip source:
-# "Note, this isn't exactly UTC 1980, it's 1980 + 12 hours and 1
-# minute so that nothing timezoney can muck us up."
-use constant SAFE_EPOCH => 315576060;
-
-sub normalize {
-	my ($zip_filename, $filename_cmp) = @_;
-	$filename_cmp ||= sub { $a cmp $b };
-	my $zip = Archive::Zip->new($zip_filename);
-	my @filenames = sort $filename_cmp $zip->memberNames();
-	for my $filename (@filenames) {
-		my $member = $zip->removeMember($filename);
-		$zip->addMember($member);
-		$member->setLastModFileDateTimeFromUnix(SAFE_EPOCH);
-	}
-	$zip->overwrite();
-}
-
-package handlers::jar;
-
-sub _jar_filename_cmp {
-	# META-INF/ and META-INF/MANIFEST.MF are expected to be the first entries in the Zip archive.
-	return 0 if $a eq $b;
-	for (qw{META-INF/ META-INF/MANIFEST.MF}) {
-		return -1 if $a eq $_;
-		return  1 if $b eq $_;
-	}
-	return $a cmp $b;
-}
-
-sub normalize {
-	my ($jar_filename) = @_;
-	handlers::zip::normalize($jar_filename, \&_jar_filename_cmp);
-}
-
-=head1 SEE ALSO
-
-L<debhelper(7)>
-
-This program is a part of debhelper.
-
-=head1 AUTHOR
-
-Andrew Ayer <agwa at andrewayer.name>
-
-=cut
diff --git a/handlers/README b/handlers/README
deleted file mode 100644
index be83133..0000000
--- a/handlers/README
+++ /dev/null
@@ -1,3 +0,0 @@
-A handler reads a file of a particular format from stdin, strips the
-non-determinism from it, and writes the result to stdout.  A handler
-can be written in any language.
diff --git a/handlers/gzip b/handlers/gzip
deleted file mode 100755
index 79134b1..0000000
--- a/handlers/gzip
+++ /dev/null
@@ -1,104 +0,0 @@
-#!/usr/bin/perl
-
-# Copyright 2014 Andrew Ayer
-#
-# This program is free software: you can redistribute it and/or modify
-# it under the terms of the GNU General Public License as published by
-# the Free Software Foundation, either version 3 of the License, or
-# (at your option) any later version.
-#
-# This program is distributed in the hope that it will be useful,
-# but WITHOUT ANY WARRANTY; without even the implied warranty of
-# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-# GNU General Public License for more details.
-#
-# You should have received a copy of the GNU General Public License
-# along with this program.  If not, see <http://www.gnu.org/licenses/>.
-
-use strict;
-use warnings;
-
-use constant {
-	FTEXT    => 1 << 0,
-	FHCRC    => 1 << 1,
-	FEXTRA   => 1 << 2,
-	FNAME    => 1 << 3,
-	FCOMMENT => 1 << 4,
-};
-
-# See RFC 1952
-
-# 0   1   2   3   4   5   6   7   8   9   10
-# +---+---+---+---+---+---+---+---+---+---+
-# |ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
-# +---+---+---+---+---+---+---+---+---+---+
-
-# Read the current header
-my $hdr;
-my $bytes_read = read(*STDIN, $hdr, 10);
-die "Not a gzip file" unless $bytes_read == 10;
-my ($id1, $id2, $cm, $flg, $mtime, $xfl, $os) = unpack('CCCCl<CC', $hdr);
-die "Not a gzip file" unless $id1 == 31 and $id2 == 139;
-
-my $new_flg = $flg;
-$new_flg &= ~FNAME;	# Don't include filename
-$new_flg &= ~FHCRC;	# Don't include header CRC (not all implementations support it)
-$mtime = 0;		# Zero out mtime (this is what `gzip -n` does)
-# TODO: question: normalize some of the other fields, such as OS?
-
-# Write a new header
-print pack('CCCCl<CC', $id1, $id2, $cm, $new_flg, $mtime, $xfl, $os);
-
-if ($flg & FEXTRA) {	# Copy through
-	# 0   1   2
-	# +---+---+=================================+
-	# | XLEN  |...XLEN bytes of "extra field"...|
-	# +---+---+=================================+
-	my $buf;
-	read(*STDIN, $buf, 2) == 2 or die "Malformed gzip file";
-	my ($xlen) = unpack('v', $buf);
-	read(*STDIN, $buf, $xlen) == $xlen or die "Malformed gzip file";
-	print pack('vA*', $xlen, $buf);
-}
-if ($flg & FNAME) {	# Read but do not copy through
-	# 0
-	# +=========================================+
-	# |...original file name, zero-terminated...|
-	# +=========================================+
-	while (1) {
-		my $buf;
-		read(*STDIN, $buf, 1) == 1 or die "Malformed gzip file";
-		last if ord($buf) == 0;
-	}
-}
-if ($flg & FCOMMENT) {	# Copy through
-	# 0
-	# +===================================+
-	# |...file comment, zero-terminated...|
-	# +===================================+
-	while (1) {
-		my $buf;
-		read(*STDIN, $buf, 1) == 1 or die "Malformed gzip file";
-		print $buf;
-		last if ord($buf) == 0;
-	}
-}
-if ($flg & FHCRC) {	# Read but do not copy through
-	# 0   1   2
-	# +---+---+
-	# | CRC16 |
-	# +---+---+
-	my $buf;
-	read(*STDIN, $buf, 2) == 2 or die "Not a gzip file";
-}
-
-# Copy through the rest of the file.
-# TODO: support concatenated gzip files.  This will require reading and understanding
-# each DEFLATE block (see RFC 1951), since gzip doesn't include lengths anywhere.
-while (1) {
-	my $buf;
-	my $bytes_read = read(*STDIN, $buf, 4096);
-	defined($bytes_read) or die "read failed: $!";
-	print $buf;
-	last if $bytes_read == 0;
-}
diff --git a/handlers/zip b/handlers/zip
deleted file mode 100755
index 7a1dba0..0000000
--- a/handlers/zip
+++ /dev/null
@@ -1,12 +0,0 @@
-#!/usr/bin/python
-
-from zipfile import ZipFile
-import sys
-
-with ZipFile(sys.stdout, 'w') as outzip:
-  with ZipFile(sys.stdin, 'r') as inzip:
-    for info in sorted(inzip.infolist(), key=lambda info: info.filename):
-      # 1980-01-01 is the earliest date that the zip format supports
-      info.date_time = (1980, 1, 1, 0, 0, 0)
-      content = inzip.read(info.filename)
-      outzip.writestr(info, content)
diff --git a/lib/StripNondeterminism.pm b/lib/StripNondeterminism.pm
new file mode 100644
index 0000000..b01d673
--- /dev/null
+++ b/lib/StripNondeterminism.pm
@@ -0,0 +1,76 @@
+#
+# Copyright 2014 Andrew Ayer
+#
+# This file is part of strip-nondeterminism.
+#
+# strip-nondeterminism is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# strip-nondeterminism is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with strip-nondeterminism.  If not, see <http://www.gnu.org/licenses/>.
+#
+package StripNondeterminism;
+
+use strict;
+use warnings;
+
+use StripNondeterminism::handlers::ar;
+use StripNondeterminism::handlers::gzip;
+use StripNondeterminism::handlers::jar;
+use StripNondeterminism::handlers::zip;
+
+our($VERSION);
+
+$VERSION = '0.001'; # 0.001
+
+sub _get_file_type {
+	my $file=shift;
+	open (FILE, '-|') # handle all filenames safely
+		|| exec('file', $file)
+		|| die "can't exec file: $!";
+	my $type=<FILE>;
+	close FILE;
+	return $type;
+}
+
+sub get_normalizer_for_file {
+	$_ = shift;
+
+	return undef if -d $_; # Skip directories
+
+	# ar
+	if (m/\.a$/ && _get_file_type($_) =~ m/ar archive/) {
+		return \&StripNondeterminism::handlers::ar::normalize;
+	}
+	# gzip
+	if (m/\.gz$/ && _get_file_type($_) =~ m/gzip compressed data/) {
+		return \&StripNondeterminism::handlers::gzip::normalize;
+	}
+	# jar
+	if (m/\.jar$/ && _get_file_type($_) =~ m/Zip archive data/) {
+		return \&StripNondeterminism::handlers::jar::normalize;
+	}
+	# zip
+	if (m/\.zip$/ && _get_file_type($_) =~ m/Zip archive data/) {
+		return \&StripNondeterminism::handlers::zip::normalize;
+	}
+	return undef;
+}
+
+sub get_normalizer_by_name {
+	$_ = shift;
+	return \&StripNondeterminism::handlers::ar::normalize if $_ eq 'ar';
+	return \&StripNondeterminism::handlers::gzip::normalize if $_ eq 'gzip';
+	return \&StripNondeterminism::handlers::jar::normalize if $_ eq 'jar';
+	return \&StripNondeterminism::handlers::zip::normalize if $_ eq 'zip';
+	return undef;
+}
+
+1;
diff --git a/lib/StripNondeterminism/handlers/ar.pm b/lib/StripNondeterminism/handlers/ar.pm
new file mode 100644
index 0000000..02d1b99
--- /dev/null
+++ b/lib/StripNondeterminism/handlers/ar.pm
@@ -0,0 +1,86 @@
+# Copyright © 2014 Jérémy Bobbio <lunar at debian.org>
+# Copyright © 2014 Niko Tyni <ntyni at debian.org>
+#
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful, but
+# WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+# Some code borrowed from ArFile
+# Copyright (C) 2007    Stefano Zacchiroli  <zack at debian.org>
+# Copyright (C) 2007    Filippo Giunchedi   <filippo at debian.org>
+
+package StripNondeterminism::handlers::ar;
+
+use strict;
+use warnings;
+
+use Fcntl q/SEEK_SET/;
+
+sub normalize {
+	my ($file) = @_;
+
+	my $GLOBAL_HEADER = "!<arch>\n";
+	my $GLOBAL_HEADER_LENGTH = length $GLOBAL_HEADER;
+
+	my $FILE_HEADER_LENGTH = 60;
+	my $FILE_MAGIC = "`\n";
+
+	my $buf;
+
+	open(my $fh, '+<', $file)
+	    or die("failed to open $file for read+write: $!");
+
+	read $fh, $buf, $GLOBAL_HEADER_LENGTH;
+	return if $buf ne $GLOBAL_HEADER;
+
+	while (1) {
+		my $file_header_start = tell $fh;
+		my $count = read $fh, $buf, $FILE_HEADER_LENGTH;
+		die "reading $file failed: $!" if !defined $count;
+		last if $count == 0;
+
+		# http://en.wikipedia.org/wiki/Ar_(Unix)    
+		#from   to     Name                      Format
+		#0      15     File name                 ASCII
+		#16     27     File modification date    Decimal
+		#28     33     Owner ID                  Decimal
+		#34     39     Group ID                  Decimal
+		#40     47     File mode                 Octal
+		#48     57     File size in bytes        Decimal
+		#58     59     File magic                \140\012
+
+		# FIXME: is this correct?
+		last if $count == 1 and eof($fh) and $buf eq "\n";
+
+		die "Incorrect header length"
+		if length $buf != $FILE_HEADER_LENGTH;
+		die "Incorrect file magic"
+		if substr($buf, 58, length($FILE_MAGIC)) ne $FILE_MAGIC;
+
+		my $file_size = substr($buf, 48, 10);
+		seek $fh, $file_header_start + 16, SEEK_SET;
+
+		# mtime
+		syswrite $fh, sprintf("%-12d", 0);
+		# owner
+		syswrite $fh, sprintf("%-6d", 0);
+		# group
+		syswrite $fh, sprintf("%-6d", 0);
+		# file mode
+		syswrite $fh, sprintf("%-8o", 0644);
+
+		# move to next member
+		seek $fh, $file_header_start + $FILE_HEADER_LENGTH + $file_size, SEEK_SET;
+	}
+}
+
+1;
diff --git a/lib/StripNondeterminism/handlers/gzip.pm b/lib/StripNondeterminism/handlers/gzip.pm
new file mode 100644
index 0000000..daa82d2
--- /dev/null
+++ b/lib/StripNondeterminism/handlers/gzip.pm
@@ -0,0 +1,122 @@
+#
+# Copyright 2014 Andrew Ayer
+#
+# This file is part of strip-nondeterminism.
+#
+# strip-nondeterminism is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# strip-nondeterminism is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with strip-nondeterminism.  If not, see <http://www.gnu.org/licenses/>.
+#
+package StripNondeterminism::handlers::gzip;
+
+use strict;
+use warnings;
+
+use File::Temp qw/tempfile/;
+use File::Basename;
+
+use constant {
+	FTEXT    => 1 << 0,
+	FHCRC    => 1 << 1,
+	FEXTRA   => 1 << 2,
+	FNAME    => 1 << 3,
+	FCOMMENT => 1 << 4,
+};
+
+sub normalize {
+	my ($filename) = @_;
+
+	open(my $fh, '<', $filename) or die "Unable to open $filename for reading: $!";
+	my ($out_fh, $out_filename) = tempfile(DIR => dirname($filename), UNLINK => 1);
+
+	# See RFC 1952
+
+	# 0   1   2   3   4   5   6   7   8   9   10
+	# +---+---+---+---+---+---+---+---+---+---+
+	# |ID1|ID2|CM |FLG|     MTIME     |XFL|OS |
+	# +---+---+---+---+---+---+---+---+---+---+
+
+	# Read the current header
+	my $hdr;
+	my $bytes_read = read($fh, $hdr, 10);
+	return unless $bytes_read == 10;
+	my ($id1, $id2, $cm, $flg, $mtime, $xfl, $os) = unpack('CCCCl<CC', $hdr);
+	return unless $id1 == 31 and $id2 == 139;
+
+	my $new_flg = $flg;
+	$new_flg &= ~FNAME;	# Don't include filename
+	$new_flg &= ~FHCRC;	# Don't include header CRC (not all implementations support it)
+	$mtime = 0;		# Zero out mtime (this is what `gzip -n` does)
+	# TODO: question: normalize some of the other fields, such as OS?
+
+	# Write a new header
+	print $out_fh pack('CCCCl<CC', $id1, $id2, $cm, $new_flg, $mtime, $xfl, $os);
+
+	if ($flg & FEXTRA) {	# Copy through
+		# 0   1   2
+		# +---+---+=================================+
+		# | XLEN  |...XLEN bytes of "extra field"...|
+		# +---+---+=================================+
+		my $buf;
+		read($fh, $buf, 2) == 2 or die "$filename: Malformed gzip file";
+		my ($xlen) = unpack('v', $buf);
+		read($fh, $buf, $xlen) == $xlen or die "$filename: Malformed gzip file";
+		print $out_fh pack('vA*', $xlen, $buf);
+	}
+	if ($flg & FNAME) {	# Read but do not copy through
+		# 0
+		# +=========================================+
+		# |...original file name, zero-terminated...|
+		# +=========================================+
+		while (1) {
+			my $buf;
+			read($fh, $buf, 1) == 1 or die "$filename: Malformed gzip file";
+			last if ord($buf) == 0;
+		}
+	}
+	if ($flg & FCOMMENT) {	# Copy through
+		# 0
+		# +===================================+
+		# |...file comment, zero-terminated...|
+		# +===================================+
+		while (1) {
+			my $buf;
+			read($fh, $buf, 1) == 1 or die "$filename: Malformed gzip file";
+			print $out_fh $buf;
+			last if ord($buf) == 0;
+		}
+	}
+	if ($flg & FHCRC) {	# Read but do not copy through
+		# 0   1   2
+		# +---+---+
+		# | CRC16 |
+		# +---+---+
+		my $buf;
+		read($fh, $buf, 2) == 2 or die "$filename: Malformed gzip file";
+	}
+
+	# Copy through the rest of the file.
+	# TODO: also normalize concatenated gzip files.  This will require reading and understanding
+	# each DEFLATE block (see RFC 1951), since gzip doesn't include lengths anywhere.
+	while (1) {
+		my $buf;
+		my $bytes_read = read($fh, $buf, 4096);
+		defined($bytes_read) or die "$filename: read failed: $!";
+		print $out_fh $buf;
+		last if $bytes_read == 0;
+	}
+
+	chmod((stat($fh))[2] & 07777, $out_filename);
+	rename($out_filename, $filename) or die "$filename: unable to overwrite: rename: $!";
+}
+
+1;
diff --git a/lib/StripNondeterminism/handlers/jar.pm b/lib/StripNondeterminism/handlers/jar.pm
new file mode 100644
index 0000000..6be7071
--- /dev/null
+++ b/lib/StripNondeterminism/handlers/jar.pm
@@ -0,0 +1,43 @@
+#
+# Copyright 2014 Andrew Ayer
+#
+# This file is part of strip-nondeterminism.
+#
+# strip-nondeterminism is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# strip-nondeterminism is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with strip-nondeterminism.  If not, see <http://www.gnu.org/licenses/>.
+#
+package StripNondeterminism::handlers::jar;
+
+use strict;
+use warnings;
+
+use Archive::Zip;
+use StripNondeterminism::handlers::zip;
+
+sub _jar_filename_cmp ($$) {
+	my ($a, $b) = @_;
+	# META-INF/ and META-INF/MANIFEST.MF are expected to be the first entries in the Zip archive.
+	return 0 if $a eq $b;
+	for (qw{META-INF/ META-INF/MANIFEST.MF}) {
+		return -1 if $a eq $_;
+		return  1 if $b eq $_;
+	}
+	return $a cmp $b;
+}
+
+sub normalize {
+	my ($jar_filename) = @_;
+	StripNondeterminism::handlers::zip::normalize($jar_filename, \&_jar_filename_cmp);
+}
+
+1;
diff --git a/lib/StripNondeterminism/handlers/zip.pm b/lib/StripNondeterminism/handlers/zip.pm
new file mode 100644
index 0000000..e19c7fc
--- /dev/null
+++ b/lib/StripNondeterminism/handlers/zip.pm
@@ -0,0 +1,45 @@
+#
+# Copyright 2014 Andrew Ayer
+#
+# This file is part of strip-nondeterminism.
+#
+# strip-nondeterminism is free software: you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+#
+# strip-nondeterminism is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with strip-nondeterminism.  If not, see <http://www.gnu.org/licenses/>.
+#
+package StripNondeterminism::handlers::zip;
+
+use strict;
+use warnings;
+
+use Archive::Zip;
+
+# A magic number from Archive::Zip for the earliest timestamp that
+# can be represented by a Zip file.  From the Archive::Zip source:
+# "Note, this isn't exactly UTC 1980, it's 1980 + 12 hours and 1
+# minute so that nothing timezoney can muck us up."
+use constant SAFE_EPOCH => 315576060;
+
+sub normalize {
+	my ($zip_filename, $filename_cmp) = @_;
+	$filename_cmp ||= sub { $a cmp $b };
+	my $zip = Archive::Zip->new($zip_filename);
+	my @filenames = sort $filename_cmp $zip->memberNames();
+	for my $filename (@filenames) {
+		my $member = $zip->removeMember($filename);
+		$zip->addMember($member);
+		$member->setLastModFileDateTimeFromUnix(SAFE_EPOCH);
+	}
+	$zip->overwrite();
+}
+
+1;

-- 
Alioth's /usr/local/bin/git-commit-notice on /srv/git.debian.org/git/reproducible/strip-nondeterminism.git



More information about the Reproducible-builds mailing list