[med-svn] r2403 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Sun Aug 10 23:36:05 UTC 2008
Author: tille
Date: 2008-08-10 23:36:05 +0000 (Sun, 10 Aug 2008)
New Revision: 2403
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Try to include alioth lists. Not working now and some Debugging output added.
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-08-10 01:25:03 UTC (rev 2402)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-08-10 23:36:05 UTC (rev 2403)
@@ -10,8 +10,13 @@
my @PROJECTS = ('med', 'edu', 'jr', 'accessibility', 'desktop', 'enterprise', 'lex',
'nonprofit', 'science', 'custom',
'i18n', 'devel', 'project') ; # ... just for the sake of interest
+
+## DEBUG
+ at PROJECTS = ('enterprise'); # Just find a very short list while testing Alioth ...
+
# Well, there is also interest in alioth lists ...
-my @ALIOTHPRJ= ('debichem-devel', 'pkg-grass-general') ;
+my $BASEALIOTH = 'http://lists.alioth.debian.org/pipermail/';
+my @ALIOTHPRJ = ('debichem-devel', 'pkg-grass-general') ;
## http://lists.alioth.debian.org/pipermail/debichem-devel/2008-August/thread.html
## http://lists.alioth.debian.org/pipermail/pkg-grass-general/2008-July/thread.html
@@ -42,7 +47,7 @@
# if != 0 then extract of mailing list archives is stored in files in dirs
# The prefered method is to use only the database
-my $storefiles = 0;
+my $storefiles = 1; # Just store the files again for debugging issues of alioth lists
# Debian-Devel starts in 1995
my $YEARSTART = 1995;
@@ -65,7 +70,25 @@
my $datain = $dbh->prepare_cached($insert);
my ( $robot, $robotflag );
+my %ALLPROJECTS;
+
foreach $project (@PROJECTS) {
+ $ALLPROJECTS{$project} = { 'url' => "${BASEURL}-${project}",
+ 'type' => 0 # == lists.debian.org
+ };
+}
+
+foreach $project (@ALIOTHPRJ) {
+ $ALLPROJECTS{$project} = { 'url' => "${BASEALIOTH}/${project}",
+ 'type' => 1 # == lists.alioth.debian.org
+ };
+}
+
+# foreach $project (keys %ALLPROJECTS) {
+# print "$project: $ALLPROJECTS{$project}{'url'}, $ALLPROJECTS{$project}{'type'}\n"
+#}
+
+foreach $project (keys %ALLPROJECTS) {
# Remove database entries for this project
my $query = "DELETE FROM listarchive WHERE project = '$project'";
my($daten) = $dbh->prepare_cached($query);
@@ -76,15 +99,19 @@
mkdir($project,0777);
chdir($project);
}
- my $URL="${BASEURL}-${project}";
- my $year;
- my $month;
+ my $URL="$ALLPROJECTS{$project}{'url'}";
+ my ( $year, $month, $url, @data, @lines ) ;
+ my ($content, $msgurl, $subject, $author, $messages, $pages, $page, $line) ;
for ( $year = $YEARSTART ; $year <= $YEAREND; $year++ ) {
foreach $month (@MONTHES) {
if ( $year == $YEAREND && $month == $MONTHEND ) {
last;
}
- my $url = "${URL}/${year}/${month}/";
+ if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+ $url = "${URL}/${year}/${month}/";
+ } else {
+ $url = "${URL}/${year}-$monthdict{$month}/";
+ }
my $datafile = "${year}-${month}" ;
if ( $storefiles ) {
unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
@@ -93,7 +120,7 @@
my $spamlines = 0;
my $robotlines = 0;
while ( $url =~ /.+/ ) { # if only one page $url is set to ''
- # print "$year-$month: $url\n";
+ print "DEBUG: $year-$month: $url\n";
my $uri = URI->new($url);
my $indexpage = $ua->get($url, Host => $uri->host );
unless ( $indexpage->is_success ) { # some mailing lists startet later ...
@@ -103,12 +130,36 @@
unlink($datafile);
next;
} ;
- (my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
- my ($content, $msgurl, $subject, $author, $messages, $pages, $page) ;
+ if ( $ALLPROJECTS{$project}{'type'} == 1 ) {
+ # make sure the loop will end in case of Alioth lists. Seems these list do
+ # not feature more than one page per Month so there is no point in looping over them
+ $url = '';
+ }
+ if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+ @data = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
+ } else {
+ my @tmpdata = $indexpage->content =~ m#.*<b>Ending:</b> <i>[ \w]+ [ \d:]+ UTC [\d]+</i><br>\n(.+)<a name="end"><b>Last message date:</b></a>.*#gs;
+ @data = ();
+ foreach $content (@tmpdata) {
+ @lines = split(/(\n)/, $content);
+ foreach $line (@lines) {
+ if ( $line =~ /^\s*$/ || $line =~ /^<!--\d+ / ||
+ $line =~ /^<\/I>$/ || $line =~ /^<UL>$/ ||
+ $line =~ /^<\/A><A NAME="\d+"> <\/A>$/ ) { next ; }
+ if ( $line =~ /^<LI><A HREF="\d+.html">\[[-\w]+\]/ ) {
+ @data = (@data, $line) ;
+ } else {
+ @data = (@data, "$line\n" ) ;
+ }
+ }
+ }
+ if ( $storefiles ) {
+ print HTMLSNIP "@data\n";
+ }
+ }
foreach $content (@data) {
- my @lines = split(/(\n)/, $content);
+ @lines = split(/(\n)/, $content);
# print "------> @lines\n" ;
- my $line;
my $linestart = '';
foreach $line (@lines) {
if ( $line =~ /^\s*$/) { next ; }
More information about the debian-med-commit
mailing list