[med-svn] r2331 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Sat Jul 26 20:04:49 UTC 2008
Author: tille
Date: 2008-07-26 20:04:47 +0000 (Sat, 26 Jul 2008)
New Revision: 2331
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
several fixes
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 18:03:08 UTC (rev 2330)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 20:04:47 UTC (rev 2331)
@@ -8,7 +8,7 @@
my $BASEURL = "http://lists.debian.org/debian" ;
my @PROJECTS = ('med', 'edu', 'jr') ;
my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
-my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator');
+my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster');
# Debian-Jr starts in 2000
my $YEARSTART = 2000;
@@ -57,8 +57,16 @@
my $line;
my $linestart = '';
foreach $line (@lines) {
+ if ( $line =~ /^\s*$/) { next ; }
if ( $linestart =~ /.+/ ) {
- $line = $linestart . $line;
+ if ( $line =~ /^\s*<\/?ul>\s*$/ ||
+ $line =~ /^\s*<\/?li>\s*$/ ) {
+ # fix broken formatting: if there is a useless EOL and the next line is only <ul>, </ul>, <li> or </li>, keep the pending line as it is
+ $line = $linestart;
+ } else {
+ # Append next line
+ $line = $linestart . $line;
+ }
print "DEBUG: Whole line is $line\n" ;
$linestart = '';
}
@@ -66,6 +74,7 @@
$line =~ /^\s*<\/?li>\s*$/ ||
$line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
$line =~ /^\s*<li><em>Message not available<\/em>/ ||
+ $line =~ /^<em>(continued)<\/em>\s*$/ ||
$line =~ /^\s*$/) { next ; }
if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
$_ = $subject ;
@@ -74,7 +83,7 @@
$_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
$subject = $_ ;
if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
- print "Warning: Potential SPAM line: $line\n";
+ print "Potential SPAM line - strange subject: $project $year-$month: $subject\n";
$spamlines++ ;
} else {
print HTMLSNIP "$subject ; $author\n";
@@ -98,9 +107,10 @@
unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
print "DEBUG: Continued line $line\n" ;
$linestart = $line;
+ ##next ; ##### ??????? if this line is missing we get $linestart$linestart ...
} else {
if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
- print "Warning: Potential SPAM line: $line\n";
+ print "Potential SPAM line - no author: $project $year-$month\n";
$spamlines++ ;
} else {
print "Warning: unknown Line: $line\n";
More information about the debian-med-commit mailing list