[med-svn] r2331 - trunk/community/talks/200808_debconf8

tille at alioth.debian.org tille at alioth.debian.org
Sat Jul 26 20:04:49 UTC 2008


Author: tille
Date: 2008-07-26 20:04:47 +0000 (Sat, 26 Jul 2008)
New Revision: 2331

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
several fixes


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 18:03:08 UTC (rev 2330)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 20:04:47 UTC (rev 2331)
@@ -8,7 +8,7 @@
 my $BASEURL  = "http://lists.debian.org/debian" ;
 my @PROJECTS = ('med', 'edu', 'jr') ;
 my @MONTHES  = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
-my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator');
+my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster');
 
 # Debian-Jr starts in 2000
 my $YEARSTART = 2000;
@@ -57,8 +57,16 @@
 		    my $line;
 		    my $linestart = '';
 		    foreach $line (@lines) {
+			if ( $line =~ /^\s*$/) { next ; }
 			if ( $linestart =~ /.+/ ) {
-			    $line = $linestart . $line;
+			    if ( $line =~ /^\s*<\/?ul>\s*$/ || 
+				 $line =~ /^\s*<\/?li>\s*$/ ) {
+				# Fix broken formatting: if a useless EOL is followed by a line holding only <ul>, </ul>, <li> or </li>, drop that line
+				$line = $linestart;
+			    } else {
+				# Append next line
+				$line = $linestart . $line;
+			    }
 			    print "DEBUG: Whole line is $line\n" ;
 			    $linestart = '';
 			}
@@ -66,6 +74,7 @@
 			     $line =~ /^\s*<\/?li>\s*$/ ||
 			     $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
 			     $line =~ /^\s*<li><em>Message not available<\/em>/ ||
+			     $line =~ /^<em>(continued)<\/em>\s*$/ ||
 			     $line =~ /^\s*$/) { next ; }
 			if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
 			    $_ = $subject ;
@@ -74,7 +83,7 @@
 			    $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
 			    $subject = $_ ;
 			    if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
-				print "Warning: Potential SPAM line: $line\n";
+				print "Potential SPAM line - strange subject: $project $year-$month: $subject\n";
 				$spamlines++ ;
 			    } else {
 				print HTMLSNIP "$subject ; $author\n";
@@ -98,9 +107,10 @@
 				unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
 				    print "DEBUG: Continued line $line\n" ;
 				    $linestart = $line;
+				    ##next ; ##### ??????? if this line is missing we get $linestart$linestart ...
 				} else {
 				    if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
-					print "Warning: Potential SPAM line: $line\n";
+					print "Potential SPAM line - no author: $project $year-$month\n";
 					$spamlines++ ;
 				    } else {
 					print "Warning: unknown Line: $line\n";




More information about the debian-med-commit mailing list