[med-svn] r2327 - trunk/community/talks/200808_debconf8

tille at alioth.debian.org tille at alioth.debian.org
Sat Jul 26 16:23:03 UTC 2008


Author: tille
Date: 2008-07-26 16:23:02 +0000 (Sat, 26 Jul 2008)
New Revision: 2327

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Better parsing features.


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 15:30:38 UTC (rev 2326)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 16:23:02 UTC (rev 2327)
@@ -41,15 +41,23 @@
 	    #print "$year-$month\n$data\n";
 	    my $datafile = "${year}-${month}" ;
 	    unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
-	    my ($content, $subject, $author) ;
+	    my ($content, $subject, $author, $messages, $pages) ;
 	    foreach $content (@data) {
 		my @lines = split(/(\n)/, $content);
 		# print "------> @lines\n" ;
 		my $line;
+		my $linestart = '';
+		my $messagelines = 0;
+		my $spamlines = 0;
 		foreach $line (@lines) {
+		    if ( $linestart ) {
+			$line = $linestart . $line;
+			$linestart = '';
+		    }
 		    if ( $line =~ /^\s*<\/?ul>\s*$/ || 
                          $line =~ /^\s*<\/?li>\s*$/ ||
                          $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
+			 $line =~ /^\s*<li><em>Message not available<\/em>/ ||
                          $line =~ /^\s*$/) { next ; }
 		    if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
 			$_ = $subject ;
@@ -58,8 +66,30 @@
 			$_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
 			$subject = $_ ;
 			print HTMLSNIP "$subject ; $author\n";
+			$messagelines++ ;
 		    } else {
-			print HTMLSNIP "$line\n";
+			if ( ($messages, $pages) = $line 
+                               =~ m#The last update .* There are (\d+) messages. Page 1 of (\d+).<br>#gs ) {
+			    if ( $pages > 1 ) {
+				print "Warning: More than one page ($pages) in $year/$month of $project\n";
+			    }
+			    print HTMLSNIP "$messages Messages (counted $messagelines)\n";
+			    if ( $messages != $messagelines + $spamlines ) {
+				print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
+			    }
+			} else {
+			    unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+				print "DEBUG: Continued line $line\n" ;
+				$linestart = $line;
+			    } else {
+				if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+				    print "Warning: Potential SPAM line: $line\n";
+				    $spamlines++ ;
+				} else {
+				    print "Warning: unknown Line: $line\n";
+				}
+			    }
+			}
 		    }
 		}
 	    }




More information about the debian-med-commit mailing list