[med-svn] r2327 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Sat Jul 26 16:23:03 UTC 2008
Author: tille
Date: 2008-07-26 16:23:02 +0000 (Sat, 26 Jul 2008)
New Revision: 2327
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Better parsing features.
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 15:30:38 UTC (rev 2326)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 16:23:02 UTC (rev 2327)
@@ -41,15 +41,23 @@
#print "$year-$month\n$data\n";
my $datafile = "${year}-${month}" ;
unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
- my ($content, $subject, $author) ;
+ my ($content, $subject, $author, $messages, $pages) ;
foreach $content (@data) {
my @lines = split(/(\n)/, $content);
# print "------> @lines\n" ;
my $line;
+ my $linestart = '';
+ my $messagelines = 0;
+ my $spamlines = 0;
foreach $line (@lines) {
+ if ( $linestart ) {
+ $line = $linestart . $line;
+ $linestart = '';
+ }
if ( $line =~ /^\s*<\/?ul>\s*$/ ||
$line =~ /^\s*<\/?li>\s*$/ ||
$line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
+ $line =~ /^\s*<li><em>Message not available<\/em>/ ||
$line =~ /^\s*$/) { next ; }
if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
$_ = $subject ;
@@ -58,8 +66,30 @@
$_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
$subject = $_ ;
print HTMLSNIP "$subject ; $author\n";
+ $messagelines++ ;
} else {
- print HTMLSNIP "$line\n";
+ if ( ($messages, $pages) = $line
+ =~ m#The last update .* There are (\d+) messages. Page 1 of (\d+).<br>#gs ) {
+ if ( $pages > 1 ) {
+ print "Warning: More than one page ($pages) in $year/$month of $project\n";
+ }
+ print HTMLSNIP "$messages Messages (counted $messagelines)\n";
+ if ( $messages != $messagelines + $spamlines ) {
+ print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
+ }
+ } else {
+ unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+ print "DEBUG: Continued line $line\n" ;
+ $linestart = $line;
+ } else {
+ if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+ print "Warning: Potential SPAM line: $line\n";
+ $spamlines++ ;
+ } else {
+ print "Warning: unknown Line: $line\n";
+ }
+ }
+ }
}
}
}
More information about the debian-med-commit
mailing list