[med-svn] r2329 - trunk/community/talks/200808_debconf8

tille at alioth.debian.org tille at alioth.debian.org
Sat Jul 26 17:47:26 UTC 2008


Author: tille
Date: 2008-07-26 17:47:25 +0000 (Sat, 26 Jul 2008)
New Revision: 2329

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Remove some obvious spam entries


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 17:22:18 UTC (rev 2328)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 17:47:25 UTC (rev 2329)
@@ -8,6 +8,7 @@
 my $BASEURL  = "http://lists.debian.org/debian" ;
 my @PROJECTS = ('med', 'edu', 'jr') ;
 my @MONTHES  = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
+my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator');
 
 # Debian-Jr starts in 2000
 my $YEARSTART = 2000;
@@ -33,60 +34,67 @@
 		last;
 	    }
 	    my $url = "${URL}/${year}/${month}/";
-	    #print "$year-$month: $url\n";
-	    my $uri = URI->new($url);
-	    my $page = $ua->get($url, Host => $uri->host );
-	    unless ( $page->is_success ) { next } ; # some mailing lists startet later ...
-	    (my @data) = $page->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-	    #print "$year-$month\n$data\n";
-	    my $datafile = "${year}-${month}" ;
-	    unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
-	    my ($content, $subject, $author, $messages, $pages) ;
-	    foreach $content (@data) {
-		my @lines = split(/(\n)/, $content);
-		# print "------> @lines\n" ;
-		my $line;
-		my $linestart = '';
-		my $messagelines = 0;
-		my $spamlines = 0;
-		foreach $line (@lines) {
-		    if ( $linestart ) {
-			$line = $linestart . $line;
-			$linestart = '';
-		    }
-		    if ( $line =~ /^\s*<\/?ul>\s*$/ || 
-                         $line =~ /^\s*<\/?li>\s*$/ ||
-                         $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
-			 $line =~ /^\s*<li><em>Message not available<\/em>/ ||
-                         $line =~ /^\s*$/) { next ; }
-		    if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
-			$_ = $subject ;
-			$_ =~ s/^Re:\s*//i ;       # Remove Re:
-			$_ =~ s/^\[[^\]]+\]\s*// ; # Remove other list markers
-			$_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
-			$subject = $_ ;
-			print HTMLSNIP "$subject ; $author\n";
-			$messagelines++ ;
-		    } else {
-			if ( ($messages, $pages) = $line 
-                               =~ m#The last update .* There are (\d+) messages. Page 1 of (\d+).<br>#gs ) {
-			    if ( $pages > 1 ) {
-				print "Warning: More than one page ($pages) in $year/$month of $project\n";
-			    }
-			    print HTMLSNIP "$messages Messages (counted $messagelines)\n";
-			    if ( $messages != $messagelines + $spamlines ) {
-				print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
-			    }
+	    while ( $url =~ /.+/ ) { # if only one page $url is set to ''
+		# print "$year-$month: $url\n";
+		my $uri = URI->new($url);
+		my $indexpage = $ua->get($url, Host => $uri->host );
+		unless ( $indexpage->is_success ) { $url = ''; next; } ; # some mailing lists startet later ...
+		(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
+		#print "$year-$month\n$data\n";
+		my $datafile = "${year}-${month}" ;
+		unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
+		my ($content, $subject, $author, $messages, $pages, $page) ;
+		foreach $content (@data) {
+		    my @lines = split(/(\n)/, $content);
+		    # print "------> @lines\n" ;
+		    my $line;
+		    my $linestart = '';
+		    my $messagelines = 0;
+		    my $spamlines = 0;
+		    foreach $line (@lines) {
+			if ( $linestart =~ /.+/ ) {
+			    $line = $linestart . $line;
+			    print "DEBUG: Whole line is $line\n" ;
+			    $linestart = '';
+			}
+			if ( $line =~ /^\s*<\/?ul>\s*$/ || 
+			     $line =~ /^\s*<\/?li>\s*$/ ||
+			     $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
+			     $line =~ /^\s*<li><em>Message not available<\/em>/ ||
+			     $line =~ /^\s*$/) { next ; }
+			if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+			    $_ = $subject ;
+			    $_ =~ s/^Re:\s*//i ;       # Remove Re:
+			    $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
+			    $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
+			    $subject = $_ ;
+			    print HTMLSNIP "$subject ; $author\n";
+			    $messagelines++ ;
 			} else {
-			    unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
-				print "DEBUG: Continued line $line\n" ;
-				$linestart = $line;
+			    if ( ($messages, $page, $pages) = $line 
+				 =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
+				if ( $page != $pages ) { # handle following pages
+				    print "Warning: Page %page of $pages in $year/$month of $project\n";
+				    $page++;
+				    $url = "$url/thrd${page}.html";
+				} else {
+				    $url = '';
+				}
+				print HTMLSNIP "$messages Messages (counted $messagelines)\n";
+				if ( $messages != $messagelines + $spamlines ) {
+				    print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
+				}
 			    } else {
-				if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
-				    print "Warning: Potential SPAM line: $line\n";
-				    $spamlines++ ;
+				unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+				    print "DEBUG: Continued line $line\n" ;
+				    $linestart = $line;
 				} else {
-				    print "Warning: unknown Line: $line\n";
+				    if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+					print "Warning: Potential SPAM line: $line\n";
+					$spamlines++ ;
+				    } else {
+					print "Warning: unknown Line: $line\n";
+				    }
 				}
 			    }
 			}




More information about the debian-med-commit mailing list