[med-svn] r2410 - trunk/community/talks/200808_debconf8

Fri Aug 15 03:17:32 UTC 2008

Author: tille
Date: 2008-08-15 03:17:31 +0000 (Fri, 15 Aug 2008)
New Revision: 2410

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Parsing Alioth should work


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================

--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-08-13 01:25:27 UTC (rev 2409)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-08-15 03:17:31 UTC (rev 2410)
@@ -38,7 +38,8 @@
 my @ROBOTS   = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster',
                 'Debian-med-request', 'Debian testing watch', 'Debian Bug Tracking System',
                 'Skolelinux archive Installer', 'Debian Wiki', 'gentoo-\w+\+help',
-                'Debichem-commits');
+                'Debichem-commits', 'Weekly infolist of updatable packages for the debichem project',
+                'bts-link-upstream at lists.alioth.debian.org', 'DDPOMail robot');
 
 ## TODO: just consider mails containing these strings as SPAM
 ##       This has to be implemented in the code below
@@ -75,7 +76,7 @@
 
 foreach $project (@PROJECTS) {
     $ALLPROJECTS{$project} = { 'url'     => "${BASEURL}-${project}",
-			       'type'    => 0 # == lists.debian.org
+			       'type'    => 0, # == lists.debian.org
     };
 }
 
@@ -85,14 +86,23 @@
     };
 }
 
+my $SEPARATOR='<!-- -->';
+# different mailing list systems use different separators between message URL, subject and author
+my @SEP1 = ( '<li><strong>.*href="', '\s*' );
+my @SEP2 = ( '">',                   "\\s*<!-- -->\\s*" );
+my @SEP3 = ( '</a></strong>\s*<em>', '\s*<I>\s*');
+my @SEP4 = ( '</em>',                '\s*');
+
 # foreach $project (keys %ALLPROJECTS) {
 #    print "$project: $ALLPROJECTS{$project}{'url'}, $ALLPROJECTS{$project}{'type'}\n"
 #}
 
+my ($query, $daten);
+
 foreach $project (keys %ALLPROJECTS) {
     # Remove database entries for this project
-    my $query  = "DELETE FROM listarchive WHERE project = '$project'";
-    my($daten) = $dbh->prepare_cached($query);
+    $query = "DELETE FROM listarchive WHERE project = '$project'";
+    $daten = $dbh->prepare_cached($query);
     $daten->execute() ;
     $daten->finish() ;
 
@@ -103,12 +113,13 @@
     my $URL="$ALLPROJECTS{$project}{'url'}";
     my ( $year, $month, $url, @data, @lines ) ;
     my ($content, $msgurl, $subject, $author, $messages, $pages, $page, $line) ;
+    my $type = $ALLPROJECTS{$project}{'type'};
     for ( $year = $YEARSTART ; $year <= $YEAREND; $year++ ) {
 	foreach $month (@MONTHES) {
 	    if ( $year == $YEAREND && $month == $MONTHEND ) {
 		last;
 	    }
-	    if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+	    if ( $type == 0 ) {
 		$url = "${URL}/${year}/${month}/";
 	    } else {
 		$url = "${URL}/${year}-$monthdict{$month}/";
@@ -121,7 +132,7 @@
 	    my $spamlines    = 0;
 	    my $robotlines   = 0;
 	    while ( $url =~ /.+/ ) { # if only one page $url is set to ''
-		print "DEBUG: $year-$month: $url\n";
+		# print "DEBUG: $year-$month: $url\n";
 		my $uri = URI->new($url);
 		my $indexpage = $ua->get($url, Host => $uri->host );
 		unless ( $indexpage->is_success ) { # some mailing lists startet later ...
@@ -131,48 +142,49 @@
 		    unlink($datafile);
 		    next;
 		} ; 
-		if ( $ALLPROJECTS{$project}{'type'} == 1 ) {
+		if ( $type == 1 ) {
 		    # make sure the loop will end in case of Alioth lists.  Seems these list do
 		    # not feature more than one page per Month so there is no point in looping over them
 		    $url = '';
 		}
-		if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+		if ( $type == 0 ) {
 		    @data = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
 		} else {
 		    my @tmpdata = $indexpage->content =~ m#.*<b>Ending:</b> <i>[ \w]+ [ \d:]+ UTC [\d]+</i><br>\n(.+)<a name="end"><b>Last message date:</b></a>.*#gs;
-		    @data = ();
+		    my $tmpdata = '';
+                    my $tmpline = '';
 		    foreach $content (@tmpdata) {
 			@lines = split(/(\n)/, $content);
-			foreach (@lines) {
+			foreach $line (@lines) {
+			    $_ = $line;
 			    s/\s+/ /g;
 			    if ( $_ =~ /^\s*$/   || $_ =~ /^<!--\d+ /      ||
 				 $_ =~ /^<\/I>$/ || $_ =~ /^\s*<\/?p>\s*$/ ||
                                  $_ =~ /^\s*<\/?UL>\s*$/i ||
 				 $_ =~ /^<\/A><A NAME="\d+">&nbsp;<\/A>$/ ) { next ; }
-			    if ( ($subject) = $_ =~ /^\s*<LI><A HREF="\d+.html">\[[-\w]+\]\s*(.+)$/ ) {
+			    if ( ($msgurl, $subject) = $_ =~ /^\s*<LI><A HREF="(\d+.html)">\[[-\w]+\]\s*(.+)$/ ) {
 				$_ = $subject ;
 				$_ =~ s/^\s*Re:\s*//i ;       # Remove Re:
-				@data = (@data, $subject) ;
+				$_ =~ s/^\s*//i ;             # Remove blanks
+				$tmpline = $msgurl . $SEPARATOR . $subject ;
 			    } else {
 				if ( $_ =~ /<I>/ || $_ =~ /<b>Messages:<\/b>/ ) {
-				    @data = (@data, "$_\n" ) ;
+				    $tmpline = "$_\n"  ;
 				} else {
-				    @data = (@data, "$_" ) ;
+				    $tmpline = "$_"  ;
 				}
 			    }
+			    $tmpdata = $tmpdata . $tmpline;
 			}
 		    }
-		    if ( $storefiles ) {
-			print HTMLSNIP "@data\n";
-		    }
+		    @data = ($tmpdata);
 		}
 		foreach $content (@data) {
 		    @lines = split(/(\n)/, $content);
-		    # print "------> @lines\n" ;
 		    my $linestart = '';
 		    foreach $line (@lines) {
 			if ( $line =~ /^\s*$/) { next ; }
-			if ( $linestart =~ /.+/ ) {
+			if ( $linestart =~ /.+/ && $type == 0 ) {
 			    if ( $line =~ /^\s*<\/?ul>\s*$/ || 
 				 $line =~ /^\s*<\/?li>\s*$/ ) {
 				# fix broken formatting if there is a useless EOL and next line is <ul> or </li>
@@ -189,8 +201,16 @@
 			     $line =~ /^\s*<li><em>Message not available<\/em>/ ||
 			     $line =~ /<em>\(continued\)<\/em>\s*$/ ||
 			     $line =~ /^\s*$/) { next ; }
+# @SEP1 = ( '<li><strong>.*href="', '\s*' );
+# @SEP2 = ( '">',                   "\s*$SEPARATOR\s*" );
+# @SEP3 = ( '</a></strong>\s*<em>', '\s*<I>\s*');
+# @SEP4 = ( '</em>',                '\s*');
+			# print "DEBUG: $line\n";
+			if ( $storefiles ) {
+			    print HTMLSNIP "$line\n";
+			}
 			if ( ($msgurl, $subject, $author) = 
-                              $line =~ m#<li><strong>.*href="(msg\d+\.html)">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+                              $line =~ m#$SEP1[$type]([msg]*\d+\.html)$SEP2[$type](.+)$SEP3[$type](.+)$SEP4[$type]#gs ) {
 			    $_ = $subject ;
 			    $_ =~ s/^Re:\s*//i ;       # Remove Re:
 			    $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
@@ -229,31 +249,39 @@
 				}
 			    }
 			} else {
-			    if ( ($messages, $page, $pages) = $line 
-				 =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
-				if ( $page != $pages ) { # handle following pages
-				    print "Warning: Page $page of $pages in $year/$month of $project\n";
-				    $page++;
-				    $url = "${URL}/${year}/${month}/thrd${page}.html";
+			    if ( $type == 0 ) {
+				if ( ($messages, $page, $pages) = $line 
+				     =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
+				    if ( $page != $pages ) { # handle following pages
+					print "Warning: Page $page of $pages in $year/$month of $project\n";
+					$page++;
+					$url = "${URL}/${year}/${month}/thrd${page}.html";
+				    } else {
+					$url = '';
+				    }
+				    if ( $storefiles ) {
+					print HTMLSNIP "$messages Messages ($messagelines real messages, $spamlines SPAM, $robotlines messages by robots)\n";
+				    }
+				    if ( $messages != $messagelines + $spamlines + $robotlines ) {
+					print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n";
+				    }
 				} else {
-				    $url = '';
+				    unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+					$linestart = $line;
+					##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
+				    } else {
+					if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+					    print "Potential SPAM line - no author: $project $year-$month\n";
+					    $spamlines++ ;
+					} else {
+					    print "Warning: unknown Line: $line\n";
+					}
+				    }
 				}
-				if ( $storefiles ) {
-				    print HTMLSNIP "$messages Messages ($messagelines real messages, $spamlines SPAM, $robotlines messages by robots)\n";
-				}
-				if ( $messages != $messagelines + $spamlines + $robotlines ) {
-				    print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n";
-				}
 			    } else {
-				unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
-				    $linestart = $line;
-				    ##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
-				} else {
-				    if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
-					print "Potential SPAM line - no author: $project $year-$month\n";
-					$spamlines++ ;
-				    } else {
-					print "Warning: unknown Line: $line\n";
+				if ( ($messages) = $line =~ m#^\s*<b>Messages:</b>\s*(\d+)<p>#gs ) {
+				    if ( $storefiles ) {
+					print HTMLSNIP "$messages Messages ($messagelines real messages, $spamlines SPAM, $robotlines messages by robots)\n";
 				    }
 				}
 			    }
@@ -271,9 +299,16 @@
 
 # Database has shown that Ralf Gsellenstetter is posting with several names
 # in Debian Edu.  This script cleans up this
-system("./0fix_ralf_edu");
+# system("./0fix_ralf_edu");
+$query = "UPDATE listarchive SET author = 'Ralf Gesellensetter' WHERE project = 'edu' AND author LIKE 'Ralf%setter';" ;
+$query = "UPDATE listarchive SET author = 'Vagrant Cascadian'   WHERE project = 'edu' AND author LIKE '%vagrant%';" ;
 
+$daten = $dbh->prepare_cached($query);
+$daten->execute() ;
+$daten->finish() ;
+
+
 # Just do the graphing of all lists we got
-foreach $project (@PROJECTS) {
+foreach $project (keys %ALLPROJECTS) {
     system("./author_stats $project") ;
 }