[med-svn] r2330 - trunk/community/talks/200808_debconf8

tille at alioth.debian.org tille at alioth.debian.org
Sat Jul 26 18:03:09 UTC 2008


Author: tille
Date: 2008-07-26 18:03:08 +0000 (Sat, 26 Jul 2008)
New Revision: 2330

Modified:
   trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Fixed some bugs, better SPAM detection


Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 17:47:25 UTC (rev 2329)
+++ trunk/community/talks/200808_debconf8/get-archive-pages	2008-07-26 18:03:08 UTC (rev 2330)
@@ -34,23 +34,28 @@
 		last;
 	    }
 	    my $url = "${URL}/${year}/${month}/";
+	    my $datafile = "${year}-${month}" ;
+	    unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
+	    my $messagelines = 0;
+	    my $spamlines = 0;
 	    while ( $url =~ /.+/ ) { # if only one page $url is set to ''
 		# print "$year-$month: $url\n";
 		my $uri = URI->new($url);
 		my $indexpage = $ua->get($url, Host => $uri->host );
-		unless ( $indexpage->is_success ) { $url = ''; next; } ; # some mailing lists startet later ...
+		unless ( $indexpage->is_success ) { # some mailing lists started later ...
+		    $url = '';
+		    close HTMLSNIP ;
+		    # remove empty file
+		    unlink($datafile);
+		    next;
+		} ; 
 		(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
-		#print "$year-$month\n$data\n";
-		my $datafile = "${year}-${month}" ;
-		unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
 		my ($content, $subject, $author, $messages, $pages, $page) ;
 		foreach $content (@data) {
 		    my @lines = split(/(\n)/, $content);
 		    # print "------> @lines\n" ;
 		    my $line;
 		    my $linestart = '';
-		    my $messagelines = 0;
-		    my $spamlines = 0;
 		    foreach $line (@lines) {
 			if ( $linestart =~ /.+/ ) {
 			    $line = $linestart . $line;
@@ -68,13 +73,18 @@
 			    $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
 			    $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
 			    $subject = $_ ;
-			    print HTMLSNIP "$subject ; $author\n";
-			    $messagelines++ ;
+			    if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
+				print "Warning: Potential SPAM line: $line\n";
+				$spamlines++ ;
+			    } else {
+				print HTMLSNIP "$subject ; $author\n";
+				$messagelines++ ;
+			    }
 			} else {
 			    if ( ($messages, $page, $pages) = $line 
 				 =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
 				if ( $page != $pages ) { # handle following pages
-				    print "Warning: Page %page of $pages in $year/$month of $project\n";
+				    print "Warning: Page $page of $pages in $year/$month of $project\n";
 				    $page++;
 				    $url = "$url/thrd${page}.html";
 				} else {




More information about the debian-med-commit mailing list