[med-svn] r2330 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Sat Jul 26 18:03:09 UTC 2008
Author: tille
Date: 2008-07-26 18:03:08 +0000 (Sat, 26 Jul 2008)
New Revision: 2330
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
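Fixed some bugs, better SPAM detection:
the data file is now opened and the message/spam counters are declared once before the page loop instead of inside it; the empty data file is removed when the index page cannot be fetched; subjects consisting only of the characters of numeric character references (&#x...;), whitespace and '?' are reported as potential SPAM instead of being written to the data file; the page warning now prints $page instead of the literal %page.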
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 17:47:25 UTC (rev 2329)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 18:03:08 UTC (rev 2330)
@@ -34,23 +34,28 @@
last;
}
my $url = "${URL}/${year}/${month}/";
+ my $datafile = "${year}-${month}" ;
+ unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
+ my $messagelines = 0;
+ my $spamlines = 0;
while ( $url =~ /.+/ ) { # if only one page $url is set to ''
# print "$year-$month: $url\n";
my $uri = URI->new($url);
my $indexpage = $ua->get($url, Host => $uri->host );
- unless ( $indexpage->is_success ) { $url = ''; next; } ; # some mailing lists startet later ...
+ unless ( $indexpage->is_success ) { # some mailing lists startet later ...
+ $url = '';
+ close HTMLSNIP ;
+ # remove empty file
+ unlink($datafile);
+ next;
+ } ;
(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
- #print "$year-$month\n$data\n";
- my $datafile = "${year}-${month}" ;
- unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
my ($content, $subject, $author, $messages, $pages, $page) ;
foreach $content (@data) {
my @lines = split(/(\n)/, $content);
# print "------> @lines\n" ;
my $line;
my $linestart = '';
- my $messagelines = 0;
- my $spamlines = 0;
foreach $line (@lines) {
if ( $linestart =~ /.+/ ) {
$line = $linestart . $line;
@@ -68,13 +73,18 @@
$_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
$_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
$subject = $_ ;
- print HTMLSNIP "$subject ; $author\n";
- $messagelines++ ;
+ if ( $subject =~ /^[&#x\d;\sA-F\?]+$/ ) {
+ print "Warning: Potential SPAM line: $line\n";
+ $spamlines++ ;
+ } else {
+ print HTMLSNIP "$subject ; $author\n";
+ $messagelines++ ;
+ }
} else {
if ( ($messages, $page, $pages) = $line
=~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
if ( $page != $pages ) { # handle following pages
- print "Warning: Page %page of $pages in $year/$month of $project\n";
+ print "Warning: Page $page of $pages in $year/$month of $project\n";
$page++;
$url = "$url/thrd${page}.html";
} else {
More information about the debian-med-commit
mailing list