[med-svn] r2329 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Sat Jul 26 17:47:26 UTC 2008
Author: tille
Date: 2008-07-26 17:47:25 +0000 (Sat, 26 Jul 2008)
New Revision: 2329
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Remove some obvious spam entries
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 17:22:18 UTC (rev 2328)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-26 17:47:25 UTC (rev 2329)
@@ -8,6 +8,7 @@
my $BASEURL = "http://lists.debian.org/debian" ;
my @PROJECTS = ('med', 'edu', 'jr') ;
my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
+my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator');
# Debian-Jr starts in 2000
my $YEARSTART = 2000;
@@ -33,60 +34,67 @@
last;
}
my $url = "${URL}/${year}/${month}/";
- #print "$year-$month: $url\n";
- my $uri = URI->new($url);
- my $page = $ua->get($url, Host => $uri->host );
- unless ( $page->is_success ) { next } ; # some mailing lists startet later ...
- (my @data) = $page->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
- #print "$year-$month\n$data\n";
- my $datafile = "${year}-${month}" ;
- unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
- my ($content, $subject, $author, $messages, $pages) ;
- foreach $content (@data) {
- my @lines = split(/(\n)/, $content);
- # print "------> @lines\n" ;
- my $line;
- my $linestart = '';
- my $messagelines = 0;
- my $spamlines = 0;
- foreach $line (@lines) {
- if ( $linestart ) {
- $line = $linestart . $line;
- $linestart = '';
- }
- if ( $line =~ /^\s*<\/?ul>\s*$/ ||
- $line =~ /^\s*<\/?li>\s*$/ ||
- $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
- $line =~ /^\s*<li><em>Message not available<\/em>/ ||
- $line =~ /^\s*$/) { next ; }
- if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
- $_ = $subject ;
- $_ =~ s/^Re:\s*//i ; # Remove Re:
- $_ =~ s/^\[[^\]]+\]\s*// ; # Remove other list markers
- $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
- $subject = $_ ;
- print HTMLSNIP "$subject ; $author\n";
- $messagelines++ ;
- } else {
- if ( ($messages, $pages) = $line
- =~ m#The last update .* There are (\d+) messages. Page 1 of (\d+).<br>#gs ) {
- if ( $pages > 1 ) {
- print "Warning: More than one page ($pages) in $year/$month of $project\n";
- }
- print HTMLSNIP "$messages Messages (counted $messagelines)\n";
- if ( $messages != $messagelines + $spamlines ) {
- print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
- }
+ while ( $url =~ /.+/ ) { # if only one page $url is set to ''
+ # print "$year-$month: $url\n";
+ my $uri = URI->new($url);
+ my $indexpage = $ua->get($url, Host => $uri->host );
+ unless ( $indexpage->is_success ) { $url = ''; next; } ; # some mailing lists startet later ...
+ (my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
+ #print "$year-$month\n$data\n";
+ my $datafile = "${year}-${month}" ;
+ unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
+ my ($content, $subject, $author, $messages, $pages, $page) ;
+ foreach $content (@data) {
+ my @lines = split(/(\n)/, $content);
+ # print "------> @lines\n" ;
+ my $line;
+ my $linestart = '';
+ my $messagelines = 0;
+ my $spamlines = 0;
+ foreach $line (@lines) {
+ if ( $linestart =~ /.+/ ) {
+ $line = $linestart . $line;
+ print "DEBUG: Whole line is $line\n" ;
+ $linestart = '';
+ }
+ if ( $line =~ /^\s*<\/?ul>\s*$/ ||
+ $line =~ /^\s*<\/?li>\s*$/ ||
+ $line =~ /^\s*<li>[^<]+<\/li>\s*$/ ||
+ $line =~ /^\s*<li><em>Message not available<\/em>/ ||
+ $line =~ /^\s*$/) { next ; }
+ if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+ $_ = $subject ;
+ $_ =~ s/^Re:\s*//i ; # Remove Re:
+ $_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
+ $_ =~ s/\s*\(fwd\)\s*//i ; # Remove (fwd)
+ $subject = $_ ;
+ print HTMLSNIP "$subject ; $author\n";
+ $messagelines++ ;
} else {
- unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
- print "DEBUG: Continued line $line\n" ;
- $linestart = $line;
+ if ( ($messages, $page, $pages) = $line
+ =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
+ if ( $page != $pages ) { # handle following pages
+ print "Warning: Page %page of $pages in $year/$month of $project\n";
+ $page++;
+ $url = "$url/thrd${page}.html";
+ } else {
+ $url = '';
+ }
+ print HTMLSNIP "$messages Messages (counted $messagelines)\n";
+ if ( $messages != $messagelines + $spamlines ) {
+ print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
+ }
} else {
- if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
- print "Warning: Potential SPAM line: $line\n";
- $spamlines++ ;
+ unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+ print "DEBUG: Continued line $line\n" ;
+ $linestart = $line;
} else {
- print "Warning: unknown Line: $line\n";
+ if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+ print "Warning: Potential SPAM line: $line\n";
+ $spamlines++ ;
+ } else {
+ print "Warning: unknown Line: $line\n";
+ }
}
}
}
More information about the debian-med-commit
mailing list