[med-svn] r2341 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Sun Jul 27 17:33:45 UTC 2008
Author: tille
Date: 2008-07-27 17:33:45 +0000 (Sun, 27 Jul 2008)
New Revision: 2341
Added:
trunk/community/talks/200808_debconf8/archives.sql
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Store messages in a database to be flexible when trying to obtain stats.
Added: trunk/community/talks/200808_debconf8/archives.sql
===================================================================
--- trunk/community/talks/200808_debconf8/archives.sql (rev 0)
+++ trunk/community/talks/200808_debconf8/archives.sql 2008-07-27 17:33:45 UTC (rev 2341)
@@ -0,0 +1,19 @@
+#!/bin/sh
+
+createdb cddlistarchives
+
+psql cddlistarchives <<EOT
+
+BEGIN;
+
+CREATE TABLE listarchive (
+ project text,
+ yearmonth date,
+ author text,
+ subject text,
+ url text,
+ ts date
+);
+
+COMMIT;
+EOT
Property changes on: trunk/community/talks/200808_debconf8/archives.sql
___________________________________________________________________
Name: svn:executable
+ *
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-27 17:14:24 UTC (rev 2340)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-07-27 17:33:45 UTC (rev 2341)
@@ -4,11 +4,15 @@
use LWP::UserAgent;
use URI;
use Cwd;
+use DBI;
my $BASEURL = "http://lists.debian.org/debian" ;
my @PROJECTS = ('med', 'edu', 'jr') ;
my @MONTHES = ('01', '02', '03', '04', '05', '06', '07', '08', '09', '10', '11', '12');
-my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster');
+my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster',
+ 'Debian-med-request', 'Debian testing watch', 'Debian Bug Tracking System',
+ 'Skolelinux archive Installer');
+my @SPAMAUTHORS = ('Pls check this new site');
# Debian-Jr starts in 2000
my $YEARSTART = 2000;
@@ -16,13 +20,28 @@
my ($sec,$min,$hour,$day,$MONTHEND,$YEAREND,$wday,$yday,$isdst) = localtime(time);
$MONTHEND++;
$YEAREND +=1900;
+$day++;
+my $today = "$YEAREND-$MONTHEND-$day";
+my $dbname = 'cddlistarchives';
+my $dbh = DBI->connect("dbi:Pg:dbname=$dbname");
+
my $ua = LWP::UserAgent->new( agent => 'varbot');
$ua->env_proxy;
my $cdw = getcwd;
my $project;
+my $insert = "INSERT INTO listarchive (project, yearmonth, author, subject, url, ts) VALUES (?, ?, ?, ?, ?, '$today')";
+my $datain = $dbh->prepare_cached($insert);
+my ( $robot, $robotflag );
+
foreach $project (@PROJECTS) {
+ # Remove database entries for this project
+ my $query = "DELETE FROM listarchive WHERE project = '$project'";
+ my($daten) = $dbh->prepare_cached($query);
+ $daten->execute() ;
+ $daten->finish() ;
+
mkdir($project,0777);
chdir($project);
my $URL="${BASEURL}-${project}";
@@ -37,7 +56,8 @@
my $datafile = "${year}-${month}" ;
unless ( open(HTMLSNIP, ">$datafile") ) { die("Unable to open $datafile"); }
my $messagelines = 0;
- my $spamlines = 0;
+ my $spamlines = 0;
+ my $robotlines = 0;
while ( $url =~ /.+/ ) { # if only one page $url is set to ''
# print "$year-$month: $url\n";
my $uri = URI->new($url);
@@ -50,7 +70,7 @@
next;
} ;
(my @data) = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
- my ($content, $subject, $author, $messages, $pages, $page) ;
+ my ($content, $msgurl, $subject, $author, $messages, $pages, $page) ;
foreach $content (@data) {
my @lines = split(/(\n)/, $content);
# print "------> @lines\n" ;
@@ -67,7 +87,6 @@
# Append next line
$line = $linestart . $line;
}
- print "DEBUG: Whole line is $line\n" ;
$linestart = '';
}
if ( $line =~ /^\s*<\/?ul>\s*$/ ||
@@ -76,7 +95,8 @@
$line =~ /^\s*<li><em>Message not available<\/em>/ ||
$line =~ /<em>\(continued\)<\/em>\s*$/ ||
$line =~ /^\s*$/) { next ; }
- if ( ($subject, $author) = $line =~ m#<li><strong>.*html">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+ if ( ($msgurl, $subject, $author) =
+ $line =~ m#<li><strong>.*href="(msg\d+\.html)">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
$_ = $subject ;
$_ =~ s/^Re:\s*//i ; # Remove Re:
$_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
@@ -86,8 +106,31 @@
print "Potential SPAM line - strange subject: $project $year-$month: $subject\n";
$spamlines++ ;
} else {
- print HTMLSNIP "$subject ; $author\n";
- $messagelines++ ;
+ if ( $author =~ /^[&#x\d;\sA-F\?]+$/ ||
+ $author =~ /info/i ) { # never had a non-spam message from an author whos name contains info
+ print "Potential SPAM line - strange author: $project $year-$month: $author\n";
+ $spamlines++ ;
+ } else {
+ if ( $author =~ /^Tille, Andreas$/ ) { $author = 'Andreas Tille'; }
+ if ( $author =~ /Steffen Möller/ ) { $author = 'Steffen Moeller'; }
+ $_ = $author;
+ $_ = s/ö/ö/g ;
+ $_ = s/ü/ü/g ;
+ $robotflag = 0;
+ foreach $robot (@ROBOTS) {
+ if ( $author =~ /$robot/ ) { # we are not interested in automatic mails
+ $robotlines++ ;
+ $robotflag = 1 ;
+ last;
+ }
+ }
+ if ( $robotflag == 0 ) {
+ print HTMLSNIP "$subject ; $author\n";
+ $datain->execute($project, "$year-$month-01", $author, $subject,
+ "${URL}/${year}/${month}/$msgurl") ;
+ $messagelines++ ;
+ }
+ }
}
} else {
if ( ($messages, $page, $pages) = $line
@@ -100,12 +143,11 @@
$url = '';
}
print HTMLSNIP "$messages Messages (counted $messagelines)\n";
- if ( $messages != $messagelines + $spamlines ) {
- print "Warning: $project $year/$month counted $messagelines and $spamlines but page says $messages\n";
+ if ( $messages != $messagelines + $spamlines + $robotlines ) {
+ print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n";
}
} else {
unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
- print "DEBUG: Continued line $line\n" ;
$linestart = $line;
##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
} else {
@@ -127,3 +169,4 @@
chdir($cdw);
}
+$datain->finish;
More information about the debian-med-commit mailing list