[med-svn] r2410 - trunk/community/talks/200808_debconf8
tille at alioth.debian.org
tille at alioth.debian.org
Fri Aug 15 03:17:32 UTC 2008
Author: tille
Date: 2008-08-15 03:17:31 +0000 (Fri, 15 Aug 2008)
New Revision: 2410
Modified:
trunk/community/talks/200808_debconf8/get-archive-pages
Log:
Parsing Alioth should work
Modified: trunk/community/talks/200808_debconf8/get-archive-pages
===================================================================
--- trunk/community/talks/200808_debconf8/get-archive-pages 2008-08-13 01:25:27 UTC (rev 2409)
+++ trunk/community/talks/200808_debconf8/get-archive-pages 2008-08-15 03:17:31 UTC (rev 2410)
@@ -38,7 +38,8 @@
my @ROBOTS = ('Debian Installer', 'bugzilla-skolelinux', 'Archive Administrator', 'hostmaster',
'Debian-med-request', 'Debian testing watch', 'Debian Bug Tracking System',
'Skolelinux archive Installer', 'Debian Wiki', 'gentoo-\w+\+help',
- 'Debichem-commits');
+ 'Debichem-commits', 'Weekly infolist of updatable packages for the debichem project',
+ 'bts-link-upstream at lists.alioth.debian.org', 'DDPOMail robot');
## TODO: just consider mails containing these strings as SPAM
## This has to be implemented in the code below
@@ -75,7 +76,7 @@
foreach $project (@PROJECTS) {
$ALLPROJECTS{$project} = { 'url' => "${BASEURL}-${project}",
- 'type' => 0 # == lists.debian.org
+ 'type' => 0, # == lists.debian.org
};
}
@@ -85,14 +86,23 @@
};
}
+my $SEPARATOR='<!-- -->';
+# different mailing list systems use different separators between message URL, subject and author
+my @SEP1 = ( '<li><strong>.*href="', '\s*' );
+my @SEP2 = ( '">', "\\s*<!-- -->\\s*" );
+my @SEP3 = ( '</a></strong>\s*<em>', '\s*<I>\s*');
+my @SEP4 = ( '</em>', '\s*');
+
# foreach $project (keys %ALLPROJECTS) {
# print "$project: $ALLPROJECTS{$project}{'url'}, $ALLPROJECTS{$project}{'type'}\n"
#}
+my ($query, $daten);
+
foreach $project (keys %ALLPROJECTS) {
# Remove database entries for this project
- my $query = "DELETE FROM listarchive WHERE project = '$project'";
- my($daten) = $dbh->prepare_cached($query);
+ $query = "DELETE FROM listarchive WHERE project = '$project'";
+ $daten = $dbh->prepare_cached($query);
$daten->execute() ;
$daten->finish() ;
@@ -103,12 +113,13 @@
my $URL="$ALLPROJECTS{$project}{'url'}";
my ( $year, $month, $url, @data, @lines ) ;
my ($content, $msgurl, $subject, $author, $messages, $pages, $page, $line) ;
+ my $type = $ALLPROJECTS{$project}{'type'};
for ( $year = $YEARSTART ; $year <= $YEAREND; $year++ ) {
foreach $month (@MONTHES) {
if ( $year == $YEAREND && $month == $MONTHEND ) {
last;
}
- if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+ if ( $type == 0 ) {
$url = "${URL}/${year}/${month}/";
} else {
$url = "${URL}/${year}-$monthdict{$month}/";
@@ -121,7 +132,7 @@
my $spamlines = 0;
my $robotlines = 0;
while ( $url =~ /.+/ ) { # if only one page $url is set to ''
- print "DEBUG: $year-$month: $url\n";
+ # print "DEBUG: $year-$month: $url\n";
my $uri = URI->new($url);
my $indexpage = $ua->get($url, Host => $uri->host );
unless ( $indexpage->is_success ) { # some mailing lists startet later ...
@@ -131,48 +142,49 @@
unlink($datafile);
next;
} ;
- if ( $ALLPROJECTS{$project}{'type'} == 1 ) {
+ if ( $type == 1 ) {
# make sure the loop will end in case of Alioth lists. Seems these list do
# not feature more than one page per Month so there is no point in looping over them
$url = '';
}
- if ( $ALLPROJECTS{$project}{'type'} == 0 ) {
+ if ( $type == 0 ) {
@data = $indexpage->content =~ m#.*<!--TNAVEND-->\n(.+)<hr>.*<!--BNAVSTART-->.*#gs;
} else {
my @tmpdata = $indexpage->content =~ m#.*<b>Ending:</b> <i>[ \w]+ [ \d:]+ UTC [\d]+</i><br>\n(.+)<a name="end"><b>Last message date:</b></a>.*#gs;
- @data = ();
+ my $tmpdata = '';
+ my $tmpline = '';
foreach $content (@tmpdata) {
@lines = split(/(\n)/, $content);
- foreach (@lines) {
+ foreach $line (@lines) {
+ $_ = $line;
s/\s+/ /g;
if ( $_ =~ /^\s*$/ || $_ =~ /^<!--\d+ / ||
$_ =~ /^<\/I>$/ || $_ =~ /^\s*<\/?p>\s*$/ ||
$_ =~ /^\s*<\/?UL>\s*$/i ||
$_ =~ /^<\/A><A NAME="\d+"> <\/A>$/ ) { next ; }
- if ( ($subject) = $_ =~ /^\s*<LI><A HREF="\d+.html">\[[-\w]+\]\s*(.+)$/ ) {
+ if ( ($msgurl, $subject) = $_ =~ /^\s*<LI><A HREF="(\d+.html)">\[[-\w]+\]\s*(.+)$/ ) {
$_ = $subject ;
$_ =~ s/^\s*Re:\s*//i ; # Remove Re:
- @data = (@data, $subject) ;
+ $_ =~ s/^\s*//i ; # Remove blanks
+ $tmpline = $msgurl . $SEPARATOR . $subject ;
} else {
if ( $_ =~ /<I>/ || $_ =~ /<b>Messages:<\/b>/ ) {
- @data = (@data, "$_\n" ) ;
+ $tmpline = "$_\n" ;
} else {
- @data = (@data, "$_" ) ;
+ $tmpline = "$_" ;
}
}
+ $tmpdata = $tmpdata . $tmpline;
}
}
- if ( $storefiles ) {
- print HTMLSNIP "@data\n";
- }
+ @data = ($tmpdata);
}
foreach $content (@data) {
@lines = split(/(\n)/, $content);
- # print "------> @lines\n" ;
my $linestart = '';
foreach $line (@lines) {
if ( $line =~ /^\s*$/) { next ; }
- if ( $linestart =~ /.+/ ) {
+ if ( $linestart =~ /.+/ && $type == 0 ) {
if ( $line =~ /^\s*<\/?ul>\s*$/ ||
$line =~ /^\s*<\/?li>\s*$/ ) {
# fix broken formatting if there is a useless EOL and next line is <ul> or </li>
@@ -189,8 +201,16 @@
$line =~ /^\s*<li><em>Message not available<\/em>/ ||
$line =~ /<em>\(continued\)<\/em>\s*$/ ||
$line =~ /^\s*$/) { next ; }
+# @SEP1 = ( '<li><strong>.*href="', '\s*' );
+# @SEP2 = ( '">', "\s*$SEPARATOR\s*" );
+# @SEP3 = ( '</a></strong>\s*<em>', '\s*<I>\s*');
+# @SEP4 = ( '</em>', '\s*');
+ # print "DEBUG: $line\n";
+ if ( $storefiles ) {
+ print HTMLSNIP "$line\n";
+ }
if ( ($msgurl, $subject, $author) =
- $line =~ m#<li><strong>.*href="(msg\d+\.html)">(.+)</a></strong>\s*<em>(.+)</em>#gs ) {
+ $line =~ m#$SEP1[$type]([msg]*\d+\.html)$SEP2[$type](.+)$SEP3[$type](.+)$SEP4[$type]#gs ) {
$_ = $subject ;
$_ =~ s/^Re:\s*//i ; # Remove Re:
$_ =~ s/^\[[^\]]+\]\s*([^\s]+)/$1/ ; # Remove other list markers (but only if something is following)
@@ -229,31 +249,39 @@
}
}
} else {
- if ( ($messages, $page, $pages) = $line
- =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
- if ( $page != $pages ) { # handle following pages
- print "Warning: Page $page of $pages in $year/$month of $project\n";
- $page++;
- $url = "${URL}/${year}/${month}/thrd${page}.html";
+ if ( $type == 0 ) {
+ if ( ($messages, $page, $pages) = $line
+ =~ m#The last update .* There are (\d+) messages. Page (\d+) of (\d+).<br>#gs ) {
+ if ( $page != $pages ) { # handle following pages
+ print "Warning: Page $page of $pages in $year/$month of $project\n";
+ $page++;
+ $url = "${URL}/${year}/${month}/thrd${page}.html";
+ } else {
+ $url = '';
+ }
+ if ( $storefiles ) {
+ print HTMLSNIP "$messages Messages ($messagelines real messages, $spamlines SPAM, $robotlines messages by robots)\n";
+ }
+ if ( $messages != $messagelines + $spamlines + $robotlines ) {
+ print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n";
+ }
} else {
- $url = '';
+ unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
+ $linestart = $line;
+ ##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
+ } else {
+ if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
+ print "Potential SPAM line - no author: $project $year-$month\n";
+ $spamlines++ ;
+ } else {
+ print "Warning: unknown Line: $line\n";
+ }
+ }
}
- if ( $storefiles ) {
- print HTMLSNIP "$messages Messages ($messagelines real messages, $spamlines SPAM, $robotlines messages by robots)\n";
- }
- if ( $messages != $messagelines + $spamlines + $robotlines ) {
- print "Warning: $project $year/$month counted $messagelines Messages, $spamlines SPAM and $robotlines robots but page says $messages\n";
- }
} else {
- unless ( $line =~ /<\/em>\s*<\/li>\s*$/ ) { # sometimes there are continued lines ...
- $linestart = $line;
- ##next ; ##### ??????? if this line is missing line we get $linestart$linestart ...
- } else {
- if ( $line =~ /<em>\s*<\/em>\s*<\/li>\s*$/ ) { # sometimes SPAM has no sender ...
- print "Potential SPAM line - no author: $project $year-$month\n";
- $spamlines++ ;
- } else {
- print "Warning: unknown Line: $line\n";
+ if ( ($messages) = $line =~ m#^\s*<b>Messages:</b>\s*(\d+)<p>#gs ) {
+ if ( $storefiles ) {
+ print HTMLSNIP "$messages Messages ($messagelines real messages, $spamlines SPAM, $robotlines messages by robots)\n";
}
}
}
@@ -271,9 +299,16 @@
# Database has shown that Ralf Gsellenstetter is posting with several names
# in Debian Edu. This script cleans up this
-system("./0fix_ralf_edu");
+# system("./0fix_ralf_edu");
+$query = "UPDATE listarchive SET author = 'Ralf Gesellensetter' WHERE project = 'edu' AND author LIKE 'Ralf%setter';" ;
+$query = "UPDATE listarchive SET author = 'Vagrant Cascadian' WHERE project = 'edu' AND author LIKE '%vagrant%';" ;
+$daten = $dbh->prepare_cached($query);
+$daten->execute() ;
+$daten->finish() ;
+
+
# Just do the graphing of all lists we got
-foreach $project (@PROJECTS) {
+foreach $project (keys %ALLPROJECTS) {
system("./author_stats $project") ;
}
More information about the debian-med-commit
mailing list