alexandrie/parse_known_sites

188 lines
5.5 KiB
Perl
Executable File

#! /usr/bin/perl
# Name : parse_known_sites
# Author : theocrite
# Description : extracts the author, date and title of
# pages grabbed by alexandrie
#
use strict;
use LWP::Simple;
use DBI;
use Config::Simple;
# CONFIGURATION FILE
# Load the alexandrie settings; the script aborts if the file cannot be loaded.
# The constructor is called as a class method (Config::Simple->new) rather than
# with the indirect-object form "new Config::Simple(...)", whose parsing depends
# on which subs happen to be in scope.
my $alexandrieConf = Config::Simple->new("/etc/alexandrie/alexandrie.conf") or die "Missing configuration file.";
print "Configuration file loaded.\n";
# DATABASE
# Connection settings handed to DBI->connect further down in this file.
my $dsn = $alexandrieConf->param("database.dsn");
my $db_user_name = $alexandrieConf->param("database.username");
my $db_password = $alexandrieConf->param("database.password");
# Returns the extraction rules (hash ref) of the first known site whose URL
# pattern matches $url, or nothing when the site is not supported.
#
# Each entry holds:
#   url    - regex tested against the page URL (entries are tried in order)
#   site   - site name stored in the database
#   author - regex source whose first capture group is the author
#   title  - regex source whose first capture group is the title
#   date   - regex source whose first capture group is the publication date
#
# The author/title/date values are plain strings compiled later with /im.
sub _site_rules_for
{
    my ($url) = @_;

    my @known_sites = (
        {
            url    => qr/pcinpact\.com/,
            site   => 'PC INpact',
            author => '',    # no author pattern: the author is left undefined
            title  => '<title>(.*) - PC INpact<\/title>',
            date   => '<\/strong> le (.*) \(<span class="or">',
        },
        {
            url    => qr/zdnet\.fr/,
            site   => 'ZDNet.fr',
            author => '<p><cite site="Auteur">par <a href=".*">(.*)</a></cite>',
            title  => '<title>(.*) - Actualit.s - ZDNet\.fr<\/title>',
            date   => 'Publi&eacute; le <span class="date">(.*)</span></p>',
        },
        {
            url    => qr/silicon\.fr/,
            site   => 'Silicon.fr',
            author => '<p class="author">\s+Par (.*?)\s+</p>',
            title  => '<title>(.*) Actualit&eacute; --- Silicon\.fr --- <\/title>',
            date   => '<p class="date">(.*)</p>',
        },
        {
            url    => qr/cio-online\.com/,
            site   => 'CIO-Online',
            author => 'par <a href=".*" class="couleur">(.*)</a>',
            title  => '<title>(.*) - CIO-Online.*<\/title>',
            # Previous date pattern, kept for reference:
            # '<\/b><\/h3><br \/><b class="rouge">\((.*)\)<\/b>'
            date   => '<br \/><b class="couleur">Edition du (.*)<\/b> - par',
        },
        {
            url    => qr/lemonde\.fr/,
            site   => 'Le Monde.fr',
            author => '<div class=author><b>(.*)<\/b><\/div><',
            title  => '<title>(.*) - LeMonde.fr</title>',
            date   => '<div class="dateline">[^|]+\| (.*?) \|',
        },
        {
            url    => qr/lemondeinformatique\.fr/,
            site   => 'Le Monde Informatique',
            author => 'par <a href=".*" class="rouge">(.*)<\/a>',
            title  => '<title>(.*) - .* - Le Monde Informatique<\/title>',
            date   => '<br \/><b class="rouge">Edition du (.*)<\/b>',
        },
        {
            url    => qr/ecrans\.fr/,
            site   => 'Ecrans',
            author => '<p class="auteur"> par <a href=".*">(.*)<\/a>',
            title  => '<title>(.*)- Ecrans',
            date   => '<p class="date">(.*?)\s*<\/p>',
        },
        {
            url    => qr/numerama\.com/,
            site   => 'Numerama',
            author => '<a href="/contact/articles.html" style="text-decoration:underline;color:#929292">(.*)</a> -',
            title  => '<title>(.*) - Numerama<\/title>',
            date   => 'publi&eacute; le (.*) - ',
        },
        {
            url    => qr/01net\.com/,
            site   => '01net informatique',
            author => '<a .*href=.mailto:commentaires@01net.fr?.* style="text-decoration:.*;">(.*)<\/a>',
            title  => '<title>(.*)<\/title>',
            date   => '">le (.*?)<\/',
        },
        {
            url    => qr/lemagit\.fr/,
            site   => 'LeMagIT',
            author => '- par <a href=".*">(.*)<\/a>',
            title  => '<title>(.*)<\/title>',
            date   => '<p class="auteur_date typo_auteur"> Le (.*) - par',
        },
        {
            url    => qr/www\.neteco\.com/,
            site   => 'NetEco',
            author => 'Publi. par <b><a href=\S+ title="(.*?)">',
            title  => '<title>(.*) par Neteco.com</title>',
            date   => '<\/a><\/b> le <b>(.*?)<\/b>',
        },
        {
            # Marked BROKEN by the original author: these patterns are not
            # known to match the site's markup reliably.
            url    => qr/www\.journaldunet\.com/,
            site   => 'LeJournalduNet',
            author => '<div id="signature"><a href=".*?" rel="nofollow">(.*?) Journal du Net<\/a> <\/div>',
            title  => '<title>([^-]+) -',
            date   => 'span class="date_publication">Publi&eacute; le (.*)<\/span>',
        },
        {
            url    => qr/www\.generation-nt\.com/,
            site   => 'Génération nouvelles technologies',
            author => 'par <a href="/divers/contacts.php">(.*)</a> &nbsp;',
            title  => '<title>(.*)<\/title>',
            date   => '(.*) par <a href="/divers/contacts.php">',
        },
        {
            url    => qr/lepoint\.fr/,
            site   => 'Le point',
            # NOTE(review): this pattern has no capture group, so the author
            # is always stored as undefined. It looks like a constant author
            # name may have been intended - confirm before changing.
            author => 'La rédaction',
            title  => '<title>(.*), actualit&eacute;eacute</title>',
            date   => 'Publi&eacute;\s+le (.*?)\s+-',
        },
        # Template for a new site:
        # {
        #     url    => qr//,
        #     site   => '',
        #     author => '',
        #     title  => '',
        #     date   => '',
        # },
    );

    for my $rules (@known_sites)
    {
        return $rules if $url =~ $rules->{url};
    }
    return;
}

# Returns the first capture group of $pattern (matched with /im) in $contents,
# or undef when the pattern is empty, does not match, or captures nothing.
sub _first_capture
{
    my ($contents, $pattern) = @_;

    # An interpolated empty pattern would make Perl silently reuse the last
    # successfully matched regex, so treat it explicitly as "nothing to find".
    return if !defined $pattern || $pattern eq '';

    # Only read $1 after a successful match: after a failed match it still
    # holds the capture of an earlier, unrelated match.
    if ($contents =~ /$pattern/im)
    {
        my $captured = $1;
        return $captured;
    }
    return;
}

# Fetches the page at $url, extracts its author, title and date with the rules
# of the matching known site, and stores them (plus the site name) in the
# "presse" row identified by $id.
#
# Parameters:
#   $url - address of the page to parse
#   $id  - value of presse.id for the row to update
#
# Pages from unknown sites and pages that cannot be fetched are skipped and
# leave the row untouched. Dies if the database connection or the statement
# preparation fails. Reads the file-level $dsn, $db_user_name and $db_password.
sub parse_page
{
    my $url = shift;
    my $id  = shift;
    print "parsing $url\n";

    # Unknown site: nothing to do. A plain return replaces the former "next",
    # which only worked by unwinding into the caller's loop.
    my $rules = _site_rules_for($url) or return;

    my $contents = get $url;
    if (!defined $contents)
    {
        # Do not overwrite existing metadata with NULLs when the fetch fails.
        warn "Could not fetch $url, skipping\n";
        return;
    }

    my $author = _first_capture($contents, $rules->{author});
    my $title  = _first_capture($contents, $rules->{title});
    my $date   = _first_capture($contents, $rules->{date});

    my $update = 'UPDATE presse SET date=?, auteur=?, titre=?, site=? WHERE id=?';
    my $dbh = DBI->connect($dsn, $db_user_name, $db_password)
        or die "Cannot connect to database: $DBI::errstr\n";
    my $sth = $dbh->prepare($update)
        or die "Cannot prepare update: " . $dbh->errstr . "\n";
    $sth->execute($date, $author, $title, $rules->{site}, $id);
    $sth->finish(); # we're done with this query
    # Close the per-call connection so that parsing many rows does not pile up
    # open database handles.
    $dbh->disconnect();
    return;
}
#my $url=$ARGV[0];
#parse_page $url;
# Main driver: parse every row of the "presse" table, or only the row whose id
# is given as the first command-line argument.
my $id = $ARGV[0];
my $select = 'SELECT url, id FROM presse';
my @bind_values;
if (defined $id && $id ne '')
{
    # Bind the id as a placeholder instead of interpolating the raw
    # command-line argument into the SQL text.
    $select .= ' WHERE id=?';
    @bind_values = ($id);
}
my $dbh = DBI->connect($dsn, $db_user_name, $db_password)
    or die "Cannot connect to database: $DBI::errstr\n";
my $sth = $dbh->prepare($select)
    or die "Cannot prepare select: " . $dbh->errstr . "\n";
if ( $sth->execute(@bind_values) )
{
    # Each fetched row is (url, id).
    while (my @found = $sth->fetchrow_array())
    {
        parse_page($found[0], $found[1]);
    }
}
$sth->finish(); # we're done with this query
$dbh->disconnect();