188 lines
5.5 KiB
Perl
Executable File
188 lines
5.5 KiB
Perl
Executable File
#! /usr/bin/perl
|
|
|
|
# Name : parse_known_sites
|
|
# Author : theocrite
|
|
# Description : extracts the author, date and title of
# pages grabbed by alexandrie
|
|
#
|
|
|
|
use strict;
|
|
use LWP::Simple;
|
|
use DBI;
|
|
use Config::Simple;
|
|
|
|
# CONFIGURATION FILE
# Load the alexandrie settings. Direct method-call syntax is used instead of
# the indirect-object form ("new Class(...)"), which Perl can mis-parse.
my $alexandrieConf = Config::Simple->new("/etc/alexandrie/alexandrie.conf")
    or die "Missing configuration file.";
print "Configuration file loaded.\n";

# DATABASE
# Connection settings read from the "database" section of the configuration.
my $dsn          = $alexandrieConf->param("database.dsn");
my $db_user_name = $alexandrieConf->param("database.username");
my $db_password  = $alexandrieConf->param("database.password");
|
|
|
|
|
|
# Ordered table of the sites we know how to scrape. The first entry whose
# "url" regex matches the page URL wins, so order matters.
#
#   url    : regex tested against the page URL
#   site   : value stored in the "site" column
#   author : pattern whose first capture group is the author
#   title  : pattern whose first capture group is the title
#   date   : pattern whose first capture group is the publication date
#
# Patterns are matched case-insensitively with /m against the fetched page.
# An empty pattern means "this site does not expose that field".
# To support a new site, append a hash with the same five keys.
my @known_sites = (
    {
        url    => qr/pcinpact\.com/,
        site   => 'PC INpact',
        author => '',
        title  => '<title>(.*) - PC INpact<\/title>',
        date   => '<\/strong> le (.*) \(<span class="or">',
    },
    {
        url    => qr/zdnet\.fr/,
        site   => 'ZDNet.fr',
        author => '<p><cite site="Auteur">par <a href=".*">(.*)</a></cite>',
        title  => '<title>(.*) - Actualit.s - ZDNet\.fr<\/title>',
        date   => 'Publié le <span class="date">(.*)</span></p>',
    },
    {
        url    => qr/silicon\.fr/,
        site   => 'Silicon.fr',
        author => '<p class="author">\s+Par (.*?)\s+</p>',
        title  => '<title>(.*) Actualité --- Silicon\.fr --- <\/title>',
        date   => '<p class="date">(.*)</p>',
    },
    {
        url    => qr/cio-online\.com/,
        site   => 'CIO-Online',
        author => 'par <a href=".*" class="couleur">(.*)</a>',
        title  => '<title>(.*) - CIO-Online.*<\/title>',
        # Previous date pattern, kept for reference:
        # '<\/b><\/h3><br \/><b class="rouge">\((.*)\)<\/b>'
        date   => '<br \/><b class="couleur">Edition du (.*)<\/b> - par',
    },
    {
        url    => qr/lemonde\.fr/,
        site   => 'Le Monde.fr',
        author => '<div class=author><b>(.*)<\/b><\/div><',
        title  => '<title>(.*) - LeMonde.fr</title>',
        date   => '<div class="dateline">[^|]+\| (.*?) \|',
    },
    {
        url    => qr/lemondeinformatique\.fr/,
        site   => 'Le Monde Informatique',
        author => 'par <a href=".*" class="rouge">(.*)<\/a>',
        title  => '<title>(.*) - .* - Le Monde Informatique<\/title>',
        date   => '<br \/><b class="rouge">Edition du (.*)<\/b>',
    },
    {
        url    => qr/ecrans\.fr/,
        site   => 'Ecrans',
        author => '<p class="auteur"> par <a href=".*">(.*)<\/a>',
        title  => '<title>(.*)- Ecrans',
        date   => '<p class="date">(.*?)\s*<\/p>',
    },
    {
        url    => qr/numerama\.com/,
        site   => 'Numerama',
        author => '<a href="/contact/articles.html" style="text-decoration:underline;color:#929292">(.*)</a> -',
        title  => '<title>(.*) - Numerama<\/title>',
        date   => 'publié le (.*) - ',
    },
    {
        url    => qr/01net\.com/,
        site   => '01net informatique',
        author => '<a .*href=.mailto:commentaires@01net.fr?.* style="text-decoration:.*;">(.*)<\/a>',
        title  => '<title>(.*)<\/title>',
        date   => '">le (.*?)<\/',
    },
    {
        url    => qr/lemagit\.fr/,
        site   => 'LeMagIT',
        author => '- par <a href=".*">(.*)<\/a>',
        title  => '<title>(.*)<\/title>',
        date   => '<p class="auteur_date typo_auteur"> Le (.*) - par',
    },
    {
        url    => qr/www\.neteco\.com/,
        site   => 'NetEco',
        author => 'Publi. par <b><a href=\S+ title="(.*?)">',
        title  => '<title>(.*) par Neteco.com</title>',
        date   => '<\/a><\/b> le <b>(.*?)<\/b>',
    },
    {
        # Marked BROKEN by the original author: these patterns are not
        # expected to match journaldunet pages reliably.
        url    => qr/www\.journaldunet\.com/,
        site   => 'LeJournalduNet',
        author => '<div id="signature"><a href=".*?" rel="nofollow">(.*?) Journal du Net<\/a> <\/div>',
        title  => '<title>([^-]+) -',
        date   => 'span class="date_publication">Publié le (.*)<\/span>',
    },
    {
        url    => qr/www\.generation-nt\.com/,
        site   => 'Génération nouvelles technologies',
        author => 'par <a href="/divers/contacts.php">(.*)</a> ',
        title  => '<title>(.*)<\/title>',
        date   => '(.*) par <a href="/divers/contacts.php">',
    },
    {
        url    => qr/lepoint\.fr/,
        site   => 'Le point',
        # NOTE(review): this pattern has no capture group, so the stored
        # author is always undef. Presumably a fixed author name was
        # intended - confirm before changing.
        author => 'La rédaction',
        title  => '<title>(.*), actualitéeacute</title>',
        date   => 'Publié\s+le (.*?)\s+-',
    },
);

# Return the first capture group of $pattern matched against $text, or undef
# when the pattern is empty, does not match, or has no capture group.
#
# The explicit checks matter:
#  - an empty pattern would make Perl silently reuse the last successfully
#    matched regex instead of matching nothing;
#  - reading $1 after a failed match would return the capture of an earlier,
#    unrelated match.
sub _first_capture
{
    my ($text, $pattern) = @_;

    return if !defined $pattern || $pattern eq '';

    if ($text =~ /$pattern/im)
    {
        return $1;
    }
    return;
}

# Fetch one press page and store its metadata in the "presse" table.
#
# Parameters:
#   $url - address of the page to download and parse
#   $id  - id of the "presse" row to update
#
# The site, author, title and date columns of row $id are updated. Fields
# that cannot be extracted are stored as NULL. Nothing is written when the
# URL belongs to no known site or when the page cannot be downloaded.
# Dies if the database connection or statement preparation fails.
sub parse_page
{
    my ($url, $id) = @_;

    print "parsing $url\n";

    # First matching rule wins; unknown sites are skipped. A plain return is
    # used here: leaving a sub with "next" only works when the caller happens
    # to be inside a loop.
    my ($rule) = grep { $url =~ $_->{url} } @known_sites;
    return if !$rule;

    my $contents = get($url);
    if (!defined $contents)
    {
        # Do not overwrite existing metadata with NULLs on a failed download.
        warn "Could not fetch $url, leaving row $id untouched\n";
        return;
    }

    my $author = _first_capture($contents, $rule->{author});
    my $title  = _first_capture($contents, $rule->{title});
    my $date   = _first_capture($contents, $rule->{date});

    # connect_cached reuses a single connection across calls instead of
    # opening a new, never-closed connection for every page.
    my $dbh = DBI->connect_cached($dsn, $db_user_name, $db_password)
        or die "Cannot connect to database: $DBI::errstr\n";

    my $update = 'UPDATE presse SET date=?, auteur=?, titre=?, site=? WHERE id=?';
    my $sth = $dbh->prepare($update)
        or die "Cannot prepare update: " . $dbh->errstr . "\n";
    $sth->execute($date, $author, $title, $rule->{site}, $id);
    $sth->finish();    # we're done with this query

    return;
}
|
|
|
|
|
|
#my $url=$ARGV[0];
|
|
#parse_page $url;
|
|
# Main driver: parse every row of "presse", or only the row whose id is
# given as the first command-line argument.
my $id = $ARGV[0];

# The id comes from the command line, so it is passed as a bound parameter
# rather than being pasted into the SQL text.
my $select = 'SELECT url, id FROM presse';
my @bind;
if (defined $id && $id ne '')
{
    $select .= ' WHERE id=?';
    push @bind, $id;
}

my $dbh = DBI->connect($dsn, $db_user_name, $db_password)
    or die "Cannot connect to database: $DBI::errstr\n";
my $sth = $dbh->prepare($select)
    or die "Cannot prepare select: " . $dbh->errstr . "\n";

if ($sth->execute(@bind))
{
    # Each row is (url, id); hand both to the parser.
    while (my @found = $sth->fetchrow_array())
    {
        parse_page($found[0], $found[1]);
    }
}
$sth->finish();    # we're done with this query
$dbh->disconnect();
|