#! /usr/bin/perl
# Name        : parse_known_sites
# Author      : theocrite
# Description : parses (gets author, date & title) of
#               pages grabbed by alexandrie
#
# Usage       : parse_known_sites [ID]
#               With an ID, only the `presse` row with that id is processed;
#               without one, every row of `presse` is processed.
#
use strict;
use warnings;
use LWP::Simple;
use DBI;

my $dsn          = 'DBI:mysql:drupal6:172.16.0.7';
my $db_user_name = 'drupal';
my $db_password  = 'xxxxxxx';

# Single database connection shared by the whole run; opened on first use.
my $dbh;

# Ordered list of per-site scraping rules. The first rule whose `url` regex
# matches the page URL wins. `author`, `title` and `date` are regex sources
# (applied with /im); the first capture group of each is the extracted value.
# An empty pattern means "this site does not expose that field".
#
# NOTE(review): patterns containing accented characters are raw source bytes
# (the file does not `use utf8`); confirm they still match what
# LWP::Simple::get returns for these sites.
#
# To support a new site, append a rule:
#   { url => qr/example\.com/, site => '...', author => '...',
#     title => '...', date => '...' },
my @SITE_RULES = (
    {
        url    => qr/pcinpact\.com/,
        site   => 'PC INpact',
        author => '',
        title  => '(.*) - PC INpact<\/title>',
        date   => '<\/strong> le (.*) \(<span class="or">',
    },
    {
        url    => qr/zdnet\.fr/,
        site   => 'ZDNet.fr',
        author => '<p><cite site="Auteur">par <a href=".*">(.*)</a></cite>',
        title  => '<title>(.*) - Actualit.s - ZDNet\.fr<\/title>',
        date   => 'Publié le <span class="date">(.*)</span></p>',
    },
    {
        url    => qr/silicon\.fr/,
        site   => 'Silicon.fr',
        author => '<p class="author">\s+Par (.*?)\s+</p>',
        title  => '<title>(.*) Actualité --- Silicon\.fr --- <\/title>',
        date   => '<p class="date">(.*)</p>',
    },
    {
        url    => qr/cio-online\.com/,
        site   => 'CIO-Online',
        author => 'par <a href=".*" class="couleur">(.*)</a>',
        title  => '<title>(.*) - CIO-Online.*<\/title>',
        # Previous date pattern, kept for reference:
        #   '<\/b><\/h3><br \/><b class="rouge">\((.*)\)<\/b>'
        date   => '<br \/><b class="couleur">Edition du (.*)<\/b> - par',
    },
    {
        url    => qr/lemonde\.fr/,
        site   => 'Le Monde.fr',
        author => '<div class=author><b>(.*)<\/b><\/div><',
        title  => '<title>(.*) - LeMonde.fr',
        # The pattern deliberately starts with a newline, as in the original.
        date   => "\n" . '[^|]+\| (.*?) \|',
    },
    {
        url    => qr/lemondeinformatique\.fr/,
        site   => 'Le Monde Informatique',
        author => 'par (.*)<\/a>',
        title  => '(.*) - .* - Le Monde Informatique<\/title>',
        date   => '<br \/><b class="rouge">Edition du (.*)<\/b>',
    },
    {
        url    => qr/ecrans\.fr/,
        site   => 'Ecrans',
        author => '<p class="auteur"> par <a href=".*">(.*)<\/a>',
        title  => '<title>(.*)- Ecrans',
        date   => '<p class="date">(.*?)\s*<\/p>',
    },
    {
        url    => qr/numerama\.com/,
        site   => 'Numerama',
        author => '<a href="/contact/articles.html" style="text-decoration:underline;color:#929292">(.*)</a> -',
        title  => '<title>(.*) - Numerama<\/title>',
        date   => 'publié le (.*) - ',
    },
    {
        url    => qr/01net\.com/,
        site   => '01net informatique',
        author => '<a .*href=.mailto:commentaires@01net.fr?.* style="text-decoration:.*;">(.*)<\/a>',
        title  => '<title>(.*)<\/title>',
        date   => '">le (.*?)<\/',
    },
    {
        url    => qr/lemagit\.fr/,
        site   => 'LeMagIT',
        author => '- par <a href=".*">(.*)<\/a>',
        title  => '<title>(.*)<\/title>',
        date   => '<p class="auteur_date typo_auteur"> Le (.*) - par',
    },
    {
        url    => qr/www\.neteco\.com/,
        site   => 'NetEco',
        author => 'Publi. par <b><a href=\S+ title="(.*?)">',
        title  => '<title>(.*) par Neteco.com',
        date   => '<\/a><\/b> le (.*?)<\/b>',
    },
    {
        # Marked BROKEN by the original author: these journaldunet patterns
        # are not expected to extract reliable values.
        url    => qr/www\.journaldunet\.com/,
        site   => 'LeJournalduNet',
        # The pattern deliberately starts with a newline, as in the original.
        author => "\n" . '(.*?) Journal du Net<\/a> <\/div>',
        title  => '([^-]+) -',
        date   => 'span class="date_publication">Publié le (.*)<\/span>',
    },
    {
        url    => qr/www\.generation-nt\.com/,
        site   => 'Génération nouvelles technologies',
        author => 'par <a href="/divers/contacts.php">(.*)</a>  ',
        title  => '<title>(.*)<\/title>',
        date   => '(.*) par <a href="/divers/contacts.php">',
    },
    {
        url    => qr/lepoint\.fr/,
        site   => 'Le point',
        # No capture group: the author is never extracted for this site.
        author => 'La rédaction',
        title  => '<title>(.*), actualitéeacute',
        date   => 'Publié\s+le (.*?)\s+-',
    },
);

# Returns the shared database handle, connecting on first call.
# Dies with the DBI error message if the connection cannot be established.
sub _dbh {
    $dbh ||= DBI->connect($dsn, $db_user_name, $db_password)
        or die "Cannot connect to database '$dsn': $DBI::errstr\n";
    return $dbh;
}

# Returns the first rule of @SITE_RULES whose URL regex matches $url,
# or nothing when the site is not known.
sub _site_rule_for {
    my ($url) = @_;
    for my $rule (@SITE_RULES) {
        return $rule if $url =~ $rule->{url};
    }
    return;
}

# Returns the first capture group of $pattern (matched with /im) in
# $contents, or undef when the pattern is empty, does not match, or has no
# capture group.
sub _first_capture {
    my ($contents, $pattern) = @_;

    # An empty interpolated pattern would make Perl silently reuse the last
    # successful regex, so it must never reach the match operator.
    return if !defined $pattern || $pattern eq '';

    # Only read $1 after a successful match; after a failed match it would
    # still hold the capture of an earlier, unrelated match.
    if ($contents =~ /$pattern/im) {
        return $1;
    }
    return;
}

# Fetches one page, extracts author/title/date using the rule for its site
# and stores them (with the site name) in the `presse` row identified by $id.
#
# Parameters:
#   $url - address of the page to fetch and parse
#   $id  - primary key of the `presse` row to update
#
# Returns 1 when the row update was issued successfully, 0 when the update
# failed, and nothing when the page was skipped (unknown site, undefined URL
# or download failure). Fields that could not be extracted are stored as NULL.
sub parse_page {
    my ($url, $id) = @_;

    return if !defined $url;
    print "parsing $url\n";

    # Unknown site: nothing to do. (A plain return, so this sub no longer
    # relies on being called from inside the caller's loop.)
    my $rule = _site_rule_for($url)
        or return;

    my $contents = get($url);
    if (!defined $contents) {
        # Do not overwrite existing metadata with NULLs on a failed download.
        warn "could not fetch $url; row left untouched\n";
        return;
    }

    my $author = _first_capture($contents, $rule->{author});
    my $title  = _first_capture($contents, $rule->{title});
    my $date   = _first_capture($contents, $rule->{date});

    my $updated = _dbh()->do(
        'UPDATE presse SET date=?, auteur=?, titre=?, site=? WHERE id=?',
        undef,
        $date, $author, $title, $rule->{site}, $id,
    );
    return defined $updated ? 1 : 0;
}

# --- main -----------------------------------------------------------------

my $wanted_id = $ARGV[0];

# The id comes from the command line, so it is bound as a placeholder
# instead of being concatenated into the SQL text.
my $select = 'SELECT url, id FROM presse';
my @bind;
if (defined $wanted_id && $wanted_id ne '') {
    $select .= ' WHERE id=?';
    @bind = ($wanted_id);
}

# Read the whole work list first so the SELECT is finished before the
# per-page UPDATEs run on the same connection.
my $rows = _dbh()->selectall_arrayref($select, undef, @bind)
    or die 'Cannot list pages: ' . _dbh()->errstr . "\n";

for my $row (@{$rows}) {
    my ($page_url, $page_id) = @{$row};
    parse_page($page_url, $page_id);
}

_dbh()->disconnect();