diff --git a/drupal2spip_lal/base/management/commands/import.py b/drupal2spip_lal/base/management/commands/import.py
index b988fb8..83a5ac0 100644
--- a/drupal2spip_lal/base/management/commands/import.py
+++ b/drupal2spip_lal/base/management/commands/import.py
@@ -1,13 +1,75 @@
+import argparse
 import logging
+import re
 
 from django.core.management.base import BaseCommand
+from django.core.validators import URLValidator
+from django.db.models import Q
+
+import requests
+from bs4 import BeautifulSoup as bs
 
 from drupal2spip_lal.base.convert import convert_node
-from drupal2spip_lal.drupal.models import Node
+from drupal2spip_lal.drupal.models import Node, UrlAlias
 
 logger = logging.getLogger('drupal2spip_lal')
 
 
+def validate_url(url):
+    """Argparse ``type`` callback: check *url* and return the page it serves.
+
+    The URL must use the http or https scheme.  The page is downloaded right
+    away and its decoded body is returned, so the option value handed to the
+    command is the HTML text rather than the URL itself.
+
+    Raises ``argparse.ArgumentTypeError`` if validation, download or
+    decoding fails, so that argparse reports it as a usage error.
+    """
+    try:
+        URLValidator(schemes=['http', 'https'])(url)
+        # A timeout keeps the command from hanging on an unresponsive host.
+        response = requests.get(url, timeout=30)
+        # Reject error pages (4xx/5xx) instead of scraping them as the index.
+        response.raise_for_status()
+        return response.content.decode()
+    except Exception as e:
+        raise argparse.ArgumentTypeError(e) from e
+
+
+def nodes_from_index(html):
+    """Return the primary keys of the Drupal nodes linked from *html*.
+
+    Every ``<a>`` whose href points to april.org (http or https, optional
+    ``www.`` host prefix, optional ``fr/`` path prefix) is collected, and the
+    path after that prefix is looked up in ``UrlAlias`` as either ``src`` or
+    ``dst``.  Matching rows whose ``src`` has the form ``node/<pk>``
+    contribute ``<pk>`` to the result; other rows are ignored.
+    """
+    aprilorg_url = re.compile(
+        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
+    )
+    node_src = re.compile(r'node/(?P<pk>\d+)')
+
+    soup = bs(html, 'html.parser')
+
+    # Match each href once and keep the match object, rather than matching
+    # a second time to extract the ``uri`` group.
+    href_matches = (
+        aprilorg_url.match(a.attrs.get('href', ''))
+        for a in soup.find_all('a')
+    )
+    aprilorg_uri = [m.group('uri') for m in href_matches if m]
+
+    nodes_alias = UrlAlias.objects.filter(
+        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
+    ).values_list('src', flat=True)
+
+    # Skip alias sources that are not of the form ``node/<pk>`` instead of
+    # failing with AttributeError on a ``None`` match.
+    src_matches = (node_src.match(src) for src in nodes_alias)
+    return [int(m.group('pk')) for m in src_matches if m]
+
+
 class Command(BaseCommand):
     help = "Import Drupal nodes to SPIP articles."
 
@@ -24,6 +86,12 @@ class Command(BaseCommand):
             type=int,
             help='Selects what nodes to be imported. Default is none.',
         )
+        parser.add_argument(
+            '--from-index',
+            type=validate_url,
+            help='Selects nodes to be imported by scrapping list at given URL. '
+            'Default is none.',
+        )
         parser.add_argument(
             '--update',
             action='store_true',
@@ -41,6 +109,10 @@ class Command(BaseCommand):
             qs |= Node.objects.filter(pk__in=options['node'])
         if options['user']:
             qs |= Node.objects.filter(user__name__in=options['user'])
+        if options['from_index']:
+            qs |= Node.objects.filter(
+                pk__in=nodes_from_index(options['from_index'])
+            )
 
         for n in qs:
             try: