import argparse
import logging
import re

import requests
from bs4 import BeautifulSoup as bs
from django.core.management.base import BaseCommand
from django.core.validators import URLValidator
from django.db.models import Q

from drupal2spip_lal.base.convert import convert_node
from drupal2spip_lal.drupal.models import Node, UrlAlias

logger = logging.getLogger('drupal2spip_lal')


def validate_url(url):
    """Validate the URL, fetch it and return the decoded body.

    Raising ArgumentTypeError lets argparse report the failure itself.
    """
    try:
        URLValidator(schemes=['http', 'https'])(url)
        return requests.get(url).content.decode()
    except Exception as e:
        raise argparse.ArgumentTypeError(e)


def nodes_from_index(html):
    """Extract Drupal node pks from april.org links found in an HTML index."""
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )

    def april_link(tag):
        return tag.name == 'a' and aprilorg_url.match(
            tag.attrs.get('href', '')
        )

    soup = bs(html, 'html.parser')
    aprilorg_href = [
        a.attrs.get('href', '') for a in soup.find_all(april_link)
    ]
    aprilorg_uri = [
        aprilorg_url.match(href).group('uri') for href in aprilorg_href
    ]
    # A URI may be either the alias source or its destination; UrlAlias.src
    # is expected to be of the form 'node/<pk>'.
    nodes_alias = UrlAlias.objects.filter(
        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
    ).values_list('src', flat=True)
    nodes_pk = [
        int(re.match(r'node/(?P<pk>\d+)', src).group('pk'))
        for src in nodes_alias
    ]
    return nodes_pk


class Command(BaseCommand):
    help = "Import Drupal nodes to SPIP articles."

    def add_arguments(self, parser):
        parser.add_argument(
            '--user',
            nargs='*',
            type=str,
            help='Select nodes to import by author name. Default is nobody.',
        )
        parser.add_argument(
            '--node',
            nargs='*',
            type=int,
            help='Select nodes to import by primary key. Default is none.',
        )
        parser.add_argument(
            '--from-index',
            type=validate_url,
            help='Select nodes to import by scraping the list at the given '
            'URL. Default is none.',
        )
        parser.add_argument(
            '--update',
            action='store_true',
            help='Force existing articles to be updated. Default is skip.',
        )
        parser.add_argument(
            '--force-download',
            action='store_true',
            help='Force existing resources to be downloaded again. '
            'Default is skip.',
        )

    def handle(self, **options):
        # Build the queryset as a union of the three selectors.
        qs = Node.objects.none()
        if options['node']:
            qs |= Node.objects.filter(pk__in=options['node'])
        if options['user']:
            qs |= Node.objects.filter(user__name__in=options['user'])
        if options['from_index']:
            qs |= Node.objects.filter(
                pk__in=nodes_from_index(options['from_index'])
            )
        for n in qs:
            try:
                convert_node(n, options)
            except Exception as e:
                logger.critical(
                    "Import of node {} failed: {}".format(n.pk, e)
                )
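

# Usage sketch, assuming this module is installed as a management command
# named 'import_nodes' (Django derives the actual name from the file name,
# which is an assumption here):
#
#   python manage.py import_nodes --node 12 34
#   python manage.py import_nodes --user alice --update
#   python manage.py import_nodes --from-index https://www.april.org/fr/index
#
# The --node, --user and --from-index selectors combine as a union of
# querysets; --update and --force-download are passed through to
# convert_node via the options dict.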