import argparse
import logging
import re

from django.core.management.base import BaseCommand
from django.core.validators import URLValidator
from django.db.models import Q

import requests
from bs4 import BeautifulSoup as bs

from drupal2spip_lal.base.convert import convert_node
from drupal2spip_lal.drupal.models import Node, UrlAlias

logger = logging.getLogger('drupal2spip_lal')
|
def validate_url(url):
    """Validate *url* and return the decoded body of the page it serves.

    Used as an argparse ``type=`` callback for ``--from-index``: any
    failure (malformed URL, network error, HTTP error status,
    undecodable body) is converted into ``argparse.ArgumentTypeError``
    so argparse reports it as a clean usage error.
    """
    try:
        URLValidator(schemes=['http', 'https'])(url)
        # timeout= keeps the command from hanging forever on an
        # unresponsive host; raise_for_status() turns HTTP 4xx/5xx into
        # an argument error instead of silently scraping an error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.content.decode()
    except Exception as e:
        raise argparse.ArgumentTypeError(e)
|
|
|
|
|
|
def nodes_from_index(html):
    """Scrape *html* for links to april.org pages and return the pks of
    the Drupal nodes those pages correspond to.

    Each scraped URI is looked up in Drupal's ``url_alias`` table (both
    as alias source and destination); the node pk is extracted from the
    canonical ``node/<pk>`` source path of the matching aliases.
    """
    # Dots escaped: the original pattern's bare '.' would also match
    # e.g. 'aprilXorg'.
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )

    def april_link(tag):
        # bs4 filter: keep only <a> tags whose href points to april.org.
        return tag.name == 'a' and aprilorg_url.match(
            tag.attrs.get('href', '')
        )

    soup = bs(html, 'html.parser')

    aprilorg_href = [
        a.attrs.get('href', '') for a in soup.find_all(april_link)
    ]

    # Safe: every href here already matched aprilorg_url in april_link.
    aprilorg_uri = [
        aprilorg_url.match(href).group('uri') for href in aprilorg_href
    ]

    nodes_alias = UrlAlias.objects.filter(
        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
    ).values_list('src', flat=True)

    # Compile once outside the loop, and skip aliases whose src is not
    # in canonical 'node/<pk>' form instead of crashing on a None match
    # (the Q(src__in=...) branch can match non-node alias sources).
    node_src = re.compile(r'node/(?P<pk>\d+)')
    nodes_pk = []
    for src in nodes_alias:
        match = node_src.match(src)
        if match:
            nodes_pk.append(int(match.group('pk')))

    return nodes_pk
|
|
|
|
|
|
class Command(BaseCommand):
    """Management command importing Drupal nodes into SPIP articles."""

    help = "Import Drupal nodes to SPIP articles."

    def add_arguments(self, parser):
        """Register the command-line options of the import command."""
        parser.add_argument(
            '--user',
            nargs='*',
            type=str,
            help='Selects users nodes to be imported. Default is nobody.',
        )
        parser.add_argument(
            '--node',
            nargs='*',
            type=int,
            help='Selects what nodes to be imported. Default is none.',
        )
        parser.add_argument(
            '--from-index',
            # validate_url both validates the URL and fetches it, so
            # options['from_index'] ends up holding the page's HTML.
            type=validate_url,
            help='Selects nodes to be imported by scrapping list at given URL. '
            'Default is none.',
        )
        parser.add_argument(
            '--update',
            action='store_true',
            help='Force existing articles to be updated. Default is skip.',
        )
        parser.add_argument(
            '--force-download',
            action='store_true',
            help='Force existing ressources to be downloaded. Default is skip.',
        )

    def handle(self, **options):
        """Build the queryset of nodes selected by the options and
        convert each one, logging failures without aborting the run."""
        # Union of the three selection mechanisms; empty by default.
        qs = Node.objects.none()
        if options['node']:
            qs |= Node.objects.filter(pk__in=options['node'])
        if options['user']:
            qs |= Node.objects.filter(user__name__in=options['user'])
        if options['from_index']:
            qs |= Node.objects.filter(
                pk__in=nodes_from_index(options['from_index'])
            )

        for n in qs:
            try:
                convert_node(n, options)
            except Exception as e:
                # Lazy %-style args (logging idiom) and exc_info=True so
                # the traceback of the failed import is recorded.
                logger.critical(
                    "L'import du node %s a échoué : %s",
                    n.pk,
                    e,
                    exc_info=True,
                )
|