drupal2spip_lal/drupal2spip_lal/base/management/commands/import.py

import argparse
import logging
import re

import requests
from bs4 import BeautifulSoup as bs
from django.core.management.base import BaseCommand
from django.core.validators import URLValidator
from django.db.models import Q

from drupal2spip_lal.base.convert import convert_node
from drupal2spip_lal.drupal.models import Node, UrlAlias

logger = logging.getLogger('drupal2spip_lal')


def validate_url(url):
    """Argparse 'type' callable: validate the URL, then fetch and return its HTML."""
    try:
        URLValidator(schemes=['http', 'https'])(url)
        return requests.get(url).content.decode()
    except Exception as e:
        raise argparse.ArgumentTypeError(e)


def nodes_from_index(html):
    """Extract Drupal node pks from the april.org links found in `html`."""
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )

    def april_link(tag):
        return tag.name == 'a' and aprilorg_url.match(
            tag.attrs.get('href', '')
        )

    soup = bs(html, 'html.parser')
    aprilorg_href = [
        a.attrs.get('href', '') for a in soup.find_all(april_link)
    ]
    aprilorg_uri = [
        aprilorg_url.match(href).group('uri') for href in aprilorg_href
    ]
    nodes_alias = UrlAlias.objects.filter(
        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
    ).values_list('src', flat=True)
    # url_alias sources are expected to look like 'node/<pk>'.
    nodes_pk = [
        int(re.match(r'node/(?P<pk>\d+)', src).group('pk'))
        for src in nodes_alias
    ]
    return nodes_pk


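# Illustrative sketch only (assumed values, not taken from the project's
# data): Drupal's url_alias table stores the canonical path in `src` and the
# public alias in `dst`, e.g. UrlAlias(src='node/123', dst='campagne/exemple'),
# so a scraped link to https://www.april.org/campagne/exemple resolves to
# node pk 123 through the query in nodes_from_index() above.

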
class Command(BaseCommand):
    help = "Import Drupal nodes to SPIP articles."

    def add_arguments(self, parser):
        parser.add_argument(
            '--user',
            nargs='*',
            type=str,
            help='Selects users whose nodes are to be imported. '
            'Default is nobody.',
        )
        parser.add_argument(
            '--node',
            nargs='*',
            type=int,
            help='Selects which nodes are to be imported. Default is none.',
        )
        parser.add_argument(
            '--from-index',
            type=validate_url,
            help='Selects nodes to be imported by scraping the list at the '
            'given URL. Default is none.',
        )
        parser.add_argument(
            '--update',
            action='store_true',
            help='Force existing articles to be updated. Default is skip.',
        )
        parser.add_argument(
            '--force-download',
            action='store_true',
            help='Force existing resources to be downloaded. '
            'Default is skip.',
        )

    def handle(self, **options):
        qs = Node.objects.none()
        if options['node']:
            qs |= Node.objects.filter(pk__in=options['node'])
        if options['user']:
            qs |= Node.objects.filter(user__name__in=options['user'])
        if options['from_index']:
            # --from-index was fetched by validate_url, so the option already
            # holds the raw HTML of the index page.
            qs |= Node.objects.filter(
                pk__in=nodes_from_index(options['from_index'])
            )
        for n in qs:
            try:
                convert_node(n, options)
            except Exception as e:
                logger.critical(
                    "Importing node {} failed: {}".format(n.pk, e)
                )

