import argparse
import logging
import re

from django.core.management.base import BaseCommand
from django.core.validators import URLValidator
from django.db.models import Q

import requests
from bs4 import BeautifulSoup as bs

from drupal2spip_lal.base.convert import convert_node
from drupal2spip_lal.drupal.models import Node, UrlAlias

logger = logging.getLogger('drupal2spip_lal')
|
def validate_url(url):
    """Validate *url* and return the decoded body of the page it serves.

    Used as an argparse ``type=`` callback for ``--from-index``: any
    failure (malformed URL, network error, HTTP error status,
    undecodable body) is converted into ``argparse.ArgumentTypeError``
    so argparse reports it as a clean usage error.
    """
    try:
        URLValidator(schemes=['http', 'https'])(url)
        # timeout= keeps the command from hanging forever on an
        # unresponsive host; raise_for_status() turns HTTP 4xx/5xx into
        # an argument error instead of silently scraping an error page.
        response = requests.get(url, timeout=30)
        response.raise_for_status()
        return response.content.decode()
    except Exception as e:
        raise argparse.ArgumentTypeError(e)
|
|
|
|
|
|
def nodes_from_index(html):
    """Scrape *html* for links to april.org pages and return the pks of
    the Drupal nodes those pages correspond to.

    Each scraped URI is looked up in Drupal's ``url_alias`` table (both
    as alias source and destination); the node pk is extracted from the
    canonical ``node/<pk>`` source path of the matching aliases.
    """
    # Dots escaped: the original pattern's bare '.' would also match
    # e.g. 'aprilXorg'.
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )

    def april_link(tag):
        # bs4 filter: keep only <a> tags whose href points to april.org.
        return tag.name == 'a' and aprilorg_url.match(
            tag.attrs.get('href', '')
        )

    soup = bs(html, 'html.parser')

    aprilorg_href = [
        a.attrs.get('href', '') for a in soup.find_all(april_link)
    ]

    # Safe: every href here already matched aprilorg_url in april_link.
    aprilorg_uri = [
        aprilorg_url.match(href).group('uri') for href in aprilorg_href
    ]

    nodes_alias = UrlAlias.objects.filter(
        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
    ).values_list('src', flat=True)

    # Compile once outside the loop, and skip aliases whose src is not
    # in canonical 'node/<pk>' form instead of crashing on a None match
    # (the Q(src__in=...) branch can match non-node alias sources).
    node_src = re.compile(r'node/(?P<pk>\d+)')
    nodes_pk = []
    for src in nodes_alias:
        match = node_src.match(src)
        if match:
            nodes_pk.append(int(match.group('pk')))

    return nodes_pk
|
|
|
|
|
|
class Command(BaseCommand):
    """Management command importing Drupal nodes into SPIP articles."""

    help = "Import Drupal nodes to SPIP articles."

    def add_arguments(self, parser):
        """Register the command-line options of the import command."""
        parser.add_argument(
            '--user',
            nargs='*',
            type=str,
            help='Selects users nodes to be imported. Default is nobody.',
        )
        parser.add_argument(
            '--node',
            nargs='*',
            type=int,
            help='Selects what nodes to be imported. Default is none.',
        )
        parser.add_argument(
            '--from-index',
            # validate_url both validates the URL and fetches it, so
            # options['from_index'] ends up holding the page's HTML.
            type=validate_url,
            help='Selects nodes to be imported by scrapping list at given URL. '
            'Default is none.',
        )
        parser.add_argument(
            '--update',
            action='store_true',
            help='Force existing articles to be updated. Default is skip.',
        )
        parser.add_argument(
            '--force-download',
            action='store_true',
            help='Force existing ressources to be downloaded. Default is skip.',
        )

    def handle(self, **options):
        """Build the queryset of nodes selected by the options and
        convert each one, logging failures without aborting the run."""
        # Union of the three selection mechanisms; empty by default.
        qs = Node.objects.none()
        if options['node']:
            qs |= Node.objects.filter(pk__in=options['node'])
        if options['user']:
            qs |= Node.objects.filter(user__name__in=options['user'])
        if options['from_index']:
            qs |= Node.objects.filter(
                pk__in=nodes_from_index(options['from_index'])
            )

        for n in qs:
            try:
                convert_node(n, options)
            except Exception as e:
                # Lazy %-style args (logging idiom) and exc_info=True so
                # the traceback of the failed import is recorded.
                logger.critical(
                    "L'import du node %s a échoué : %s",
                    n.pk,
                    e,
                    exc_info=True,
                )
|