feat(import): ajoute la possibilité d'import par une page d'index

This commit is contained in:
François Poulain 2020-08-15 17:50:38 +02:00
parent 83b6d71850
commit 3f20b966cd

View File

@ -1,13 +1,60 @@
import argparse
import logging
import re
from django.core.management.base import BaseCommand
from django.core.validators import URLValidator
from django.db.models import Q
import requests
from bs4 import BeautifulSoup as bs
from drupal2spip_lal.base.convert import convert_node
from drupal2spip_lal.drupal.models import Node
from drupal2spip_lal.drupal.models import Node, UrlAlias
logger = logging.getLogger('drupal2spip_lal')
def validate_url(url, timeout=30):
    """Validate *url*, download it and return the response body as text.

    Intended to be used as an ``argparse`` ``type=`` callable: despite its
    name it does not return the URL but the decoded content served at it.

    Args:
        url: Candidate URL; only the ``http`` and ``https`` schemes are
            accepted.
        timeout: Seconds to wait for the server before giving up, so a
            stalled host cannot hang argument parsing indefinitely.

    Returns:
        The response body decoded with ``bytes.decode()`` defaults (UTF-8).

    Raises:
        argparse.ArgumentTypeError: if the URL is rejected by the validator,
            the request fails or times out, the server answers with an HTTP
            error status, or the body cannot be decoded.
    """
    try:
        URLValidator(schemes=['http', 'https'])(url)
        response = requests.get(url, timeout=timeout)
        # An error page (404, 500, ...) must not be scraped as if it were
        # the requested index.
        response.raise_for_status()
        return response.content.decode()
    except Exception as e:
        # argparse only reports ArgumentTypeError/TypeError/ValueError as a
        # clean usage error, so every failure is funnelled into that type.
        raise argparse.ArgumentTypeError(e) from e
def nodes_from_index(html):
    """Return the primary keys of the nodes linked from an index page.

    Every ``<a href>`` in *html* that points to april.org (optionally with a
    ``www.`` host prefix and/or a ``fr/`` path prefix) is reduced to its
    path, and the paths are looked up in ``UrlAlias`` on both the ``src``
    and ``dst`` columns. The ``src`` of each matching alias is then parsed
    as ``node/<pk>``.

    Args:
        html: Markup of the index page, as a string.

    Returns:
        A list of integer node primary keys, in the order the aliases were
        returned by the database. Empty when no april.org link is found.
    """
    # The dot of the domain is escaped so that only "april.org" matches.
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )
    node_src = re.compile(r'node/(?P<pk>\d+)')

    soup = bs(html, 'html.parser')
    # Match each href once and keep the match objects, instead of matching
    # a first time to filter and a second time to extract the path.
    href_matches = (
        aprilorg_url.match(a.attrs.get('href', ''))
        for a in soup.find_all('a')
    )
    aprilorg_uri = sorted({m.group('uri') for m in href_matches if m})
    if not aprilorg_uri:
        # Nothing to look up: skip the database round-trip.
        return []

    nodes_alias = UrlAlias.objects.filter(
        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
    ).values_list('src', flat=True)

    # An alias whose src is not of the form "node/<pk>" (presumably other
    # Drupal paths can be aliased too -- TODO confirm) is skipped rather
    # than crashing on a failed match.
    src_matches = (node_src.match(src) for src in nodes_alias)
    return [int(m.group('pk')) for m in src_matches if m]
class Command(BaseCommand):
help = "Import Drupal nodes to SPIP articles."
@ -24,6 +71,12 @@ class Command(BaseCommand):
type=int,
help='Selects what nodes to be imported. Default is none.',
)
parser.add_argument(
'--from-index',
type=validate_url,
help='Selects nodes to be imported by scrapping list at given URL. '
'Default is none.',
)
parser.add_argument(
'--update',
action='store_true',
@ -41,6 +94,10 @@ class Command(BaseCommand):
qs |= Node.objects.filter(pk__in=options['node'])
if options['user']:
qs |= Node.objects.filter(user__name__in=options['user'])
if options['from_index']:
qs |= Node.objects.filter(
pk__in=nodes_from_index(options['from_index'])
)
for n in qs:
try: