feat(import): add the ability to import from an index page
parent 83b6d71850
commit 3f20b966cd
@@ -1,13 +1,60 @@
+import argparse
 import logging
+import re
 
 from django.core.management.base import BaseCommand
+from django.core.validators import URLValidator
+from django.db.models import Q
+
+import requests
+from bs4 import BeautifulSoup as bs
 
 from drupal2spip_lal.base.convert import convert_node
-from drupal2spip_lal.drupal.models import Node
+from drupal2spip_lal.drupal.models import Node, UrlAlias
 
 logger = logging.getLogger('drupal2spip_lal')
 
 
+def validate_url(url):
+    try:
+        URLValidator(schemes=['http', 'https'])(url)
+        return requests.get(url).content.decode()
+    except Exception as e:
+        raise argparse.ArgumentTypeError(e)
+
+
+def nodes_from_index(html):
+    aprilorg_url = re.compile(
+        r'^https?://(www\.)?april.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
+    )
+
+    def april_link(tag):
+        return tag.name == 'a' and aprilorg_url.match(
+            tag.attrs.get('href', '')
+        )
+
+    soup = bs(html, 'html.parser')
+
+    aprilorg_href = [
+        a.attrs.get('href', '') for a in soup.find_all(april_link)
+    ]
+
+    aprilorg_uri = [
+        aprilorg_url.match(href).group('uri') for href in aprilorg_href
+    ]
+
+    nodes_alias = UrlAlias.objects.filter(
+        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
+    ).values_list('src', flat=True)
+
+    nodes_pk = [
+        int(re.match(r'node/(?P<pk>\d+)', src).group('pk'))
+        for src in nodes_alias
+    ]
+
+    return nodes_pk
+
+
 class Command(BaseCommand):
     help = "Import Drupal nodes to SPIP articles."
 
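A note on the two helpers added above: validate_url doubles as an argparse type callback, so a malformed URL is rejected at option-parsing time and a valid one is fetched immediately, its decoded body becoming the option's value. nodes_from_index then walks that HTML for april.org links and maps them back to Drupal node primary keys through UrlAlias, matching on either src or dst, presumably because an index page may link either the canonical node/<pk> path or its human-readable alias. A minimal sketch of the scraping half only (the UrlAlias lookup needs the Drupal database and is skipped; the sample HTML is invented for illustration):

    import re

    from bs4 import BeautifulSoup as bs

    # Same pattern as in the commit: captures the path part of april.org URLs.
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )

    # Invented index page: one matching link, one foreign link.
    html = """
    <a href="https://www.april.org/fr/un-article">kept</a>
    <a href="https://example.org/ailleurs">ignored</a>
    """

    soup = bs(html, 'html.parser')
    matches = [
        aprilorg_url.match(a.attrs.get('href', ''))
        for a in soup.find_all('a')
    ]
    print([m.group('uri') for m in matches if m])  # ['un-article']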
@@ -24,6 +71,12 @@ class Command(BaseCommand):
             type=int,
             help='Selects what nodes to be imported. Default is none.',
         )
+        parser.add_argument(
+            '--from-index',
+            type=validate_url,
+            help='Selects nodes to be imported by scraping list at given '
+            'URL. Default is none.',
+        )
         parser.add_argument(
             '--update',
             action='store_true',
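Worth noting: an argparse type callback's return value replaces the raw string, which is why --from-index carries the downloaded HTML rather than the URL by the time handle() reads options['from_index']. A self-contained illustration of that mechanism (toy parser and callback, not from the project):

    import argparse

    def shout(value):
        # Whatever a type callback returns becomes the parsed value.
        return value.upper()

    parser = argparse.ArgumentParser()
    parser.add_argument('--word', type=shout)
    print(parser.parse_args(['--word', 'hello']).word)  # prints: HELLO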
@@ -41,6 +94,10 @@ class Command(BaseCommand):
             qs |= Node.objects.filter(pk__in=options['node'])
         if options['user']:
             qs |= Node.objects.filter(user__name__in=options['user'])
+        if options['from_index']:
+            qs |= Node.objects.filter(
+                pk__in=nodes_from_index(options['from_index'])
+            )
 
         for n in qs:
             try:
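Because every selection option extends qs with |=, the new --from-index flag composes freely with --node, --user, and --update in a single run. A hypothetical invocation (both the command name import_drupal and the URL are placeholders, since neither the command's file name nor a real index URL appears in this diff):

    python manage.py import_drupal --from-index 'https://www.april.org/articles' --update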