feat(import): ajoute la possibilité d'import par une page d'index
This commit is contained in:
parent
83b6d71850
commit
3f20b966cd
@ -1,13 +1,60 @@
|
|||||||
|
import argparse
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
|
|
||||||
from django.core.management.base import BaseCommand
|
from django.core.management.base import BaseCommand
|
||||||
|
from django.core.validators import URLValidator
|
||||||
|
from django.db.models import Q
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup as bs
|
||||||
|
|
||||||
from drupal2spip_lal.base.convert import convert_node
|
from drupal2spip_lal.base.convert import convert_node
|
||||||
from drupal2spip_lal.drupal.models import Node
|
from drupal2spip_lal.drupal.models import Node, UrlAlias
|
||||||
|
|
||||||
logger = logging.getLogger('drupal2spip_lal')
|
logger = logging.getLogger('drupal2spip_lal')
|
||||||
|
|
||||||
|
|
||||||
|
def validate_url(url):
    """Argparse type: validate *url* and return the fetched page body.

    Used as the ``type=`` callable for the ``--from-index`` option, so any
    failure (invalid URL, network error, HTTP error status) is reported to
    the user as an ``argparse.ArgumentTypeError``.

    :param url: candidate http(s) URL given on the command line.
    :return: the decoded body of the page at *url*.
    :raises argparse.ArgumentTypeError: if the URL is malformed, the
        request fails or times out, or the server answers with an error
        status.
    """
    try:
        URLValidator(schemes=['http', 'https'])(url)
        # Without a timeout, requests.get() can block forever on an
        # unresponsive host; 30s is generous for a single index page.
        response = requests.get(url, timeout=30)
        # Fail loudly on 4xx/5xx instead of silently scraping an error
        # page; the HTTPError is converted to ArgumentTypeError below.
        response.raise_for_status()
        return response.content.decode()
    except Exception as e:
        # Broad catch is deliberate: argparse expects ArgumentTypeError
        # for any validation failure of this option.
        raise argparse.ArgumentTypeError(e)
|
||||||
|
|
||||||
|
|
||||||
|
def nodes_from_index(html):
    """Extract Drupal node primary keys from an april.org index page.

    Scrapes every ``<a>`` whose href points at april.org, resolves the
    linked URIs through the Drupal ``UrlAlias`` table, and returns the pks
    of the corresponding nodes.

    :param html: raw HTML of the index page (as returned by
        ``validate_url``).
    :return: list of node primary keys (``int``).
    """
    # Dot escaped so 'aprilXorg' does not match; accepts optional 'www.'
    # host prefix and optional 'fr/' language prefix before the URI.
    aprilorg_url = re.compile(
        r'^https?://(www\.)?april\.org/(fr/)?(?P<uri>[0-9a-z/-]+)'
    )
    # Compiled once instead of per-alias inside the loop below.
    node_src = re.compile(r'node/(?P<pk>\d+)')

    def april_link(tag):
        # BeautifulSoup filter: keep only anchors pointing at april.org.
        return tag.name == 'a' and aprilorg_url.match(
            tag.attrs.get('href', '')
        )

    soup = bs(html, 'html.parser')

    aprilorg_href = [
        a.attrs.get('href', '') for a in soup.find_all(april_link)
    ]

    # Safe to .group() here: april_link already guaranteed a match.
    aprilorg_uri = [
        aprilorg_url.match(href).group('uri') for href in aprilorg_href
    ]

    # A link may use either the alias (dst) or the internal path (src),
    # so match the scraped URIs against both columns.
    nodes_alias = UrlAlias.objects.filter(
        Q(src__in=aprilorg_uri) | Q(dst__in=aprilorg_uri)
    ).values_list('src', flat=True)

    # Skip aliases whose src is not 'node/<pk>' (e.g. taxonomy paths that
    # matched on src directly); calling .group() on a failed match would
    # raise AttributeError.
    nodes_pk = []
    for src in nodes_alias:
        match = node_src.match(src)
        if match:
            nodes_pk.append(int(match.group('pk')))

    return nodes_pk
|
||||||
|
|
||||||
|
|
||||||
class Command(BaseCommand):
|
class Command(BaseCommand):
|
||||||
help = "Import Drupal nodes to SPIP articles."
|
help = "Import Drupal nodes to SPIP articles."
|
||||||
|
|
||||||
@ -24,6 +71,12 @@ class Command(BaseCommand):
|
|||||||
type=int,
|
type=int,
|
||||||
help='Selects what nodes to be imported. Default is none.',
|
help='Selects what nodes to be imported. Default is none.',
|
||||||
)
|
)
|
||||||
|
parser.add_argument(
|
||||||
|
'--from-index',
|
||||||
|
type=validate_url,
|
||||||
|
help='Selects nodes to be imported by scrapping list at given URL. '
|
||||||
|
'Default is none.',
|
||||||
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
'--update',
|
'--update',
|
||||||
action='store_true',
|
action='store_true',
|
||||||
@ -41,6 +94,10 @@ class Command(BaseCommand):
|
|||||||
qs |= Node.objects.filter(pk__in=options['node'])
|
qs |= Node.objects.filter(pk__in=options['node'])
|
||||||
if options['user']:
|
if options['user']:
|
||||||
qs |= Node.objects.filter(user__name__in=options['user'])
|
qs |= Node.objects.filter(user__name__in=options['user'])
|
||||||
|
if options['from_index']:
|
||||||
|
qs |= Node.objects.filter(
|
||||||
|
pk__in=nodes_from_index(options['from_index'])
|
||||||
|
)
|
||||||
|
|
||||||
for n in qs:
|
for n in qs:
|
||||||
try:
|
try:
|
||||||
|
Loading…
Reference in New Issue
Block a user