import logging
import os
import re
from datetime import datetime
from itertools import groupby

from django.conf import settings
from django.utils.timezone import make_aware, now

import requests
from bs4 import BeautifulSoup as bs

from drupal2spip_lal.drupal import models as drupal
from drupal2spip_lal.spip import models as spip

# Questions
# - how should the rubriques be used?
# - what hierarchy for the keywords?
# - selective autobr?
# - import the Drupal-isms (fn, toc, etc.)? + autobr depending on the format
# - convert to the SPIP format?

logger = logging.getLogger('drupal2spip_lal')


def convert_timestamp(timestamp):
    return make_aware(datetime.fromtimestamp(timestamp))
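
# Drupal stores dates as Unix epoch seconds; convert_timestamp returns an
# aware datetime in Django's current timezone. Assuming the host and Django
# both use Europe/Paris time, convert_timestamp(1596466762) is
# 2020-08-03 16:59:22+02:00.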


def strong_to_dl(html):
    """
    Marie-Odile builds dl-like lists out of strong tags.

    Parsing html with regexes is evil, but we try to rebuild the dl
    before bs flattens the formatting.
    """
    is_strong = r'<strong>(?P<dt>.+)</strong>(?P<dd>.*)$'

    def is_strong_item(s):
        return bool(re.match(is_strong, s))

    items = re.split(r'[\r\n]+', html)
    grouped_items = groupby(items, key=is_strong_item)
    r = []
    for key, group in grouped_items:
        group = list(group)
        if key and len(group) > 2:
            dl = ['<dl class="strong_to_dl">']
            for elem in group:
                match = re.match(is_strong, elem).groupdict()
                dl += [
                    '<dt>{}</dt>'.format(match['dt'].strip()),
                    '<dd>{}</dd>'.format(match['dd'].strip()),
                ]
            dl.append('</dl>')
            r += dl
        else:
            r += group
    return '\n'.join(r)
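
# A minimal example of the rewrite, assuming a run of three or more
# consecutive <strong> lines (shorter runs are left untouched):
#
#     >>> strong_to_dl('<strong>a</strong> 1\n<strong>b</strong> 2\n<strong>c</strong> 3')
#     '<dl class="strong_to_dl">\n<dt>a</dt>\n<dd>1</dd>\n<dt>b</dt>\n<dd>2</dd>\n<dt>c</dt>\n<dd>3</dd>\n</dl>'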


def fetch_and_remove_logo(article, force_download):
    def fetch_logo(src):
        """
        SPIP handles logos as a hack: a file in IMG named
        'arton{}.{}'.format(article.pk, ext)
        """
        ext = src.split('.')[-1]
        filename = 'arton{}.{}'.format(article.pk, ext)
        path = os.path.join(settings.SPIP_LOGO_DIR, filename)

        if not os.access(path, os.R_OK) or force_download:
            r = requests.get(src, stream=True)
            with open(path, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=128):
                    fd.write(chunk)

    def remove_img(img):
        has_siblings = [
            elem
            for elem in list(img.previous_siblings) + list(img.next_siblings)
            if elem != '\n'
        ]
        if img.parent.name in ['a', 'p'] and not has_siblings:
            img.parent.replace_with('')
        else:
            img.replace_with('')

    soup = bs(article.descriptif, 'html.parser')
    img = soup.find('img')

    src = img and img.attrs.get('src', None)

    if src and src.startswith('/'):
        src = 'http://{}{}'.format(settings.DRUPAL_FQDN, src)

    if src and re.match(r'^(https?)?://', src):
        fetch_logo(src)
        remove_img(img)
        article.descriptif = str(soup)

        # The image is usually repeated in the body with a different
        # format (e.g. without the link to the article).
        soup = bs(article.texte, 'html.parser')
        img = soup.find('img', src=src)
        if img:
            remove_img(img)
            article.texte = str(soup)

        article.save()
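
# E.g. for article.pk == 42 and a src ending in '.png', the logo lands in
# '<SPIP_LOGO_DIR>/arton42.png'; SPIP then picks it up as the article's logo.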


def filter_html(html):
    inline_elems = ['a', 'em', 'strong', 'cite', 'code', 'b', 'i']
    block_elems = [
        'p', 'dd', 'dt', 'dl', 'ul', 'ol', 'li',
        'h2', 'h3', 'h4', 'img', 'audio', 'video',
    ]

    def explicit_double_br(html):
        double_br = re.compile(r'<\s*br\s*/?>\s*\n|\n\s*<\s*br\s*/?>')
        return double_br.sub('\n\n', html)

    def auto_p(html):
        re_paragraph = re.compile(r'\s*\n\s*\n\s*')
        return re_paragraph.sub('</p><p class="auto-p">', html)

    def auto_br(html):
        re_break = re.compile(r'\s*\n\s*')
        return re_break.sub('<br class="auto-br">', html)

    def remove_spare_br(html):
        soup = bs(html, 'html.parser')

        # A spare br is the first or last element of a block-level parent.
        def spare_br(elem):
            return (
                elem.name == 'br'
                and getattr(elem.parent, 'name', None) in block_elems
                and not (elem.next_sibling and elem.previous_sibling)
            )

        for elem in soup.find_all(spare_br):
            # extract() drops the tag itself; clear() would be a no-op on
            # a void element like br.
            elem.extract()

        return str(soup)

    html = explicit_double_br(html)
    html = auto_p(html)
    html = auto_br(html)
    html = remove_spare_br(html)
    return html
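
# The pipeline above, in short: a <br> adjacent to a newline is first
# normalized to a blank line, a blank line becomes a paragraph break
# (</p><p class="auto-p">), a remaining single newline becomes
# <br class="auto-br">, and a br left dangling at the edge of a block
# element is dropped.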


def sanitarize_html(html, node_fmt):
    html = strong_to_dl(html)

    if node_fmt == 'PHP code':
        raise NotImplementedError("This node is in PHP format.")
    elif node_fmt == 'Filtered HTML':
        html = filter_html(html)
    elif node_fmt == 'Full HTML':
        pass
    else:
        raise NotImplementedError("This node is in an unknown format.")

    return str(bs(html, 'html.parser'))
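
# Note: 'Filtered HTML', 'Full HTML' and 'PHP code' are the standard input
# format names on a stock Drupal site; sanitarize_html rejects anything else.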


def convert_node(node, options):
    """
    The Urls are the functional entry point: we rely on them to decide
    whether a node still has to be imported, is already there, or needs
    an upgrade.
    """
    update = options.get('update', False)
    force_download = options.get('force_download', False)

    node_urls = drupal.UrlAlias.objects.filter(src='node/{}'.format(node.pk))

    spip_urls = spip.Urls.objects.filter(
        type='article', url__in=list(node_urls.values_list('dst', flat=True))
    )
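
    # For instance, a Drupal alias UrlAlias(src='node/123', dst='blog/my-post')
    # matches a spip.Urls(type='article', url='blog/my-post') row when the
    # node has been imported before.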
    if spip_urls.exists():
        article_id = spip_urls.first().id_objet
        if (
            sorted(spip_urls.values_list('url', flat=True))
            != sorted(node_urls.values_list('dst', flat=True))
            or len(set(spip_urls.values_list('id_objet', flat=True))) != 1
            or spip.Urls.objects.filter(
                type='article', id_objet=article_id
            ).count()
            != spip_urls.count()
        ):
            # inconsistent urls
            raise ValueError(
                "This article already exists in SPIP but we cannot make "
                "sure it comes from a previous import."
            )

    node_fmt = node.published_revision.format.name

    article = None
    article_attributes = {
        'date': convert_timestamp(node.published_revision.timestamp),
        'date_modif': convert_timestamp(node.changed),
        'date_redac': convert_timestamp(node.created),
        'descriptif': sanitarize_html(
            node.published_revision.teaser, node_fmt
        ),
        'maj': convert_timestamp(node.changed),
        'statut': 'publie' if node.status else 'prepa',
        'texte': sanitarize_html(node.published_revision.body, node_fmt),
        'titre': node.title,
    }
    if not spip_urls.exists():
        article = spip.Articles.objects.create(**article_attributes)
        urls = [
            spip.Urls(
                id_objet=article.pk,
                url=node_url.dst,
                date=convert_timestamp(node.created),
            )
            for node_url in node_urls
        ]
        spip.Urls.objects.bulk_create(urls)

        logger.info(
            'Article {} created from node {}.'.format(article.pk, node.pk)
        )

    elif update:
        article = spip.Articles(
            pk=spip_urls.last().id_objet, **article_attributes
        )
        article.save()
        logger.info(
            'Article {} updated from node {}.'.format(article.pk, node.pk)
        )

    if article:
        user_attributes = {
            'nom': node.user.name,
            'email': node.user.mail,
            'en_ligne': convert_timestamp(node.user.access),
            'maj': convert_timestamp(node.user.created),
        }

        auteur, _ = spip.Auteurs.objects.update_or_create(
            login=node.user.name, defaults=user_attributes
        )

        spip.AuteursLiens.objects.get_or_create(
            auteur=auteur, id_objet=article.pk, objet='article'
        )

        fetch_and_remove_logo(article, force_download)

        for term_node in node.termnode_set.all():
            groupe, _ = spip.GroupesMots.objects.get_or_create(
                titre=term_node.data.theme.name,
                descriptif=term_node.data.theme.description,
                texte=term_node.data.theme.help,
                # now stays a callable: get_or_create evaluates callable
                # defaults lazily (Django >= 3.0).
                defaults={'maj': now},
            )
            mot, _ = spip.Mots.objects.get_or_create(
                groupe=groupe,
                type=groupe.titre,
                titre=term_node.data.name,
                descriptif=term_node.data.description,
                defaults={'maj': now},
            )
            spip.MotsLiens.objects.get_or_create(
                mot=mot, id_objet=article.pk, objet='article'
            )
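
# A minimal usage sketch (hypothetical entry point, e.g. from a management
# command; drupal.Node is assumed to be the node model):
#
#     for node in drupal.Node.objects.filter(status=1):
#         convert_node(node, {'update': True, 'force_download': False})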