From c8a075fc7a49eef91a026e59dbfdf23fb4e612e8 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Poulain?=
Date: Sat, 8 Aug 2020 13:45:45 +0200
Subject: [PATCH] feat(convert): implante les filtres html autobr et autop de drupal

---
Review notes (free text after the three-dash line; not part of the commit):
 * sanitarize_html() tests `node_fmt == 'Filtered HTML'` twice in a row, so
   every other format now skips the BeautifulSoup round-trip that the removed
   code applied unconditionally. Presumably one of the two tests was meant
   for another format (e.g. 'Full HTML'); TODO confirm with the author.
 * Line breaks and indentation were reconstructed from a whitespace-collapsed
   copy, using the hunk line counts as the reference. The indentation of
   context lines (notably `article.save()`) is a best guess; if it does not
   match the tree, apply with `git apply --ignore-whitespace`.
 * The last hunk announces 7 old lines but only 5 are present here; its two
   trailing context lines (presumably blank) are missing; TODO restore.

 drupal2spip_lal/base/convert.py               | 55 +++++++++++++++++--
 .../base/management/commands/import.py        |  1 -
 2 files changed, 49 insertions(+), 7 deletions(-)

diff --git a/drupal2spip_lal/base/convert.py b/drupal2spip_lal/base/convert.py
index e00af0d..4291b82 100644
--- a/drupal2spip_lal/base/convert.py
+++ b/drupal2spip_lal/base/convert.py
@@ -110,10 +110,48 @@ def fetch_and_remove_logo(article, force_download):
         article.save()
 
 
-def sanitarize_html(html):
+def filter_html(html):
+    def auto_p(html):
+        re_paragraph = re.compile(r'\s*\n\s*\n\s*')
+        soup = bs(html, 'html.parser')
+        for string in soup.find_all(string=re_paragraph):
+            new_item = bs('', 'html.parser')
+            for substring in [s for s in re_paragraph.split(string) if s]:
+                p = new_item.new_tag('p')
+                p.string = substring.strip()
+                new_item.append(p)
+            string.replace_with(new_item)
+        return str(soup)
+
+    def auto_br(html):
+        re_break = re.compile(r'\s*\n\s*')
+        soup = bs(html, 'html.parser')
+        for string in soup.find_all(string=re_break):
+            new_item = bs('', 'html.parser')
+            for index, substring in enumerate([s for s in re_break.split(string) if s]):
+                if index:
+                    br = new_item.new_tag('br')
+                    new_item.append(br)
+                new_item.append(substring.strip())
+            string.replace_with(new_item)
+        return str(soup)
+
+    html = auto_p(html)
+    html = auto_br(html)
+    return html
+
+
+def sanitarize_html(html, node_fmt):
     html = strong_to_dl(html)
-    soup = bs(html, 'html.parser')
-    return str(soup)
+
+    if node_fmt == 'PHP code':
+        raise NotImplementedError("Ce node est au format PHP.")
+    if node_fmt == 'Filtered HTML':
+        html = filter_html(html)
+    if node_fmt == 'Filtered HTML':
+        html = str(bs(html, 'html.parser'))
+
+    return html
 
 
 def convert_node(node, options):
@@ -139,7 +177,8 @@ def convert_node(node, options):
         or len(set(spip_urls.values_list('id_objet', flat=True))) != 1
         or spip.Urls.objects.filter(
             type='article', id_objet=article_id
-        ).count() != spip_urls.count()
+        ).count()
+        != spip_urls.count()
     ):
         # incohérence dans les urls
         raise ValueError(
@@ -147,15 +186,19 @@ def convert_node(node, options):
             "de s'assurer qu'il s'agisse d'un import prééexistant."
         )
 
+    node_fmt = node.published_revision.format.name
+
     article = None
     article_attributes = {
         'date': convert_timestamp(node.published_revision.timestamp),
         'date_modif': convert_timestamp(node.changed),
         'date_redac': convert_timestamp(node.created),
-        'descriptif': sanitarize_html(node.published_revision.teaser),
+        'descriptif': sanitarize_html(
+            node.published_revision.teaser, node_fmt
+        ),
         'maj': convert_timestamp(node.changed),
         'statut': 'publie' if node.status else 'prepa',
-        'texte': sanitarize_html(node.published_revision.body),
+        'texte': sanitarize_html(node.published_revision.body, node_fmt),
         'titre': node.title,
     }
     if not spip_urls.exists():
diff --git a/drupal2spip_lal/base/management/commands/import.py b/drupal2spip_lal/base/management/commands/import.py
index dca7c59..5136b91 100644
--- a/drupal2spip_lal/base/management/commands/import.py
+++ b/drupal2spip_lal/base/management/commands/import.py
@@ -5,7 +5,6 @@ from django.core.management.base import BaseCommand
 from drupal2spip_lal.base.convert import convert_node
 from drupal2spip_lal.drupal.models import Node
 
-
 logger = logging.getLogger('drupal2spip_lal')