# Reconstructed from a whitespace-collapsed, tag-stripped copy of git patch
# 969e9bb1550ae460a14e1a27b4d1cb3be8e08ce2
# ("feat(convert): reconstruction des dl implicites", François Poulain,
# Mon, 3 Aug 2020) against drupal2spip_lal/base/convert.py.
# Only the definitions that the patch adds or modifies are reproduced here;
# the unchanged context of the target module (its datetime/django imports and
# convert_timestamp) is not.
import re
from itertools import groupby

# A line that opens with a <strong> run: the bold text is the term ("dt") and
# whatever follows on the same line is its definition ("dd").  ".+" is greedy,
# so when a line holds several </strong> the term extends to the last one.
# NOTE(review): the literal tag names were stripped from the copy this was
# rebuilt from; they are restored from the group names used below
# (match['dt'], match['dd']), the function name and its docstring -- confirm
# against the upstream commit.
_STRONG_ITEM = re.compile(r'<strong>(?P<dt>.+)</strong>(?P<dd>.*)$')


def strong_to_dl(html: str) -> str:
    """Rebuild implicit definition lists written as runs of <strong> lines.

    According to the original note, some source content (authored by
    Marie-Odile) fakes definition lists with one "<strong>term</strong>
    definition" line per entry.  Parsing HTML with regular expressions is
    wrong in general, but the list is rebuilt here on the raw text, before
    the HTML parser reflows the markup and the line-based shape is lost.

    The input is split on runs of CR/LF characters.  Every run of more than
    two consecutive lines that each start with a <strong> element is replaced
    by a <dl> block holding one <dt>/<dd> pair per line (both stripped of
    surrounding whitespace).  Shorter runs and all other lines are kept
    unchanged.

    Side effects of the line-based processing: line endings are normalised
    to "\\n" and consecutive line breaks collapse into one.

    :param html: raw HTML fragment.
    :return: the fragment with qualifying runs converted to <dl> markup,
        lines joined with "\\n".
    """
    # Match every line exactly once; the match object doubles as the
    # grouping key and as the source of the dt/dd captures.
    candidates = [
        (line, _STRONG_ITEM.match(line))
        for line in re.split(r'[\r\n]+', html)
    ]

    out = []
    for is_item, run in groupby(candidates, key=lambda pair: pair[1] is not None):
        run = list(run)
        # One or two bold-led lines in a row are treated as ordinary
        # emphasis; only longer runs are considered a list.
        if is_item and len(run) > 2:
            out.append('<dl>')
            for _, found in run:
                out.append('<dt>{}</dt>'.format(found.group('dt').strip()))
                out.append('<dd>{}</dd>'.format(found.group('dd').strip()))
            out.append('</dl>')
        else:
            out.extend(line for line, _ in run)
    return '\n'.join(out)


def sanitarize_html(html):
    """Normalise an HTML fragment and return it pretty-printed.

    Implicit <strong>-based definition lists are rebuilt first (see
    strong_to_dl), then the result is parsed with the 'html.parser' backend
    and re-serialised with prettify(formatter="html5").
    """
    # NOTE(review): the patch context does not show how `bs` is bound in the
    # target module.  The call shape (markup, 'html.parser') and
    # prettify(formatter=...) match BeautifulSoup, so it is imported here
    # under that assumption -- confirm against the module's real import.
    from bs4 import BeautifulSoup as bs

    html = strong_to_dl(html)
    soup = bs(html, 'html.parser')
    return soup.prettify(formatter="html5")