import re
from datetime import datetime
from itertools import groupby

# NOTE(review): the diff context for this change also showed
# `from django.utils.timezone import make_aware, now` and the tail of
# convert_timestamp(); both are untouched by this change and live in the
# surrounding module.

# One "<strong>term</strong> description" line.  The term is matched
# non-greedily so that a further <strong> inside the description stays in the
# description instead of being swallowed by the term.
_STRONG_ITEM_RE = re.compile(r'<strong>(?P<dt>.+?)</strong>(?P<dd>.*)$')

# A run of consecutive strong-lines is only rewritten when it has at least
# this many entries (the original condition was ``len(group) > 2``).
_MIN_DL_ITEMS = 3


def strong_to_dl(html):
    """Rebuild definition lists that were written as runs of <strong> lines.

    The author of the source content writes dl-like lists where each line
    starts with ``<strong>term</strong>`` followed by its description.
    Parsing HTML with regular expressions is bad practice, but the list is
    reconstructed here, line by line, before the HTML parser reshapes the
    markup.

    The input is split on runs of CR/LF characters.  Every run of at least
    three consecutive lines that start with a ``<strong>`` element becomes a
    ``<dl>`` holding one ``<dt>``/``<dd>`` pair per line (both stripped of
    surrounding whitespace).  All other lines are passed through unchanged.

    Args:
        html: the HTML fragment, as a string.

    Returns:
        The rewritten fragment, with lines joined by ``'\\n'``.  Because the
        split consumes whole runs of line breaks, blank lines in the input
        are not preserved.
    """
    lines = re.split(r'[\r\n]+', html)
    # Pair each line with its match object so the pattern runs only once per
    # line (grouping and rendering both need the result).
    matched = ((line, _STRONG_ITEM_RE.match(line)) for line in lines)

    out = []
    for is_item, run in groupby(matched, key=lambda pair: pair[1] is not None):
        run = list(run)
        if is_item and len(run) >= _MIN_DL_ITEMS:
            out.append('<dl>')
            for _, m in run:
                out.append('<dt>{}</dt>'.format(m.group('dt').strip()))
                out.append('<dd>{}</dd>'.format(m.group('dd').strip()))
            out.append('</dl>')
        else:
            # Too short to be a list, or not strong-lines at all: keep as is.
            out.extend(line for line, _ in run)
    return '\n'.join(out)


def sanitarize_html(html):
    """Normalise an HTML fragment and return it pretty-printed.

    The strong-based pseudo lists are first converted by ``strong_to_dl``;
    the result is then parsed with ``bs`` using the ``'html.parser'`` backend
    and returned via ``prettify(formatter="html5")``.
    """
    html = strong_to_dl(html)
    # NOTE(review): ``bs`` is defined outside the visible lines; presumably
    # it is BeautifulSoup - confirm against the module's imports.
    soup = bs(html, 'html.parser')
    return soup.prettify(formatter="html5")