feat(convert): reconstruction des dl implicites

This commit is contained in:
François Poulain 2020-08-03 19:16:19 +02:00
parent 1456a62b25
commit 969e9bb155
1 changed files with 34 additions and 0 deletions

View File

@ -1,4 +1,6 @@
import re
from datetime import datetime
from itertools import groupby
from django.utils.timezone import make_aware, now
@ -17,7 +19,39 @@ def convert_timestamp(timestamp):
return make_aware(datetime.fromtimestamp(timestamp))
def strong_to_dl(html):
    """Rebuild implicit definition lists written as runs of <strong> lines.

    Some authors fake a definition list by writing consecutive lines of the
    form ``<strong>term</strong> description``. Parsing HTML with regexes is
    bad practice, but the line layout is the only clue available, so the
    ``<dl>`` is reconstructed here, before an HTML parser reshapes the text.

    The input is split on runs of CR/LF characters. Every run of more than
    two consecutive lines that start with a ``<strong>`` element is replaced
    by a ``<dl>`` block (one ``<dt>``/``<dd>`` pair per line, both stripped
    of surrounding whitespace). All other lines are kept as they are.

    :param html: raw HTML text.
    :return: the lines joined with ``\\n``, with qualifying runs rewritten.
    """
    # Non-greedy ``.+?`` so the term stops at the FIRST ``</strong>``; a
    # greedy match would swallow any later <strong> pair on the same line
    # and emit unbalanced markup inside the <dt>.
    strong_item = re.compile(r'<strong>(?P<dt>.+?)</strong>(?P<dd>.*)$')

    lines = re.split(r'[\r\n]+', html)
    # Match each line once and keep the match object next to its line.
    matched = [(line, strong_item.match(line)) for line in lines]

    out = []
    for is_item, run in groupby(matched, key=lambda pair: pair[1] is not None):
        run = list(run)
        # One or two bold-led lines are most likely plain emphasis, not a
        # list: only longer runs are converted.
        if is_item and len(run) > 2:
            out.append('<dl>')
            for _, found in run:
                out.append('<dt>{}</dt>'.format(found.group('dt').strip()))
                out.append('<dd>{}</dd>'.format(found.group('dd').strip()))
            out.append('</dl>')
        else:
            out.extend(line for line, _ in run)
    return '\n'.join(out)
def sanitarize_html(html):
    """Normalise an HTML fragment and return it pretty-printed.

    Implicit definition lists are first rebuilt with ``strong_to_dl``; the
    result is parsed by ``bs`` using the ``'html.parser'`` backend and
    rendered through ``prettify`` with the ``"html5"`` formatter.

    NOTE(review): ``bs`` is presumably an alias for BeautifulSoup; its
    import is outside the visible part of the file -- confirm.

    :param html: raw HTML text.
    :return: the prettified markup as a string.
    """
    rebuilt = strong_to_dl(html)
    return bs(rebuilt, 'html.parser').prettify(formatter="html5")