feat(convert): reconstruction des dl implicites
This commit is contained in:
parent
1456a62b25
commit
969e9bb155
|
@ -1,4 +1,6 @@
|
||||||
|
import re
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
|
from itertools import groupby
|
||||||
|
|
||||||
from django.utils.timezone import make_aware, now
|
from django.utils.timezone import make_aware, now
|
||||||
|
|
||||||
|
@ -17,7 +19,39 @@ def convert_timestamp(timestamp):
|
||||||
return make_aware(datetime.fromtimestamp(timestamp))
|
return make_aware(datetime.fromtimestamp(timestamp))
|
||||||
|
|
||||||
|
|
||||||
|
def strong_to_dl(html):
    """
    Rebuild implicit definition lists written as runs of ``<strong>`` lines.

    Some authors write dl-like lists as consecutive lines of the form
    ``<strong>term</strong> definition``. Parsing HTML with regexes is bad
    practice, but the list has to be reconstructed here, before the HTML
    parser reformats the markup and the line structure is lost.

    The input is split on runs of CR/LF characters. Every run of more than
    two consecutive lines that start with a ``<strong>`` element becomes a
    ``<dl>`` block with one ``<dt>``/``<dd>`` pair per line; all other lines
    are kept unchanged. Lines are joined back with ``\\n``.

    :param html: the raw HTML fragment, as a string.
    :return: the fragment with implicit lists rewritten as ``<dl>`` markup.
    """
    # The term is matched non-greedily so it stops at the FIRST </strong>:
    # a second <strong> on the same line then stays inside the definition
    # instead of leaving unbalanced tags in the <dt>.
    strong_item = re.compile(r'<strong>(?P<dt>.+?)</strong>(?P<dd>.*)$')

    # Match every line once and carry the match object along with the line.
    matched_lines = [
        (line, strong_item.match(line))
        for line in re.split(r'[\r\n]+', html)
    ]

    result = []
    runs = groupby(matched_lines, key=lambda pair: pair[1] is not None)
    for is_item_run, run in runs:
        run = list(run)
        # Only runs of more than two items are treated as a list; shorter
        # runs are left as ordinary emphasised lines.
        if not (is_item_run and len(run) > 2):
            result.extend(line for line, _ in run)
            continue
        result.append('<dl>')
        for _, match in run:
            result.append('<dt>{}</dt>'.format(match.group('dt').strip()))
            result.append('<dd>{}</dd>'.format(match.group('dd').strip()))
        result.append('</dl>')
    return '\n'.join(result)
|
||||||
|
|
||||||
|
|
||||||
def sanitarize_html(html):
    """
    Rebuild implicit definition lists, then re-serialise the markup.

    The fragment first goes through ``strong_to_dl`` while its line
    structure is still intact, and is then parsed with the ``html.parser``
    backend and returned prettified with the ``html5`` formatter.
    """
    # NOTE(review): `bs` is presumably BeautifulSoup imported under an
    # alias elsewhere in this module -- confirm against the import block.
    rebuilt = strong_to_dl(html)
    return bs(rebuilt, 'html.parser').prettify(formatter="html5")
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue