feat(convert): reconstruction des dl implicites
This commit is contained in:
parent
1456a62b25
commit
969e9bb155
@ -1,4 +1,6 @@
|
||||
import re
|
||||
from datetime import datetime
|
||||
from itertools import groupby
|
||||
|
||||
from django.utils.timezone import make_aware, now
|
||||
|
||||
@ -17,7 +19,39 @@ def convert_timestamp(timestamp):
|
||||
return make_aware(datetime.fromtimestamp(timestamp))
|
||||
|
||||
|
||||
def strong_to_dl(html):
    """
    Rebuild implicit definition lists written as runs of ``<strong>`` lines.

    Some content fakes a ``<dl>`` with consecutive lines that each start
    with a ``<strong>`` label. Parsing HTML with regexes is bad practice,
    but the list is reconstructed here, line by line, before bs breaks
    the line layout.

    The input is split on line breaks. Every run of more than two
    consecutive lines starting with ``<strong>label</strong>`` becomes a
    single ``<dl>``: the label is the ``<dt>`` and the rest of the line is
    the ``<dd>`` (both stripped of surrounding whitespace). Shorter runs
    and all other lines are kept unchanged.

    :param html: HTML fragment as a string.
    :return: the lines joined with a newline, qualifying runs rewritten.
    """
    # Non-greedy label: with a greedy ``.+`` a line holding several
    # <strong> elements would swallow everything up to the LAST </strong>
    # into the <dt>, leaving unbalanced tags inside it.
    strong_item = re.compile(r'<strong>(?P<dt>.+?)</strong>(?P<dd>.*)$')

    lines = re.split(r'[\r\n]+', html)
    # Match each line once; the match object doubles as the grouping flag.
    matches = [strong_item.match(line) for line in lines]

    result = []
    runs = groupby(zip(lines, matches), key=lambda pair: pair[1] is not None)
    for is_strong, run in runs:
        run = list(run)
        if is_strong and len(run) > 2:
            result.append('<dl>')
            for _, match in run:
                result.append('<dt>{}</dt>'.format(match.group('dt').strip()))
                result.append('<dd>{}</dd>'.format(match.group('dd').strip()))
            result.append('</dl>')
        else:
            # Too short to be a list, or not strong lines at all: keep as is.
            result.extend(line for line, _ in run)
    return '\n'.join(result)
|
||||
|
||||
|
||||
def sanitarize_html(html):
    """
    Normalize an HTML fragment.

    The fragment first goes through ``strong_to_dl``, then is parsed by
    ``bs`` with the ``html.parser`` backend (presumably BeautifulSoup;
    the import is not visible here) and returned as the result of
    ``prettify`` with the ``html5`` formatter.

    :param html: HTML fragment as a string.
    :return: the value returned by ``prettify``.
    """
    with_dl = strong_to_dl(html)
    return bs(with_dl, 'html.parser').prettify(formatter="html5")
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user