From 578e02bb3e770e1bd8583642c399059d7a5a257f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Poulain?= Date: Sat, 8 Aug 2020 15:49:21 +0200 Subject: [PATCH] ref(convert): tentative de filtre html avec des regex --- drupal2spip_lal/base/convert.py | 44 +++++++++++++++++---------------- 1 file changed, 23 insertions(+), 21 deletions(-) diff --git a/drupal2spip_lal/base/convert.py b/drupal2spip_lal/base/convert.py index 4291b82..727ad4e 100644 --- a/drupal2spip_lal/base/convert.py +++ b/drupal2spip_lal/base/convert.py @@ -111,33 +111,33 @@ def fetch_and_remove_logo(article, force_download): def filter_html(html): + inline_elems = ['a', 'em', 'strong', 'cite', 'code', 'b', 'i'] + block_elems = ['p', 'dd', 'dt', 'dl', 'ul', 'ol', 'li', 'h2', 'h3', 'h4', 'img', 'audio', 'video'] + + def explicit_double_br(html): + double_br = re.compile(r'<\s*br\s*/?>\s*\n|\n\s*<\s*br\s*/?>') + return double_br.sub('\n\n', html) + def auto_p(html): re_paragraph = re.compile(r'\s*\n\s*\n\s*') - soup = bs(html, 'html.parser') - for string in soup.find_all(string=re_paragraph): - new_item = bs('', 'html.parser') - for substring in [s for s in re_paragraph.split(string) if s]: - p = new_item.new_tag('p') - p.string = substring.strip() - new_item.append(p) - string.replace_with(new_item) - return str(soup) + return re_paragraph.sub('

', html) def auto_br(html): re_break = re.compile(r'\s*\n\s*') + return re_break.sub('
', html) + + def remove_spare_br(html): soup = bs(html, 'html.parser') - for string in soup.find_all(string=re_break): - new_item = bs('', 'html.parser') - for index, substring in enumerate([s for s in re_break.split(string) if s]): - if index: - br = new_item.new_tag('br') - new_item.append(br) - new_item.append(substring.strip()) - string.replace_with(new_item) + # ou dernier premier élément d'un parent de type bloc + def spare_br(elem): + return elem.name == 'br' and getattr(elem.parent, 'name', None) in block_elems and not (elem.next_sibling and elem.previous_sibling) + [elem.clear() for elem in soup.find_all(spare_br)] return str(soup) + html = explicit_double_br(html) html = auto_p(html) html = auto_br(html) + html = remove_spare_br(html) return html @@ -146,12 +146,14 @@ def sanitarize_html(html, node_fmt): if node_fmt == 'PHP code': raise NotImplementedError("Ce node est au format PHP.") - if node_fmt == 'Filtered HTML': + elif node_fmt == 'Filtered HTML': html = filter_html(html) - if node_fmt == 'Filtered HTML': - html = str(bs(html, 'html.parser')) + elif node_fmt == 'Full HTML': + pass + else: + raise NotImplementedError("Ce node est dans un format inconnu.") - return html + return str(bs(html, 'html.parser')) def convert_node(node, options):