From 578e02bb3e770e1bd8583642c399059d7a5a257f Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Poulain?= <fpoulain@metrodore.fr>
Date: Sat, 8 Aug 2020 15:49:21 +0200
Subject: [PATCH] ref(convert): tentative de filtre html avec des regex

---
 drupal2spip_lal/base/convert.py | 44 +++++++++++++++++----------------
 1 file changed, 23 insertions(+), 21 deletions(-)
diff --git a/drupal2spip_lal/base/convert.py b/drupal2spip_lal/base/convert.py
index 4291b82..727ad4e 100644
--- a/drupal2spip_lal/base/convert.py
+++ b/drupal2spip_lal/base/convert.py
@@ -111,33 +111,33 @@ def fetch_and_remove_logo(article, force_download):
 
 
 def filter_html(html):
+    inline_elems = ['a', 'em', 'strong', 'cite', 'code', 'b', 'i']
+    block_elems = ['p', 'dd', 'dt', 'dl', 'ul', 'ol', 'li', 'h2', 'h3', 'h4', 'img', 'audio', 'video']
+
+    def explicit_double_br(html):
+        double_br = re.compile(r'<\s*br\s*/?>\s*\n|\n\s*<\s*br\s*/?>')
+        return double_br.sub('\n\n', html)
+
     def auto_p(html):
         re_paragraph = re.compile(r'\s*\n\s*\n\s*')
-        soup = bs(html, 'html.parser')
-        for string in soup.find_all(string=re_paragraph):
-            new_item = bs('', 'html.parser')
-            for substring in [s for s in re_paragraph.split(string) if s]:
-                p = new_item.new_tag('p')
-                p.string = substring.strip()
-                new_item.append(p)
-            string.replace_with(new_item)
-        return str(soup)
+        return re_paragraph.sub('</p><p class="auto-p">', html)
 
     def auto_br(html):
         re_break = re.compile(r'\s*\n\s*')
+        return re_break.sub('<br class="auto-br">', html)
+
+    def remove_spare_br(html):
         soup = bs(html, 'html.parser')
-        for string in soup.find_all(string=re_break):
-            new_item = bs('', 'html.parser')
-            for index, substring in enumerate([s for s in re_break.split(string) if s]):
-                if index:
-                    br = new_item.new_tag('br')
-                    new_item.append(br)
-                new_item.append(substring.strip())
-            string.replace_with(new_item)
+        # a <br> is spare when it is the first or last child of a block-level parent
+        def spare_br(elem):
+            return elem.name == 'br' and getattr(elem.parent, 'name', None) in block_elems and not (elem.next_sibling and elem.previous_sibling)
+        [elem.clear() for elem in soup.find_all(spare_br)]
         return str(soup)
 
+    html = explicit_double_br(html)
     html = auto_p(html)
     html = auto_br(html)
+    html = remove_spare_br(html)
     return html
 
 
@@ -146,12 +146,14 @@ def sanitarize_html(html, node_fmt):
 
     if node_fmt == 'PHP code':
         raise NotImplementedError("Ce node est au format PHP.")
-    if node_fmt == 'Filtered HTML':
+    elif node_fmt == 'Filtered HTML':
         html = filter_html(html)
-    if node_fmt == 'Filtered HTML':
-        html = str(bs(html, 'html.parser'))
+    elif node_fmt == 'Full HTML':
+        pass
+    else:
+        raise NotImplementedError("Ce node est dans un format inconnu.")
 
-    return html
+    return str(bs(html, 'html.parser'))
 
 
 def convert_node(node, options):