From e4816bc2eeff675d54aa84047a4a988d697949c9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Poulain?=
Date: Sat, 8 Aug 2020 18:24:27 +0200
Subject: [PATCH] feat(convert): ajoute les liens auto dans le filtrage html

---
 drupal2spip_lal/base/convert.py | 65 +++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)

diff --git a/drupal2spip_lal/base/convert.py b/drupal2spip_lal/base/convert.py
index b53c046..76aaa96 100644
--- a/drupal2spip_lal/base/convert.py
+++ b/drupal2spip_lal/base/convert.py
@@ -118,6 +118,71 @@ def filter_html(html):
         except Exception as e:
             raise ValueError("Echec de auto_p: {}".format(e))
 
+    def auto_a(html):
+        """Turn bare e-mail addresses and URLs in text nodes into links.
+
+        Text already located inside an ``<a>`` element is left untouched.
+
+        :param html: HTML fragment to process.
+        :return: the fragment serialised back to a string, links added.
+        """
+        # Function-scope import: the ``html`` argument hides the module name.
+        from html import escape
+
+        soup = bs(html, 'html.parser')
+
+        def linkify(pattern, href_prefix):
+            """Wrap every ``pattern`` match in an ``<a>`` element."""
+            for node in soup.find_all(string=pattern):
+                # Skip text anywhere below an anchor so links never nest.
+                if node.find_parent('a') is not None:
+                    continue
+                text = str(node)
+                chunks = []
+                cursor = 0
+                for match in pattern.finditer(text):
+                    target = match.group(1)
+                    before = text[cursor:match.start()]
+                    # The node text is re-parsed below, so everything that
+                    # is not our own markup must be escaped first.
+                    chunks.append(escape(before, quote=False))
+                    chunks.append(
+                        '<a href="{}">{}</a>'.format(
+                            escape(href_prefix + target),
+                            escape(target, quote=False),
+                        )
+                    )
+                    cursor = match.end()
+                chunks.append(escape(text[cursor:], quote=False))
+                node.replace_with(bs(''.join(chunks), 'html.parser'))
+
+        email_pattern = re.compile(r'(\b[\w\.\+_-]+@(\w+\.)+\w+\b)')
+        linkify(email_pattern, 'mailto:')
+
+        protocols = [
+            'http',
+            'https',
+            'ftp',
+            'news',
+            'nntp',
+            'tel',
+            'telnet',
+            'mailto',
+            'irc',
+            'ssh',
+            'sftp',
+            'webcal',
+            'rtsp',
+        ]
+        # NOTE(review): the scheme group is optional, so a bare
+        # "://host.tld" matches as well -- confirm this is intended.
+        link_pattern = re.compile(
+            r'((\b({})s?)?://(\w+\.)+\w+/?[^\s]*)'.format('|'.join(protocols))
+        )
+        linkify(link_pattern, '')
+        return str(soup)
+
     html = auto_p(html)
+    html = auto_a(html)
 
     return html