feat(convert): ajoute les liens auto dans le filtrage html

This commit is contained in:
François Poulain 2020-08-08 18:24:27 +02:00
parent 7ce36668ea
commit e4816bc2ee

View File

@@ -118,7 +118,49 @@ def filter_html(html):
except Exception as e:
raise ValueError("Echec de auto_p: {}".format(e))
def auto_a(html):
    """Wrap bare e-mail addresses and URLs found in text nodes in <a> tags.

    The markup is parsed with ``html.parser``. Every text node matching the
    e-mail pattern, then every text node matching the URL pattern, has each
    match replaced by ``<a class="auto-a" href="...">match</a>``; e-mail
    links get a ``mailto:`` prefix in their ``href``. Text that already
    sits inside an ``<a>`` element (at any depth) is left untouched.

    Args:
        html: HTML markup to process.

    Returns:
        The serialized document (``str(soup)``) with the links added.
    """
    soup = bs(html, 'html.parser')

    def linkify(pattern, href_prefix=''):
        """Replace each match of *pattern* in the soup's text by an anchor."""
        # find_all() returns a list, so the tree can be edited while looping.
        for node in soup.find_all(string=pattern):
            # Look at every ancestor, not only the direct parent, so that
            # text such as <a><b>foo@bar.org</b></a> is not linked twice.
            if node.find_parent('a') is not None:
                continue
            text = str(node)
            cursor = 0
            # The anchors are built as nodes rather than as an HTML string
            # that is parsed again: the node text is already entity-decoded,
            # so re-parsing it would turn "&lt;script&gt;" into a real tag
            # and let a quote in a URL escape from the href attribute.
            for match in pattern.finditer(text):
                if match.start() > cursor:
                    node.insert_before(
                        soup.new_string(text[cursor:match.start()])
                    )
                target = match.group(1)  # group 1 spans the whole match
                anchor = soup.new_tag('a')
                anchor['class'] = 'auto-a'
                anchor['href'] = href_prefix + target
                anchor.string = target
                node.insert_before(anchor)
                cursor = match.end()
            if cursor < len(text):
                node.insert_before(soup.new_string(text[cursor:]))
            node.extract()

    email_pattern = re.compile(r'(\b[\w\.\+_-]+@(\w+\.)+\w+\b)')
    linkify(email_pattern, 'mailto:')

    protocols = [
        'http',
        'https',
        'ftp',
        'news',
        'nntp',
        'tel',
        'telnet',
        'mailto',
        'irc',
        'ssh',
        'sftp',
        'webcal',
        'rtsp',
    ]
    # NOTE(review): the scheme group is optional, so a bare "://host.tld"
    # is linked as well, and trailing punctuation is swallowed by [^\s]*
    # -- confirm this is intended before tightening the pattern.
    link_pattern = re.compile(
        r'((\b({})s?)?://(\w+\.)+\w+/?[^\s]*)'.format('|'.join(protocols))
    )
    linkify(link_pattern)

    return str(soup)
html = auto_p(html)
html = auto_a(html)
return html