ref(convert): tentative de filtre html avec des regex
This commit is contained in:
parent
c8a075fc7a
commit
578e02bb3e
|
@ -111,33 +111,33 @@ def fetch_and_remove_logo(article, force_download):
|
||||||
|
|
||||||
|
|
||||||
def filter_html(html):
|
def filter_html(html):
|
||||||
|
inline_elems = ['a', 'em', 'strong', 'cite', 'code', 'b', 'i']
|
||||||
|
block_elems = ['p', 'dd', 'dt', 'dl', 'ul', 'ol', 'li', 'h2', 'h3', 'h4', 'img', 'audio', 'video']
|
||||||
|
|
||||||
|
def explicit_double_br(html):
|
||||||
|
double_br = re.compile(r'<\s*br\s*/?>\s*\n|\n\s*<\s*br\s*/?>')
|
||||||
|
return double_br.sub('\n\n', html)
|
||||||
|
|
||||||
def auto_p(html):
|
def auto_p(html):
|
||||||
re_paragraph = re.compile(r'\s*\n\s*\n\s*')
|
re_paragraph = re.compile(r'\s*\n\s*\n\s*')
|
||||||
soup = bs(html, 'html.parser')
|
return re_paragraph.sub('</p><p class="auto-p">', html)
|
||||||
for string in soup.find_all(string=re_paragraph):
|
|
||||||
new_item = bs('', 'html.parser')
|
|
||||||
for substring in [s for s in re_paragraph.split(string) if s]:
|
|
||||||
p = new_item.new_tag('p')
|
|
||||||
p.string = substring.strip()
|
|
||||||
new_item.append(p)
|
|
||||||
string.replace_with(new_item)
|
|
||||||
return str(soup)
|
|
||||||
|
|
||||||
def auto_br(html):
|
def auto_br(html):
|
||||||
re_break = re.compile(r'\s*\n\s*')
|
re_break = re.compile(r'\s*\n\s*')
|
||||||
|
return re_break.sub('<br class="auto-br">', html)
|
||||||
|
|
||||||
|
def remove_spare_br(html):
|
||||||
soup = bs(html, 'html.parser')
|
soup = bs(html, 'html.parser')
|
||||||
for string in soup.find_all(string=re_break):
|
# ou dernier premier élément d'un parent de type bloc
|
||||||
new_item = bs('', 'html.parser')
|
def spare_br(elem):
|
||||||
for index, substring in enumerate([s for s in re_break.split(string) if s]):
|
return elem.name == 'br' and getattr(elem.parent, 'name', None) in block_elems and not (elem.next_sibling and elem.previous_sibling)
|
||||||
if index:
|
[elem.clear() for elem in soup.find_all(spare_br)]
|
||||||
br = new_item.new_tag('br')
|
|
||||||
new_item.append(br)
|
|
||||||
new_item.append(substring.strip())
|
|
||||||
string.replace_with(new_item)
|
|
||||||
return str(soup)
|
return str(soup)
|
||||||
|
|
||||||
|
html = explicit_double_br(html)
|
||||||
html = auto_p(html)
|
html = auto_p(html)
|
||||||
html = auto_br(html)
|
html = auto_br(html)
|
||||||
|
html = remove_spare_br(html)
|
||||||
return html
|
return html
|
||||||
|
|
||||||
|
|
||||||
|
@ -146,12 +146,14 @@ def sanitarize_html(html, node_fmt):
|
||||||
|
|
||||||
if node_fmt == 'PHP code':
|
if node_fmt == 'PHP code':
|
||||||
raise NotImplementedError("Ce node est au format PHP.")
|
raise NotImplementedError("Ce node est au format PHP.")
|
||||||
if node_fmt == 'Filtered HTML':
|
elif node_fmt == 'Filtered HTML':
|
||||||
html = filter_html(html)
|
html = filter_html(html)
|
||||||
if node_fmt == 'Filtered HTML':
|
elif node_fmt == 'Full HTML':
|
||||||
html = str(bs(html, 'html.parser'))
|
pass
|
||||||
|
else:
|
||||||
|
raise NotImplementedError("Ce node est dans un format inconnu.")
|
||||||
|
|
||||||
return html
|
return str(bs(html, 'html.parser'))
|
||||||
|
|
||||||
|
|
||||||
def convert_node(node, options):
|
def convert_node(node, options):
|
||||||
|
|
Loading…
Reference in New Issue