feat(convert): implante les filtres html autobr et autop de drupal

This commit is contained in:
François Poulain 2020-08-08 13:45:45 +02:00
parent cc923c59a2
commit c8a075fc7a
2 changed files with 49 additions and 7 deletions

View File

@ -110,10 +110,48 @@ def fetch_and_remove_logo(article, force_download):
article.save()
def sanitarize_html(html):
def filter_html(html):
    """Turn plain-text line breaks of an HTML fragment into markup.

    Modelled on Drupal's "autop" and "autobr" filters: blank-line separated
    chunks of text become ``<p>`` elements, then the remaining single line
    breaks become ``<br>`` elements.

    Args:
        html: HTML markup, as a string.

    Returns:
        The markup re-serialised by the HTML parser after both passes.
    """

    def auto_p(html):
        """Wrap text chunks separated by a blank line in ``<p>`` tags."""
        re_paragraph = re.compile(r'\s*\n\s*\n\s*')
        soup = bs(html, 'html.parser')
        for string in soup.find_all(string=re_paragraph):
            new_item = bs('', 'html.parser')
            # Empty chunks appear when the text node starts or ends with a
            # blank line; they must not produce empty paragraphs.
            for substring in [s for s in re_paragraph.split(string) if s]:
                p = new_item.new_tag('p')
                p.string = substring.strip()
                new_item.append(p)
            string.replace_with(new_item)
        return str(soup)

    def auto_br(html):
        """Replace line breaks found inside text nodes by ``<br>`` tags."""
        re_break = re.compile(r'\s*\n\s*')
        soup = bs(html, 'html.parser')
        for string in soup.find_all(string=re_break):
            new_item = bs('', 'html.parser')
            # Whitespace-only nodes (e.g. a newline between two block tags)
            # split into empty chunks only, hence are simply removed.
            segments = [s for s in re_break.split(string) if s]
            for index, substring in enumerate(segments):
                if index:
                    new_item.append(new_item.new_tag('br'))
                # The split already swallowed the whitespace around every
                # line break. What is left can only sit at the outer edges of
                # the text node, where it separates the text from sibling
                # inline elements ("word <b>bold</b>"), so it must be kept.
                new_item.append(substring)
            string.replace_with(new_item)
        return str(soup)

    # Paragraphs first: auto_br would otherwise consume the blank lines.
    return auto_br(auto_p(html))
def sanitarize_html(html, node_fmt):
    """Clean up a node's HTML according to the node's input format.

    The markup always goes through ``strong_to_dl``; further processing
    depends on ``node_fmt``.

    Args:
        html: markup to clean up, as a string.
        node_fmt: name of the input format of the node.

    Returns:
        The processed markup, as a string.

    Raises:
        NotImplementedError: if ``node_fmt`` is ``'PHP code'``.
    """
    html = strong_to_dl(html)

    if node_fmt == 'PHP code':
        raise NotImplementedError("Ce node est au format PHP.")

    if node_fmt == 'Filtered HTML':
        # NOTE(review): the original tested 'Filtered HTML' twice in a row,
        # once per statement below; the second test may have been meant for
        # another format name -- confirm.
        html = filter_html(html)
        html = str(bs(html, 'html.parser'))

    return html
def convert_node(node, options):
@ -139,7 +177,8 @@ def convert_node(node, options):
or len(set(spip_urls.values_list('id_objet', flat=True))) != 1
or spip.Urls.objects.filter(
type='article', id_objet=article_id
).count() != spip_urls.count()
).count()
!= spip_urls.count()
):
# incohérence dans les urls
raise ValueError(
@ -147,15 +186,19 @@ def convert_node(node, options):
"de s'assurer qu'il s'agisse d'un import prééexistant."
)
node_fmt = node.published_revision.format.name
article = None
article_attributes = {
'date': convert_timestamp(node.published_revision.timestamp),
'date_modif': convert_timestamp(node.changed),
'date_redac': convert_timestamp(node.created),
'descriptif': sanitarize_html(node.published_revision.teaser),
'descriptif': sanitarize_html(
node.published_revision.teaser, node_fmt
),
'maj': convert_timestamp(node.changed),
'statut': 'publie' if node.status else 'prepa',
'texte': sanitarize_html(node.published_revision.body),
'texte': sanitarize_html(node.published_revision.body, node_fmt),
'titre': node.title,
}
if not spip_urls.exists():

View File

@ -5,7 +5,6 @@ from django.core.management.base import BaseCommand
from drupal2spip_lal.base.convert import convert_node
from drupal2spip_lal.drupal.models import Node
logger = logging.getLogger('drupal2spip_lal')