feat(convert): devine les logos et les récupère

This commit is contained in:
François Poulain 2020-08-03 21:09:02 +02:00
parent 969e9bb155
commit 8d6260d8a9
3 changed files with 53 additions and 1 deletions

View File

@ -1,9 +1,12 @@
import os
import re import re
from datetime import datetime from datetime import datetime
from itertools import groupby from itertools import groupby
from django.utils.timezone import make_aware, now from django.conf import settings
from django.utils.timezone import make_aware
import request
from bs4 import BeautifulSoup as bs from bs4 import BeautifulSoup as bs
from drupal2spip_lal.drupal import models as drupal from drupal2spip_lal.drupal import models as drupal
@ -50,6 +53,47 @@ def strong_to_dl(html):
return '\n'.join(r) return '\n'.join(r)
def fetch_and_remove_logo(article):
    """Turn the first image of an article summary into its SPIP logo.

    The first ``<img>`` found in ``article.descriptif`` is downloaded into
    ``settings.SPIP_LOGO_DIR``, then removed from the summary, and the
    article is saved. Nothing is changed when the summary is empty, holds no
    image, or when the image source is not an http(s) URL whose path has a
    file extension.

    Args:
        article: object exposing ``pk``, ``descriptif`` and ``save()``.

    Raises:
        OSError: if the download or the write to disk fails (this covers
            ``urllib.error.URLError`` and ``HTTPError``). The summary is
            left untouched in that case.
    """
    # Function-scope imports keep this block self-contained (stdlib only).
    import shutil
    from urllib.parse import urlsplit
    from urllib.request import urlopen

    def logo_extension(src):
        """Return the file extension of the URL path, without the dot."""
        # Only the path is inspected: splitting the whole URL on '.' would
        # pick up query strings, fragments or even the host name.
        return os.path.splitext(urlsplit(src).path)[1].lstrip('.')

    def fetch_logo(src, ext):
        """
        SPIP handles logos in a hackish way: a file in IMG named
        'arton{}.{}'.format(article.pk, ext)
        """
        filename = 'arton{}.{}'.format(article.pk, ext)
        path = os.path.join(settings.SPIP_LOGO_DIR, filename)
        # urlopen() raises on HTTP error statuses, so an error page is never
        # stored as a logo; the timeout avoids hanging on a dead host.
        with urlopen(src, timeout=30) as response:
            with open(path, 'wb') as fd:
                shutil.copyfileobj(response, fd)

    def remove_img(img):
        """Drop the image, and its enclosing link if it holds nothing else."""
        siblings = [
            elem
            for elem in list(img.previous_siblings) + list(img.next_siblings)
            # Whitespace-only text nodes are layout, not content.
            if not (isinstance(elem, str) and not elem.strip())
        ]
        if img.parent.name == 'a' and not siblings:
            img = img.parent
        img.replace_with('')

    if not article.descriptif:
        return

    soup = bs(article.descriptif, 'html.parser')
    img = soup.find('img')
    src = img.get('src') if img is not None else None
    if not src:
        return

    if src.startswith('//'):
        # Protocol-relative URL: it already names its own host.
        src = 'http:{}'.format(src)
    elif src.startswith('/'):
        src = 'http://{}{}'.format(settings.DRUPAL_FQDN, src)

    if not re.match(r'^https?://', src):
        return

    ext = logo_extension(src)
    if not ext:
        # Without an extension the 'arton<pk>.<ext>' name cannot be built.
        return

    fetch_logo(src, ext)
    # Only reached when the logo was written: the image is then dropped from
    # the summary and the article persisted.
    remove_img(img)
    article.descriptif = soup.prettify(formatter="html5")
    article.save()
def sanitarize_html(html): def sanitarize_html(html):
html = strong_to_dl(html) html = strong_to_dl(html)
soup = bs(html, 'html.parser') soup = bs(html, 'html.parser')
@ -116,6 +160,7 @@ def convert_node(node, update=False):
'en_ligne': convert_timestamp(node.user.access), 'en_ligne': convert_timestamp(node.user.access),
'maj': convert_timestamp(node.user.created), 'maj': convert_timestamp(node.user.created),
} }
auteur, _ = spip.Auteurs.objects.update_or_create( auteur, _ = spip.Auteurs.objects.update_or_create(
login=node.user.name, defaults=user_attributes login=node.user.name, defaults=user_attributes
) )
@ -123,3 +168,5 @@ def convert_node(node, update=False):
spip.AuteursLiens.objects.update_or_create( spip.AuteursLiens.objects.update_or_create(
auteur=auteur, id_objet=article.pk, objet='article' auteur=auteur, id_objet=article.pk, objet='article'
) )
fetch_and_remove_logo(article)

View File

@ -177,6 +177,8 @@ STATICFILES_FINDERS = [
# https://docs.djangoproject.com/en/stable/ref/settings/#media-root # https://docs.djangoproject.com/en/stable/ref/settings/#media-root
MEDIA_ROOT = var_dir('media') MEDIA_ROOT = var_dir('media')
# Directory receiving the article logo files ('arton<pk>.<ext>'); taken from
# env('SPIP_LOGO_DIR') and falling back to MEDIA_ROOT.
SPIP_LOGO_DIR = env('SPIP_LOGO_DIR', default=MEDIA_ROOT)
# https://docs.djangoproject.com/en/stable/ref/settings/#media-url # https://docs.djangoproject.com/en/stable/ref/settings/#media-url
MEDIA_URL = os.path.join(APP_LOCATION, 'media/') MEDIA_URL = os.path.join(APP_LOCATION, 'media/')
@ -249,3 +251,5 @@ CSRF_COOKIE_PATH = APP_LOCATION
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# APPLICATION AND 3RD PARTY LIBRARY SETTINGS # APPLICATION AND 3RD PARTY LIBRARY SETTINGS
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
# Host name used to make root-relative Drupal image URLs absolute; taken from
# env('DRUPAL_FQDN') with 'www.april.org' as the default.
DRUPAL_FQDN = env('DRUPAL_FQDN', default='www.april.org')

View File

@ -9,4 +9,5 @@ phpserialize
# HTML # HTML
# ------------------------------------------------------------------------------ # ------------------------------------------------------------------------------
request
beautifulsoup4 beautifulsoup4