From 8d6260d8a94e768df444bfdb201c064a412c175c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Fran=C3=A7ois=20Poulain?=
Date: Mon, 3 Aug 2020 21:09:02 +0200
Subject: [PATCH] =?UTF-8?q?feat(convert):=20devine=20les=20logos=20et=20le?=
 =?UTF-8?q?s=20r=C3=A9cup=C3=A8re?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 drupal2spip_lal/base/convert.py  | 61 +++++++++++++++++++++++++++++++-
 drupal2spip_lal/settings/base.py |  4 +++
 requirements/base.txt            |  1 +
 3 files changed, 65 insertions(+), 1 deletion(-)

diff --git a/drupal2spip_lal/base/convert.py b/drupal2spip_lal/base/convert.py
index 7e02a9b..88bbe18 100644
--- a/drupal2spip_lal/base/convert.py
+++ b/drupal2spip_lal/base/convert.py
@@ -1,9 +1,12 @@
+import os
 import re
 from datetime import datetime
 from itertools import groupby
 
-from django.utils.timezone import make_aware, now
+from django.conf import settings
+from django.utils.timezone import make_aware
 
+import requests
 from bs4 import BeautifulSoup as bs
 
 from drupal2spip_lal.drupal import models as drupal
@@ -50,6 +53,59 @@ def strong_to_dl(html):
     return '\n'.join(r)
 
 
+def fetch_and_remove_logo(article):
+    """
+    Fetch the first image of the summary as the article logo.
+
+    The first img tag of article.descriptif is downloaded into
+    settings.SPIP_LOGO_DIR (site-relative URLs are resolved against
+    settings.DRUPAL_FQDN), removed from the summary, and the article is
+    saved. Nothing happens without an image carrying a usable URL.
+    """
+
+    def fetch_logo(src):
+        """
+        SPIP handles logos in a hackish way: a file in IMG named
+        'arton{}.{}'.format(article.pk, ext)
+        """
+        ext = src.split('.')[-1]
+        filename = 'arton{}.{}'.format(article.pk, ext)
+        path = os.path.join(settings.SPIP_LOGO_DIR, filename)
+
+        # The timeout keeps one unreachable host from hanging the migration.
+        with requests.get(src, stream=True, timeout=30) as r:
+            # Never store an HTTP error page as if it were the logo.
+            r.raise_for_status()
+            with open(path, 'wb') as fd:
+                for chunk in r.iter_content(chunk_size=8192):
+                    fd.write(chunk)
+
+    def remove_img(img):
+        """Drop img, and its enclosing link when it holds nothing else."""
+        has_siblings = [
+            elem
+            for elem in list(img.previous_siblings) + list(img.next_siblings)
+            if elem != '\n'
+        ]
+        if img.parent.name == 'a' and not has_siblings:
+            img = img.parent
+        img.replace_with('')
+
+    soup = bs(article.descriptif, 'html.parser')
+    img = soup.find('img')
+
+    src = img and img.attrs.get('src', None)
+
+    if src and src.startswith('/'):
+        src = 'http://{}{}'.format(settings.DRUPAL_FQDN, src)
+
+    if src and re.match(r'^(https?)?://', src):
+        fetch_logo(src)
+        remove_img(img)
+        article.descriptif = soup.prettify(formatter="html5")
+        article.save()
+
+
 def sanitarize_html(html):
     html = strong_to_dl(html)
     soup = bs(html, 'html.parser')
@@ -116,6 +172,7 @@ def convert_node(node, update=False):
             'en_ligne': convert_timestamp(node.user.access),
             'maj': convert_timestamp(node.user.created),
         }
+
         auteur, _ = spip.Auteurs.objects.update_or_create(
             login=node.user.name, defaults=user_attributes
         )
@@ -123,3 +180,5 @@ def convert_node(node, update=False):
         spip.AuteursLiens.objects.update_or_create(
             auteur=auteur, id_objet=article.pk, objet='article'
         )
+
+        fetch_and_remove_logo(article)
diff --git a/drupal2spip_lal/settings/base.py b/drupal2spip_lal/settings/base.py
index 817d5ec..6d3100a 100644
--- a/drupal2spip_lal/settings/base.py
+++ b/drupal2spip_lal/settings/base.py
@@ -177,6 +177,8 @@ STATICFILES_FINDERS = [
 # https://docs.djangoproject.com/en/stable/ref/settings/#media-root
 MEDIA_ROOT = var_dir('media')
 
+SPIP_LOGO_DIR = env('SPIP_LOGO_DIR', default=MEDIA_ROOT)
+
 # https://docs.djangoproject.com/en/stable/ref/settings/#media-url
 MEDIA_URL = os.path.join(APP_LOCATION, 'media/')
 
@@ -249,3 +251,5 @@ CSRF_COOKIE_PATH = APP_LOCATION
 # ------------------------------------------------------------------------------
 # APPLICATION AND 3RD PARTY LIBRARY SETTINGS
 # ------------------------------------------------------------------------------
+
+DRUPAL_FQDN = env('DRUPAL_FQDN', default='www.april.org')
diff --git a/requirements/base.txt b/requirements/base.txt
index 4f842d5..fdf3dca 100644
--- a/requirements/base.txt
+++ b/requirements/base.txt
@@ -9,4 +9,5 @@ phpserialize
 
 # HTML
 # ------------------------------------------------------------------------------
+requests
 beautifulsoup4