import os
import re
from datetime import datetime
from itertools import groupby

from django.conf import settings
from django.utils.timezone import make_aware

# BUGFIX: the HTTP library is named 'requests', not 'request'.
import requests
from bs4 import BeautifulSoup as bs

from drupal2spip_lal.drupal import models as drupal
from drupal2spip_lal.spip import models as spip

# Open questions:
# - how should SPIP rubriques (sections) be used?
# - what hierarchy for keywords?
# - selective autobr?


def convert_timestamp(timestamp):
    """Return a timezone-aware datetime built from a UNIX timestamp."""
    return make_aware(datetime.fromtimestamp(timestamp))


def strong_to_dl(html):
    """
    Marie-Odile writes dl-like lists using <strong> tags.

    Parsing HTML with regexes is evil, but we try to rebuild the <dl>
    before BeautifulSoup normalises the markup away.
    """
    # A "dl item" is a line of the form: <strong>term</strong> definition
    is_strong = r'<strong>(?P<dt>.+)</strong>(?P<dd>.*)$'

    def is_strong_item(s):
        return bool(re.match(is_strong, s))

    items = re.split(r'[\r\n]+', html)
    grouped_items = groupby(items, key=is_strong_item)
    r = []
    for key, group in grouped_items:
        group = list(group)
        # Only rebuild a <dl> for runs of at least three strong items;
        # shorter runs are kept verbatim.
        if key and len(group) > 2:
            dl = ['<dl>']
            for elem in group:
                match = re.match(is_strong, elem).groupdict()
                dl += [
                    '<dt>{}</dt>'.format(match['dt'].strip()),
                    '<dd>{}</dd>'.format(match['dd'].strip()),
                ]
            dl.append('</dl>')
            r += dl
        else:
            r += group
    return '\n'.join(r)


def fetch_and_remove_logo(article):
    """
    Download the first <img> of the article's descriptif as the SPIP
    logo, then strip the image from the descriptif and save the article.
    """

    def fetch_logo(src):
        """
        SPIP handles logos as a hack: a file in the IMG directory named
        'arton{}.{}'.format(article.pk, ext).
        """
        ext = src.split('.')[-1]
        filename = 'arton{}.{}'.format(article.pk, ext)
        path = os.path.join(settings.SPIP_LOGO_DIR, filename)
        # BUGFIX: was 'request.get' — the library is 'requests'.
        r = requests.get(src, stream=True)
        with open(path, 'wb') as fd:
            for chunk in r.iter_content(chunk_size=128):
                fd.write(chunk)

    def remove_img(img):
        # If the image is the sole content of a link, drop the link too.
        has_siblings = [
            elem
            for elem in list(img.previous_siblings) + list(img.next_siblings)
            if elem != '\n'
        ]
        if img.parent.name == 'a' and not has_siblings:
            img = img.parent
        img.replace_with('')

    soup = bs(article.descriptif, 'html.parser')
    img = soup.find('img')
    src = img and img.attrs.get('src', None)
    if src and src.startswith('/'):
        # Relative Drupal path: make it absolute.
        src = 'http://{}{}'.format(settings.DRUPAL_FQDN, src)
    if src and re.match(r'^(https?)?://', src):
        # NOTE(review): the article is only modified and saved when a
        # logo was actually fetched — confirm this matches intent.
        fetch_logo(src)
        remove_img(img)
        article.descriptif = soup.prettify(formatter="html5")
        article.save()


def sanitarize_html(html):
    """Rebuild pseudo definition lists, then normalise the markup."""
    html = strong_to_dl(html)
    soup = bs(html, 'html.parser')
    return soup.prettify(formatter="html5")


def convert_node(node, update=False):
    """
    Urls are the functional entry point: they tell whether a node has
    already been imported, and thus whether to create, skip or update
    the corresponding SPIP article.
    """
    node_urls = drupal.UrlAlias.objects.filter(src='node/{}'.format(node.pk))
    spip_urls = spip.Urls.objects.filter(
        type='article',
        url__in=list(node_urls.values_list('dst', flat=True)),
    )

    if spip_urls.exists() and (
        sorted(spip_urls.values_list('url', flat=True))
        != sorted(node_urls.values_list('dst', flat=True))
        or len(set(spip_urls.values_list('id_objet', flat=True))) != 1
    ):
        # Inconsistent URL state between Drupal and SPIP: either the two
        # URL sets differ, or the SPIP urls point at several articles.
        raise NotImplementedError

    article = None
    article_attributes = {
        'date': convert_timestamp(node.published_revision.timestamp),
        'date_modif': convert_timestamp(node.changed),
        'date_redac': convert_timestamp(node.created),
        'descriptif': sanitarize_html(node.published_revision.teaser),
        'maj': convert_timestamp(node.changed),
        'statut': 'publie' if node.status else 'prepa',
        'texte': sanitarize_html(node.published_revision.body),
        'titre': node.title,
    }

    if not spip_urls.exists():
        # First import: create the article and all its aliases.
        article = spip.Articles.objects.create(**article_attributes)
        urls = [
            spip.Urls(
                id_objet=article.pk,
                url=node_url.dst,
                date=convert_timestamp(node.created),
            )
            for node_url in node_urls
        ]
        spip.Urls.objects.bulk_create(urls)
        print('Article {} created from node {}.'.format(article.pk, node.pk))
    elif update:
        # Already imported: overwrite the existing article in place.
        article = spip.Articles(
            pk=spip_urls.last().id_objet, **article_attributes
        )
        article.save()
        print('Article {} updated from node {}.'.format(article.pk, node.pk))

    if article:
        user_attributes = {
            'nom': node.user.name,
            'email': node.user.mail,
            'en_ligne': convert_timestamp(node.user.access),
            'maj': convert_timestamp(node.user.created),
        }
        auteur, _ = spip.Auteurs.objects.update_or_create(
            login=node.user.name, defaults=user_attributes
        )
        spip.AuteursLiens.objects.update_or_create(
            auteur=auteur, id_objet=article.pk, objet='article'
        )
        fetch_and_remove_logo(article)