drupal2spip_lal/drupal2spip_lal/base/convert.py

import os
import re
from datetime import datetime
from itertools import groupby

from django.conf import settings
from django.utils.timezone import make_aware, now

import phpserialize
import request
from bs4 import BeautifulSoup as bs

from drupal2spip_lal.drupal import models as drupal
from drupal2spip_lal.spip import models as spip

# Questions
# - quelle utilisation des rubriques ?
# - quelle hiérarchie des mots clés ?
# - autobr sélectif ?
# - importer les drupaleries ? (fn, toc, etc.) + autobr selon le format
# - convertir en format SPIP ?


def convert_timestamp(timestamp):
    """
    Convert a POSIX timestamp into an aware datetime.

    The aware datetime is built directly in Django's current time zone
    instead of going through a naive, process-local datetime: a naive value
    is ambiguous during a DST fall-back hour and is only correct when the
    process time zone happens to equal Django's current one.

    :param timestamp: seconds since the Unix epoch (int or float).
    :return: timezone-aware ``datetime`` for that instant.
    """
    # Local import: this is the only place that needs the helper.
    from django.utils.timezone import get_current_timezone

    return datetime.fromtimestamp(timestamp, tz=get_current_timezone())


def strong_to_dl(html):
    """
    Rebuild definition lists written as runs of <strong>-prefixed lines.

    One author (Marie-Odile) writes dl-like structures with <strong>.
    Parsing HTML with regexes is bad practice, but the list is rebuilt here
    before BeautifulSoup alters the layout of the markup.

    The input is split on line breaks; every run of more than two
    consecutive lines starting with ``<strong>…</strong>`` becomes a
    ``<dl>`` whose ``<dt>`` is the strong content and whose ``<dd>`` is the
    rest of the line. Other lines are kept as they are. Lines are joined
    back with a single newline.
    """
    strong_line = re.compile(r'<strong>(?P<dt>.+)</strong>(?P<dd>.*)$')

    def starts_with_strong(line):
        return strong_line.match(line) is not None

    output = []
    lines = re.split(r'[\r\n]+', html)
    for is_strong_run, run in groupby(lines, key=starts_with_strong):
        run = list(run)
        # Short runs are more likely plain emphasis than a definition list.
        if not (is_strong_run and len(run) > 2):
            output.extend(run)
            continue

        output.append('<dl>')
        for line in run:
            found = strong_line.match(line)
            output.append('<dt>{}</dt>'.format(found.group('dt').strip()))
            output.append('<dd>{}</dd>'.format(found.group('dd').strip()))
        output.append('</dl>')

    return '\n'.join(output)


def fetch_and_remove_logo(article, force_download):
    """
    Turn the first image of the article teaser into the article's SPIP logo.

    The first <img> of ``article.descriptif`` is downloaded into
    ``settings.SPIP_LOGO_DIR`` and removed from the teaser. The same image is
    also removed from ``article.texte`` when it appears there. The article is
    saved only when an image with a usable URL was found; otherwise nothing
    is modified.

    :param article: object exposing ``pk``, ``descriptif``, ``texte`` and
        ``save()``.
    :param force_download: when true, download the file again even if a
        readable logo already exists.
    """
    from urllib.parse import urlsplit

    def fetch_logo(src):
        """
        SPIP handles logos through a naming hack: a file in IMG named
        'arton{}.{}'.format(article.pk, ext)
        """
        # Read the extension from the URL path only, so that a query string
        # or fragment does not end up in the file name.
        ext = urlsplit(src).path.split('.')[-1]
        filename = 'arton{}.{}'.format(article.pk, ext)
        path = os.path.join(settings.SPIP_LOGO_DIR, filename)

        if not os.access(path, os.R_OK) or force_download:
            # NOTE(review): the module is imported as `request`, but
            # `get(..., stream=True)` / `iter_content` is the `requests`
            # API; confirm the import.
            # NOTE(review): the HTTP status is not checked, so an error page
            # would be stored as the logo.
            r = request.get(src, stream=True)
            with open(path, 'wb') as fd:
                for chunk in r.iter_content(chunk_size=128):
                    fd.write(chunk)

    def remove_img(img):
        # Siblings other than bare newlines mean the parent holds more than
        # just this image.
        has_siblings = [
            elem
            for elem in list(img.previous_siblings) + list(img.next_siblings)
            if elem != '\n'
        ]
        if img.parent.name in ['a', 'p'] and not has_siblings:
            # Drop the wrapping link/paragraph too, it would be left empty.
            img.parent.replace_with('')
        else:
            img.replace_with('')

    soup = bs(article.descriptif, 'html.parser')
    img = soup.find('img')

    original_src = img and img.attrs.get('src', None)
    src = original_src

    # Site-relative paths are resolved against the Drupal host.
    if src and src.startswith('/'):
        src = 'http://{}{}'.format(settings.DRUPAL_FQDN, src)

    if src and re.match(r'^(https?)?://', src):
        fetch_logo(src)
        remove_img(img)
        article.descriptif = str(soup)

        # The image is usually repeated in the body with a different markup
        # (e.g. without the link to the article). Look it up under both the
        # src as written in the teaser and its absolute form: the body is
        # expected to carry the same, possibly relative, attribute value.
        soup = bs(article.texte, 'html.parser')
        img = soup.find('img', src=[original_src, src])
        if img:
            remove_img(img)
        article.texte = str(soup)

        article.save()


def sanitarize_html(html):
    """
    Normalise an HTML fragment coming from Drupal.

    Strong-based pseudo definition lists are rebuilt first, then the markup
    is parsed with BeautifulSoup's 'html.parser' and serialised back to a
    string.
    """
    return str(bs(strong_to_dl(html), 'html.parser'))


def _upsert_auteur(user):
    """Create or refresh the SPIP author matching a Drupal user; return it."""
    auteur, _ = spip.Auteurs.objects.update_or_create(
        login=user.name,
        defaults={
            'nom': user.name,
            'email': user.mail,
            'en_ligne': convert_timestamp(user.access),
            'maj': convert_timestamp(user.created),
        },
    )
    return auteur


def convert_node(node, options):
    """
    Import a Drupal node as a SPIP article.

    URLs are the functional entry point, so they are what is used to decide
    whether the node still has to be imported, is already imported, or has
    to be updated: the node's URL aliases are looked up among the SPIP
    article URLs.

    - No matching SPIP URL: the article and its URLs are created.
    - Matching URLs and ``options['update']`` is true: the article is
      overwritten from the node.
    - Matching URLs without ``update``: nothing is done.

    When an article was created or updated, its author, logo, keywords and
    revision history are imported as well.

    :param node: Drupal node to convert.
    :param options: mapping; the keys read are ``update`` and
        ``force_download`` (both default to ``False``).
    :raises NotImplementedError: when the SPIP URLs matching the node are
        inconsistent (not the same set as the aliases, or pointing to more
        than one object).
    """
    update = options.get('update', False)
    force_download = options.get('force_download', False)

    node_urls = drupal.UrlAlias.objects.filter(src='node/{}'.format(node.pk))

    spip_urls = spip.Urls.objects.filter(
        type='article', url__in=list(node_urls.values_list('dst', flat=True))
    )

    already_imported = spip_urls.exists()
    if already_imported and (
        sorted(spip_urls.values_list('url', flat=True))
        != sorted(node_urls.values_list('dst', flat=True))
        or len(set(spip_urls.values_list('id_objet', flat=True))) != 1
    ):
        # Inconsistent URLs.
        raise NotImplementedError

    published = node.published_revision
    article = None
    article_attributes = {
        'date': convert_timestamp(published.timestamp),
        'date_modif': convert_timestamp(node.changed),
        'date_redac': convert_timestamp(node.created),
        'descriptif': sanitarize_html(published.teaser),
        'maj': convert_timestamp(node.changed),
        'statut': 'publie' if node.status else 'prepa',
        'texte': sanitarize_html(published.body),
        'titre': node.title,
    }
    if not already_imported:
        article = spip.Articles.objects.create(**article_attributes)
        # NOTE(review): `type` is not set on the created URLs although the
        # lookup above filters on type='article'; presumably the model
        # defaults to 'article'. Confirm.
        urls = [
            spip.Urls(
                id_objet=article.pk,
                url=node_url.dst,
                date=convert_timestamp(node.created),
            )
            for node_url in node_urls
        ]
        spip.Urls.objects.bulk_create(urls)

        print('Article {} created from node {}.'.format(article.pk, node.pk))

    elif update:
        article = spip.Articles(
            pk=spip_urls.last().id_objet, **article_attributes
        )
        article.save()
        print('Article {} updated from node {}.'.format(article.pk, node.pk))

    if article:
        auteur = _upsert_auteur(node.user)

        spip.AuteursLiens.objects.get_or_create(
            auteur=auteur, id_objet=article.pk, objet='article'
        )

        fetch_and_remove_logo(article, force_download)

        # Keywords: one SPIP keyword group per Drupal theme, one keyword per
        # term, linked to the article.
        for term_node in node.termnode_set.all():
            groupe, _ = spip.GroupesMots.objects.get_or_create(
                titre=term_node.data.theme.name,
                descriptif=term_node.data.theme.description,
                texte=term_node.data.theme.help,
                defaults={'maj': now},
            )
            mot, _ = spip.Mots.objects.get_or_create(
                groupe=groupe,
                type=groupe.titre,
                titre=term_node.data.name,
                descriptif=term_node.data.description,
                defaults={'maj': now},
            )
            spip.MotsLiens.objects.get_or_create(
                mot=mot, id_objet=article.pk, objet='article'
            )

        # Revision history: one SPIP version (and fragment) per node
        # revision, numbered from 1 in chronological order.
        previous = None
        for numero, revision in enumerate(
            node.noderevisions_set.order_by('timestamp'), start=1
        ):
            auteur = _upsert_auteur(revision.user)

            if numero == 1:
                fragment = phpserialize.dumps({1: revision.body}).decode()
            else:
                # Later fragments carry the previous body next to the new one.
                fragment = phpserialize.dumps(
                    {1: previous.body, 2: revision.body}
                ).decode()

            spip.VersionsFragments.objects.update_or_create(
                id_fragment=numero,
                id_objet=article.pk,
                objet='article',
                version_min_id=numero,
                defaults={'fragment': fragment, 'version_max_id': numero},
            )

            spip.Versions.objects.update_or_create(
                id_version=numero,
                id_objet=article.pk,
                objet='article',
                defaults={
                    'auteur': auteur,
                    'titre_version': 'Version initiale'
                    if numero == 1 and not revision.log
                    else revision.log,
                    'date': convert_timestamp(revision.timestamp),
                    'permanent': '' if numero == 1 else 'non',
                    'champs': phpserialize.dumps({'texte': numero}).decode(),
                },
            )
            previous = revision