# Portions of this file are derived from Pleroma:
# Pleroma: A lightweight social networking server
# Copyright © 2017-2020 Pleroma Authors <https://pleroma.social/>
# SPDX-License-Identifier: AGPL-3.0-only

defmodule Mobilizon.Service.RichMedia.Parsers.Fallback do
  @moduledoc """
  Module to parse fallback data in HTML pages (plain old title and meta description)
  """

  require Logger

  @doc """
  Completes `data` with the `:title` and `:description` found in `html`.

  The title is the trimmed text of the first `<title>` element and the
  description is the trimmed `content` attribute of the first
  `<meta name="description">` element. Keys already present in `data` are
  never overwritten, and values that are missing or blank in the page are
  not added.

  Returns `{:ok, data}` when the resulting map holds at least one entry and
  `{:error, "Not even a title"}` when it is still empty.

  Raises whatever `Floki.parse_document!/1` raises when the document cannot
  be parsed.
  """
  @spec parse(String.t(), map()) :: {:ok, map()} | {:error, String.t()}
  def parse(html, data) do
    Logger.debug("Running Fallback parser")

    # Parse once and reuse the tree for every attribute lookup.
    document = Floki.parse_document!(html)

    data =
      data
      |> maybe_put(document, :title)
      |> maybe_put(document, :description)

    if Enum.empty?(data) do
      {:error, "Not even a title"}
    else
      {:ok, data}
    end
  end

  # Adds `attr` to `meta` when the page provides a non-empty value for it.
  # `Map.put_new/3` keeps any value a previous parser already stored.
  defp maybe_put(meta, document, attr) do
    case get_page(document, attr) do
      "" -> meta
      content -> Map.put_new(meta, attr, content)
    end
  end

  # Extracts the trimmed value of the requested attribute from the parsed
  # document, or "" when the page does not provide it.
  defp get_page(document, :title) do
    case Floki.find(document, "title") do
      # No <title> at all: report "nothing found" instead of handing nil
      # to Floki.text/1.
      [] -> ""
      [title | _] -> title |> Floki.text() |> String.trim()
    end
  end

  defp get_page(document, :description) do
    with [meta | _] <- Floki.find(document, "html meta[name='description']"),
         # A <meta name="description"> tag without a `content` attribute
         # yields an empty list here; it must not reach String.trim/1 as nil.
         [content | _] <- Floki.attribute(meta, "content") do
      String.trim(content)
    else
      [] -> ""
    end
  end
end