From fbe5a8d0c4ba0996bef595275b5313ae0e1c3dfd Mon Sep 17 00:00:00 2001
From: Thomas Citharel
Date: Tue, 18 Jan 2022 12:47:45 +0100
Subject: [PATCH] Detect and convert html body in the correct charset before
 parsing it

Signed-off-by: Thomas Citharel
---
 config/config.exs                |  6 +++
 lib/service/rich_media/parser.ex | 97 ++++++++++++++++++++++++++++++++
 mix.exs                          |  1 +
 mix.lock                         |  1 +
 4 files changed, 105 insertions(+)

diff --git a/config/config.exs b/config/config.exs
index 10a20f43f..44d0666c2 100644
--- a/config/config.exs
+++ b/config/config.exs
@@ -185,6 +185,12 @@ config :phoenix, :filter_parameters, ["password", "token"]
 config :absinthe, schema: Mobilizon.GraphQL.Schema
 config :absinthe, Absinthe.Logger, filter_variables: ["token", "password", "secret"]
 
+config :codepagex, :encodings, [
+  :ascii,
+  ~r[iso8859]i,
+  :"VENDORS/MICSFT/WINDOWS/CP1252"
+]
+
 config :mobilizon, Mobilizon.Web.Gettext, split_module_by: [:locale, :domain]
 
 config :ex_cldr,
diff --git a/lib/service/rich_media/parser.ex b/lib/service/rich_media/parser.ex
index 0a6e3691c..ae735901d 100644
--- a/lib/service/rich_media/parser.ex
+++ b/lib/service/rich_media/parser.ex
@@ -74,6 +74,7 @@ defmodule Mobilizon.Service.RichMedia.Parser do
            {:is_html, _response_headers, true} <-
              {:is_html, response_headers, is_html(response_headers)} do
         body
+        |> convert_utf8(response_headers)
         |> maybe_parse()
         |> Map.put(:url, url)
         |> maybe_add_favicon()
@@ -317,4 +318,100 @@ defmodule Mobilizon.Service.RichMedia.Parser do
   defp default_user_agent(_url) do
     Config.instance_user_agent()
   end
+
+  # Converts the fetched body to UTF-8, using the charset announced in the
+  # Content-Type response header or, failing that, in an HTML <meta> element.
+  defp convert_utf8(body, headers) do
+    headers
+    |> get_header("Content-Type")
+    |> handle_charset(body)
+  end
+
+  # Without a usable Content-Type header, fall back to the charset declared in the markup.
+  defp handle_charset(nil, body) do
+    case detect_charset_from_meta(body) do
+      "" -> body
+      nil -> body
+      charset -> convert_body(body, charset)
+    end
+  end
+
+  defp handle_charset(content_type, body) do
+    case charset_from_content_type(content_type) do
+      nil -> handle_charset(nil, body)
+      charset -> convert_body(body, charset)
+    end
+  end
+
+  # Extracts the `charset` parameter from a Content-Type value, or nil when there is none.
+  defp charset_from_content_type(content_type) do
+    with [_, params] <- :binary.split(content_type, ";"),
+         %{"charset" => charset} <- Utils.params(params) do
+      charset
+    else
+      _ -> nil
+    end
+  end
+
+  # Looks for a charset in the document's <meta http-equiv="content-type"> element.
+  defp detect_charset_from_meta(body) do
+    Logger.debug("Trying to detect charset from meta")
+
+    document = Floki.parse_document!(body)
+
+    case document
+         |> Floki.find("meta[http-equiv=\"content-type\"]")
+         |> List.first() do
+      nil ->
+        case document
+             |> Floki.find("meta[http-equiv=\"Content-Type\"]")
+             |> List.first() do
+          nil -> nil
+          meta -> content_type_from_meta(meta)
+        end
+
+      meta ->
+        content_type_from_meta(meta)
+    end
+  end
+
+  defp content_type_from_meta(meta) do
+    Logger.debug("Finding content-type into element")
+
+    # A <meta> element without a `content` attribute yields an empty list;
+    # treat it as "no charset" instead of crashing in String.trim/1 on nil.
+    case Floki.attribute(meta, "content") do
+      [content | _] -> content |> String.trim() |> charset_from_content_type()
+      [] -> nil
+    end
+  end
+
+  # Charset names are case-insensitive ("UTF-8", "utf-8"), so normalize before matching.
+  defp convert_body(body, charset) do
+    case charset |> String.trim() |> String.downcase() do
+      "utf-8" -> body
+      normalized -> transcode_body(body, normalized)
+    end
+  end
+
+  # Best effort: a charset Codepagex does not know, or bytes that are invalid for it,
+  # leave the body untouched rather than aborting the whole preview.
+  defp transcode_body(body, charset) do
+    Logger.debug("Converting body from #{charset}")
+
+    case Codepagex.to_string(body, fix_charset(charset)) do
+      {:ok, converted} ->
+        converted
+
+      {:error, reason} ->
+        Logger.debug("Could not convert body from #{charset}: #{inspect(reason)}")
+        body
+    end
+  end
+
+  defp fix_charset("windows-1252"), do: :"VENDORS/MICSFT/WINDOWS/CP1252"
+
+  defp fix_charset(charset) do
+    String.replace(charset, "-", "_")
+  end
 end
diff --git a/mix.exs b/mix.exs
index 144cc27af..ec1fd2136 100644
--- a/mix.exs
+++ b/mix.exs
@@ -203,6 +203,7 @@ defmodule Mobilizon.Mixfile do
       {:export, "~> 0.1.0"},
       {:tz_world, "~> 1.0"},
       {:tzdata, "~> 1.1"},
+      {:codepagex, "~> 0.1.6"},
       # Dev and test dependencies
       {:phoenix_live_reload, "~> 1.2", only: [:dev, :e2e]},
       {:ex_machina, "~> 2.3", only: [:dev, :test]},
diff --git a/mix.lock b/mix.lock
index 6979c28c7..24272a2b6 100644
--- a/mix.lock
+++ b/mix.lock
@@ -11,6 +11,7 @@
   "cachex": {:hex, :cachex, "3.4.0", "868b2959ea4aeb328c6b60ff66c8d5123c083466ad3c33d3d8b5f142e13101fb", [:mix], [{:eternal, "~> 1.2", [hex: :eternal, repo: "hexpm", optional: false]}, {:jumper, "~> 1.0", [hex: :jumper, repo: "hexpm", optional: false]}, {:sleeplocks, "~> 1.1", [hex: :sleeplocks, repo: "hexpm", optional: false]}, {:unsafe, "~> 1.0", [hex: :unsafe, repo: "hexpm", optional: false]}], "hexpm", "370123b1ab4fba4d2965fb18f87fd758325709787c8c5fce35b3fe80645ccbe5"},
   "certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
   "cldr_utils": {:hex, :cldr_utils, "2.17.0", "05453797e5b89f936c54c5602ac881e46b1ba4423a803c27a414466f4b598c94", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6077ddaaa155f27755638225617bdc00c004f39b3c9355b688e52a3fc98d57e8"},
+  "codepagex": {:hex, :codepagex, "0.1.6", "49110d09a25ee336a983281a48ef883da4c6190481e0b063afe2db481af6117e", [:mix], [], "hexpm", "1521461097dde281edf084062f525a4edc6a5e49f4fd1f5ec41c9c4955d5bd59"},
   "combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm", "1b1dbc1790073076580d0d1d64e42eae2366583e7aecd455d1215b0d16f2451b"},
   "comeonin": {:hex, :comeonin, "5.3.2", "5c2f893d05c56ae3f5e24c1b983c2d5dfb88c6d979c9287a76a7feb1e1d8d646", [:mix], [], "hexpm", "d0993402844c49539aeadb3fe46a3c9bd190f1ecf86b6f9ebd71957534c95f04"},
   "connection": {:hex, :connection, "1.1.0", "ff2a49c4b75b6fb3e674bfc5536451607270aac754ffd1bdfe175abe4a6d7a68", [:mix], [], "hexpm", "722c1eb0a418fbe91ba7bd59a47e28008a189d47e37e0e7bb85585a016b2869c"},