Detect the HTML body's charset and convert it to UTF-8 before parsing it

Signed-off-by: Thomas Citharel <tcit@tcit.fr>
This commit is contained in:
Thomas Citharel 2022-01-18 12:47:45 +01:00
parent c8735e5837
commit fbe5a8d0c4
No known key found for this signature in database
GPG Key ID: A061B9DDE0CA0773
4 changed files with 83 additions and 0 deletions

View File

@ -185,6 +185,12 @@ config :phoenix, :filter_parameters, ["password", "token"]
config :absinthe, schema: Mobilizon.GraphQL.Schema
config :absinthe, Absinthe.Logger, filter_variables: ["token", "password", "secret"]
# Encodings made available to Codepagex. The rich media parser decodes fetched
# HTML bodies with Codepagex.to_string!/2 and maps "windows-1252" to the CP1252
# vendor name listed below.
# NOTE(review): presumably only the encodings listed here get compiled in, so a
# page declaring any other charset cannot be converted - confirm against the
# Codepagex documentation.
config :codepagex, :encodings, [
:ascii,
~r[iso8859]i,
:"VENDORS/MICSFT/WINDOWS/CP1252"
]
config :mobilizon, Mobilizon.Web.Gettext, split_module_by: [:locale, :domain]
config :ex_cldr,

View File

@ -74,6 +74,7 @@ defmodule Mobilizon.Service.RichMedia.Parser do
{:is_html, _response_headers, true} <-
{:is_html, response_headers, is_html(response_headers)} do
body
|> convert_utf8(response_headers)
|> maybe_parse()
|> Map.put(:url, url)
|> maybe_add_favicon()
@ -317,4 +318,78 @@ defmodule Mobilizon.Service.RichMedia.Parser do
defp default_user_agent(_url) do
Config.instance_user_agent()
end
# Converts the fetched `body` based on the charset information available for
# the response: the "Content-Type" header is looked up in `headers` and handed,
# together with the body, to handle_charset/2, which decides whether and how
# the body gets converted.
defp convert_utf8(body, headers) do
  content_type = get_header(headers, "Content-Type")
  handle_charset(content_type, body)
end
# Picks the charset to convert `body` from.
#
# Without a Content-Type value, the charset is looked up in the document's
# <meta> elements; when nothing (or an empty charset) is found there, the body
# is returned untouched.
defp handle_charset(nil, body) do
  case detect_charset_from_meta(body) do
    missing when missing in [nil, ""] -> body
    declared_charset -> convert_body(body, declared_charset)
  end
end

# With a Content-Type value, its charset parameter wins; when the header does
# not carry one, fall back to the <meta> based detection above.
defp handle_charset(content_type, body) do
  with nil <- charset_from_content_type(content_type) do
    handle_charset(nil, body)
  else
    header_charset -> convert_body(body, header_charset)
  end
end
# Extracts the `charset` parameter from a Content-Type value such as
# "text/html; charset=iso-8859-1".
#
# The value is split on its first ";" and everything after it is parsed with
# Utils.params/1. Returns nil when there is no parameter section or when the
# parsed parameters hold no "charset" key.
defp charset_from_content_type(content_type) do
  case :binary.split(content_type, ";") do
    [_media_type, raw_params] ->
      case Utils.params(raw_params) do
        %{"charset" => charset} -> charset
        _ -> nil
      end

    _ ->
      nil
  end
end
# Parses `body` as an HTML document and looks for a
# <meta http-equiv="content-type"> element, trying the lowercase attribute
# value first and the "Content-Type" spelling second. Returns the charset
# declared by the first element found, or nil when neither selector matches.
defp detect_charset_from_meta(body) do
  Logger.debug("Trying to detect charset from meta")
  parsed = Floki.parse_document!(body)

  selectors = [
    "meta[http-equiv=\"content-type\"]",
    "meta[http-equiv=\"Content-Type\"]"
  ]

  # Stops at the first selector that yields an element; later selectors are
  # not evaluated.
  matching_meta =
    Enum.find_value(selectors, fn selector ->
      parsed
      |> Floki.find(selector)
      |> List.first()
    end)

  case matching_meta do
    nil -> nil
    meta -> content_type_from_meta(meta)
  end
end
# Reads the `content` attribute of a <meta http-equiv="content-type"> element
# and returns the charset it declares.
#
# Returns nil when the element has no `content` attribute (instead of raising
# on `String.trim(nil)`), or when the attribute carries no charset parameter.
defp content_type_from_meta(meta) do
  Logger.debug("Finding content-type into <meta> element")

  case Floki.attribute(meta, "content") do
    [content | _] ->
      content
      |> String.trim()
      |> charset_from_content_type()

    [] ->
      # A <meta http-equiv=...> without any content attribute: nothing to read.
      nil
  end
end
# Converts `body` from the declared `charset` into an Elixir (UTF-8) string.
#
# Charset labels are case-insensitive, so every spelling of "utf-8" (servers
# very commonly send "UTF-8") as well as the "utf8" label leaves the body
# untouched instead of being handed to Codepagex under a name it does not
# know. Any other charset is translated with fix_charset/1 and decoded with
# Codepagex.to_string!/2 (the bang variant, so a failed conversion raises).
defp convert_body(body, charset) do
  if String.downcase(charset) in ["utf-8", "utf8"] do
    body
  else
    Logger.debug("Converting body from #{charset}")
    Codepagex.to_string!(body, fix_charset(charset))
  end
end
# Translates a charset label found in a Content-Type value into the encoding
# name handed to Codepagex.
#
# "windows-1252" is matched case-insensitively (charset labels are not
# case-sensitive) and mapped to the CP1252 vendor encoding. Every other label
# is returned with its dashes replaced by underscores, casing untouched.
defp fix_charset(charset) do
  case String.downcase(charset) do
    "windows-1252" -> :"VENDORS/MICSFT/WINDOWS/CP1252"
    _ -> String.replace(charset, "-", "_")
  end
end
end

View File

@ -203,6 +203,7 @@ defmodule Mobilizon.Mixfile do
{:export, "~> 0.1.0"},
{:tz_world, "~> 1.0"},
{:tzdata, "~> 1.1"},
{:codepagex, "~> 0.1.6"},
# Dev and test dependencies
{:phoenix_live_reload, "~> 1.2", only: [:dev, :e2e]},
{:ex_machina, "~> 2.3", only: [:dev, :test]},

View File

@ -11,6 +11,7 @@
"cachex": {:hex, :cachex, "3.4.0", "868b2959ea4aeb328c6b60ff66c8d5123c083466ad3c33d3d8b5f142e13101fb", [:mix], [{:eternal, "~> 1.2", [hex: :eternal, repo: "hexpm", optional: false]}, {:jumper, "~> 1.0", [hex: :jumper, repo: "hexpm", optional: false]}, {:sleeplocks, "~> 1.1", [hex: :sleeplocks, repo: "hexpm", optional: false]}, {:unsafe, "~> 1.0", [hex: :unsafe, repo: "hexpm", optional: false]}], "hexpm", "370123b1ab4fba4d2965fb18f87fd758325709787c8c5fce35b3fe80645ccbe5"},
"certifi": {:hex, :certifi, "2.8.0", "d4fb0a6bb20b7c9c3643e22507e42f356ac090a1dcea9ab99e27e0376d695eba", [:rebar3], [], "hexpm", "6ac7efc1c6f8600b08d625292d4bbf584e14847ce1b6b5c44d983d273e1097ea"},
"cldr_utils": {:hex, :cldr_utils, "2.17.0", "05453797e5b89f936c54c5602ac881e46b1ba4423a803c27a414466f4b598c94", [:mix], [{:castore, "~> 0.1", [hex: :castore, repo: "hexpm", optional: true]}, {:certifi, "~> 2.5", [hex: :certifi, repo: "hexpm", optional: true]}, {:decimal, "~> 1.9 or ~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6077ddaaa155f27755638225617bdc00c004f39b3c9355b688e52a3fc98d57e8"},
"codepagex": {:hex, :codepagex, "0.1.6", "49110d09a25ee336a983281a48ef883da4c6190481e0b063afe2db481af6117e", [:mix], [], "hexpm", "1521461097dde281edf084062f525a4edc6a5e49f4fd1f5ec41c9c4955d5bd59"},
"combine": {:hex, :combine, "0.10.0", "eff8224eeb56498a2af13011d142c5e7997a80c8f5b97c499f84c841032e429f", [:mix], [], "hexpm", "1b1dbc1790073076580d0d1d64e42eae2366583e7aecd455d1215b0d16f2451b"},
"comeonin": {:hex, :comeonin, "5.3.2", "5c2f893d05c56ae3f5e24c1b983c2d5dfb88c6d979c9287a76a7feb1e1d8d646", [:mix], [], "hexpm", "d0993402844c49539aeadb3fe46a3c9bd190f1ecf86b6f9ebd71957534c95f04"},
"connection": {:hex, :connection, "1.1.0", "ff2a49c4b75b6fb3e674bfc5536451607270aac754ffd1bdfe175abe4a6d7a68", [:mix], [], "hexpm", "722c1eb0a418fbe91ba7bd59a47e28008a189d47e37e0e7bb85585a016b2869c"},