From ab843dff4cf22ed28071b4fea1a1edbe843215e0 Mon Sep 17 00:00:00 2001 From: Thomas Citharel Date: Sun, 22 Aug 2021 16:17:20 +0200 Subject: [PATCH] Fixed deduplicated files from orphan media being deleted as well Happens when a file is uploaded, then orphaned, and a similar file is used somewhere. The CleanMedia job service didn't consider that case Signed-off-by: Thomas Citharel --- .gitignore | 1 + .../fix_unattached_media_in_body.ex | 107 ------------------ lib/mix/tasks/mobilizon/media/clean_orphan.ex | 4 +- lib/mobilizon/medias/medias.ex | 23 ++-- lib/mobilizon/storage/custom_functions.ex | 10 ++ lib/service/clean_orphan_media.ex | 43 ++++++- test/service/clean_orphan_media_test.exs | 6 +- test/support/factory.ex | 2 +- test/tasks/media/clean_orphan_test.exs | 39 ++++++- 9 files changed, 98 insertions(+), 137 deletions(-) delete mode 100644 lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex create mode 100644 lib/mobilizon/storage/custom_functions.ex diff --git a/.gitignore b/.gitignore index b4b15332c..5b7ebafb6 100644 --- a/.gitignore +++ b/.gitignore @@ -31,6 +31,7 @@ priv/cert/ cover/ site/ test/fixtures/image_tmp.jpg +test/fixtures/picture_tmp.png test/fixtures/DSCN0010_tmp.jpg test/uploads/ uploads/* diff --git a/lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex b/lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex deleted file mode 100644 index d1934bad7..000000000 --- a/lib/mix/tasks/mobilizon/maintenance/fix_unattached_media_in_body.ex +++ /dev/null @@ -1,107 +0,0 @@ -defmodule Mix.Tasks.Mobilizon.Maintenance.FixUnattachedMediaInBody do - @moduledoc """ - Task to reattach media files that were added in event, post or comment bodies without being attached to their entities. - - This task should only be run once. - """ - use Mix.Task - - import Mix.Tasks.Mobilizon.Common - alias Mobilizon.{Discussions, Events, Medias, Posts} - alias Mobilizon.Discussions.Comment - alias Mobilizon.Events.Event - alias Mobilizon.Posts.Post - alias Mobilizon.Storage.Repo - require Logger - - @preferred_cli_env "prod" - - # TODO: Remove me in Mobilizon 1.2 - - @shortdoc "Reattaches inline media from events and posts" - def run([]) do - start_mobilizon() - - shell_info("Going to extract pictures from events") - extract_inline_pictures_from_bodies(Event) - shell_info("Going to extract pictures from posts") - extract_inline_pictures_from_bodies(Post) - shell_info("Going to extract pictures from comments") - extract_inline_pictures_from_bodies(Comment) - end - - defp extract_inline_pictures_from_bodies(entity) do - Repo.transaction( - fn -> - entity - |> Repo.stream() - |> Stream.map(&extract_pictures(&1)) - |> Stream.map(fn {entity, pics} -> save_entity(entity, pics) end) - |> Stream.run() - end, - timeout: :infinity - ) - end - - defp extract_pictures(entity) do - extracted_pictures = entity |> get_body() |> parse_body() |> get_media_entities_from_urls() - - attached_picture = entity |> get_picture() |> get_media_entity_from_media_id() - attached_pictures = [attached_picture] |> Enum.filter(& &1) - - {entity, extracted_pictures ++ attached_pictures} - end - - defp get_body(%Event{description: description}), do: description - defp get_body(%Post{body: body}), do: body - defp get_body(%Comment{text: text}), do: text - - defp get_picture(%Event{picture_id: picture_id}), do: picture_id - defp get_picture(%Post{picture_id: picture_id}), do: picture_id - defp get_picture(%Comment{}), do: nil - - defp parse_body(nil), do: [] - - defp parse_body(body) do - with res <- Regex.scan(~r/]*?src\s*=\s*['\"]([^'\"]*?)['\"][^>]*?>/, body), - res <- Enum.map(res, fn [_, res] -> res end) do - res - end - end - - defp get_media_entities_from_urls(media_urls) do - media_urls - |> Enum.map(fn media_url -> - # We prefer orphan media, but fallback on already attached media just in case - Medias.get_unattached_media_by_url(media_url) || Medias.get_media_by_url(media_url) - end) - |> Enum.filter(& &1) - end - - defp get_media_entity_from_media_id(nil), do: nil - - defp get_media_entity_from_media_id(media_id) do - Medias.get_media(media_id) - end - - defp save_entity(%Event{} = _event, []), do: :ok - - defp save_entity(%Event{} = event, media) do - event = Repo.preload(event, [:contacts, :media]) - Events.update_event(event, %{media: media}) - end - - defp save_entity(%Post{} = _post, []), do: :ok - - defp save_entity(%Post{} = post, media) do - post = Repo.preload(post, [:media]) - Posts.update_post(post, %{media: media}) - end - - defp save_entity(%Comment{} = _comment, []), do: :ok - - defp save_entity(%Comment{} = comment, media) do - comment = Repo.preload(comment, [:media]) - Discussions.update_comment(comment, %{media: media}) - end -end diff --git a/lib/mix/tasks/mobilizon/media/clean_orphan.ex b/lib/mix/tasks/mobilizon/media/clean_orphan.ex index 7b3ee34e5..09ba6a37f 100644 --- a/lib/mix/tasks/mobilizon/media/clean_orphan.ex +++ b/lib/mix/tasks/mobilizon/media/clean_orphan.ex @@ -64,7 +64,9 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphan do end Enum.each(medias, fn media -> - shell_info("ID: #{media.id}, Actor: #{media.actor_id}, URL: #{media.file.url}") + shell_info( + "ID: #{hd(media).id}, Actor: #{hd(media).actor_id}, Deduplicated #{length(media)} times, URL: #{hd(media).file.url}" + ) end) end diff --git a/lib/mobilizon/medias/medias.ex b/lib/mobilizon/medias/medias.ex index 565b828c7..7b4362621 100644 --- a/lib/mobilizon/medias/medias.ex +++ b/lib/mobilizon/medias/medias.ex @@ -4,6 +4,7 @@ defmodule Mobilizon.Medias do """ import Ecto.Query + import Mobilizon.Storage.CustomFunctions alias Ecto.Multi @@ -40,23 +41,15 @@ defmodule Mobilizon.Medias do end @doc """ - Get an unattached media by it's URL + Get all media by an URL. """ - def get_unattached_media_by_url(url) do + @spec get_all_media_by_url(String.t()) :: Media.t() | nil + def get_all_media_by_url(url) do url + |> String.split("?", parts: 2) + |> hd |> media_by_url_query() - |> join(:left, [m], e in assoc(m, :events)) - |> join(:left, [m], ep in assoc(m, :event_picture)) - |> join(:left, [m], p in assoc(m, :posts)) - |> join(:left, [m], pp in assoc(m, :posts_picture)) - |> join(:left, [m], c in assoc(m, :comments)) - |> where([_m, e], is_nil(e.id)) - |> where([_m, _e, ep], is_nil(ep.id)) - |> where([_m, _e, _ep, p], is_nil(p.id)) - |> where([_m, _e, _ep, _p, pp], is_nil(pp.id)) - |> where([_m, _e, _ep, _p, _pp, c], is_nil(c.id)) - |> limit(1) - |> Repo.one() + |> Repo.all() end @doc """ @@ -163,7 +156,7 @@ defmodule Mobilizon.Medias do defp media_by_url_query(url) do from( p in Media, - where: fragment("? @> ?", p.file, ~s|{"url": "#{url}"}|) + where: split_part(fragment("file->>'url'"), "?", 1) == ^url ) end diff --git a/lib/mobilizon/storage/custom_functions.ex b/lib/mobilizon/storage/custom_functions.ex new file mode 100644 index 000000000..eab327234 --- /dev/null +++ b/lib/mobilizon/storage/custom_functions.ex @@ -0,0 +1,10 @@ +defmodule Mobilizon.Storage.CustomFunctions do + @moduledoc """ + Helper module for custom PostgreSQL functions + """ + defmacro split_part(string, delimiter, position) do + quote do + fragment("split_part(?, ?, ?)", unquote(string), unquote(delimiter), unquote(position)) + end + end +end diff --git a/lib/service/clean_orphan_media.ex b/lib/service/clean_orphan_media.ex index 66703d30c..27c674aa5 100644 --- a/lib/service/clean_orphan_media.ex +++ b/lib/service/clean_orphan_media.ex @@ -5,6 +5,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do alias Mobilizon.Actors.Actor alias Mobilizon.Medias + alias Mobilizon.Medias.File alias Mobilizon.Medias.Media alias Mobilizon.Storage.Repo import Ecto.Query @@ -25,8 +26,10 @@ defmodule Mobilizon.Service.CleanOrphanMedia do if Keyword.get(opts, :dry_run, false) do {:ok, medias} else - Enum.each(medias, fn media -> - Medias.delete_media(media, ignore_file_not_found: true) + Enum.each(medias, fn media_list -> + Enum.each(media_list, fn media -> + Medias.delete_media(media, ignore_file_not_found: true) + end) end) {:ok, medias} @@ -49,7 +52,7 @@ defmodule Mobilizon.Service.CleanOrphanMedia do |> Enum.join(" UNION ") |> (&"NOT EXISTS(#{&1})").() - @spec find_media(Keyword.t()) :: list(Media.t()) + @spec find_media(Keyword.t()) :: list(list(Media.t())) defp find_media(opts) do default_grace_period = Mobilizon.Config.get([:instance, :orphan_upload_grace_period_hours], 48) @@ -68,6 +71,38 @@ defmodule Mobilizon.Service.CleanOrphanMedia do where: fragment(@union_query) ) - Repo.all(query) + query + |> Repo.all() + |> Enum.filter(fn %Media{file: %File{url: url}} -> + is_all_media_orphan?(url, expiration_date) + end) + |> Enum.chunk_by(fn %Media{file: %File{url: url}} -> + url + |> String.split("?", parts: 2) + |> hd + end) + end + + def is_all_media_orphan?(url, expiration_date) do + url + |> Medias.get_all_media_by_url() + |> Enum.all?(&is_media_orphan?(&1, expiration_date)) + end + + @spec is_media_orphan?(Media.t(), DateTime.t()) :: boolean() + def is_media_orphan?(%Media{id: media_id}, expiration_date) do + query = + from(m in Media, + as: :media, + distinct: true, + join: a in Actor, + on: a.id == m.actor_id, + where: m.id == ^media_id, + where: is_nil(a.domain), + where: m.inserted_at < ^expiration_date, + where: fragment(@union_query) + ) + + Repo.exists?(query) end end diff --git a/test/service/clean_orphan_media_test.exs b/test/service/clean_orphan_media_test.exs index 21c1c2264..f0ca3197d 100644 --- a/test/service/clean_orphan_media_test.exs +++ b/test/service/clean_orphan_media_test.exs @@ -16,7 +16,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_2_id)) - assert {:ok, [found_media]} = CleanOrphanMedia.clean() + assert {:ok, [[found_media]]} = CleanOrphanMedia.clean() assert found_media.id == media_id assert is_nil(Medias.get_media(media_id)) @@ -31,7 +31,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_2_id)) - assert {:ok, [found_media]} = CleanOrphanMedia.clean(dry_run: true) + assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(dry_run: true) assert found_media.id == media_id refute is_nil(Medias.get_media(media_id)) @@ -46,7 +46,7 @@ defmodule Mobilizon.Service.CleanOrphanMediaTest do refute is_nil(Medias.get_media(media_id)) refute is_nil(Medias.get_media(media_2_id)) - assert {:ok, [found_media]} = CleanOrphanMedia.clean(grace_period: 12) + assert {:ok, [[found_media]]} = CleanOrphanMedia.clean(grace_period: 12) assert found_media.id == media_id assert is_nil(Medias.get_media(media_id)) diff --git a/test/support/factory.ex b/test/support/factory.ex index db1a40967..953ac8c0b 100644 --- a/test/support/factory.ex +++ b/test/support/factory.ex @@ -280,7 +280,7 @@ defmodule Mobilizon.Factory do %Mobilizon.Medias.File{ name: "My Media", url: url, - content_type: "image/png", + content_type: "image/jpg", size: 13_120 } end diff --git a/test/tasks/media/clean_orphan_test.exs b/test/tasks/media/clean_orphan_test.exs index 0aeb7e00b..83440dcf2 100644 --- a/test/tasks/media/clean_orphan_test.exs +++ b/test/tasks/media/clean_orphan_test.exs @@ -43,21 +43,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do test "with media returned" do media1 = insert(:media) media2 = insert(:media) + media3 = insert(:media, file: create_file()) with_mock CleanOrphanMedia, - clean: fn [dry_run: true, grace_period: 48] -> {:ok, [media1, media2]} end do + clean: fn [dry_run: true, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do CleanOrphan.run(["--dry-run"]) assert_received {:mix_shell, :info, [output_received]} assert output_received == "List of files that would have been deleted" assert_received {:mix_shell, :info, [output_received]} assert output_received == - "ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}" + "ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}" assert_received {:mix_shell, :info, [output_received]} assert output_received == - "ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}" + "ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}" assert_received {:mix_shell, :info, [output_received]} assert output_received == "2 files would have been deleted" @@ -78,21 +79,22 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do test "with media returned" do media1 = insert(:media) media2 = insert(:media) + media3 = insert(:media, file: create_file()) with_mock CleanOrphanMedia, - clean: fn [dry_run: false, grace_period: 48] -> {:ok, [media1, media2]} end do + clean: fn [dry_run: false, grace_period: 48] -> {:ok, [[media1, media2], [media3]]} end do CleanOrphan.run(["--verbose"]) assert_received {:mix_shell, :info, [output_received]} assert output_received == "List of files that have been deleted" assert_received {:mix_shell, :info, [output_received]} assert output_received == - "ID: #{media1.id}, Actor: #{media1.actor_id}, URL: #{media1.file.url}" + "ID: #{media1.id}, Actor: #{media1.actor_id}, Deduplicated 2 times, URL: #{media1.file.url}" assert_received {:mix_shell, :info, [output_received]} assert output_received == - "ID: #{media2.id}, Actor: #{media2.actor_id}, URL: #{media2.file.url}" + "ID: #{media3.id}, Actor: #{media3.actor_id}, Deduplicated 1 times, URL: #{media3.file.url}" assert_received {:mix_shell, :info, [output_received]} assert output_received == "2 files have been deleted" @@ -121,4 +123,29 @@ defmodule Mix.Tasks.Mobilizon.Media.CleanOrphanTest do end end end + + defp create_file do + File.cp!("test/fixtures/picture.png", "test/fixtures/picture_tmp.png") + + file = %Plug.Upload{ + content_type: "image/png", + path: Path.absname("test/fixtures/picture_tmp.png"), + filename: "image.png" + } + + {:ok, data} = Mobilizon.Web.Upload.store(file) + + %{ + content_type: "image/png", + name: "image.png", + url: url + } = data + + %Mobilizon.Medias.File{ + name: "My Media", + url: url, + content_type: "image/png", + size: 13_120 + } + end end