Skip to content

Commit

Permalink
Download indexed videos (#10)
Browse files Browse the repository at this point in the history
* Clarified documentation

* more comments

* [WIP] hooked up basic video downloading; starting work on metadata

* Added metadata model and parsing

Adding the metadata model made me realize that, in many cases, yt-dlp
writes undesired output to stdout, which breaks parsing. In order to get
the metadata model working, I had to change the way in which the app
interacts with yt-dlp. Now, output is written to a file on disk, which
is immediately re-read and returned.

* Added tests for video download worker

* Hooked up video downloading to the channel indexing pipeline

* Adds tasks for media items

* Updated video metadata parser to extract the title
  • Loading branch information
kieraneglin authored Jan 31, 2024
1 parent 445e7c7 commit b202b3b
Show file tree
Hide file tree
Showing 48 changed files with 771 additions and 155 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ npm-debug.log

/.elixir_ls
.env
.DS_Store
2 changes: 2 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -29,5 +29,7 @@ RUN chmod +x ./docker-run.sh

# Install Elixir deps
RUN mix deps.get
# Gives us iex shell history
ENV ERL_AFLAGS="-kernel shell_history enabled"

EXPOSE 4008
3 changes: 2 additions & 1 deletion config/config.exs
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,8 @@ config :pinchflat,
yt_dlp_executable: System.find_executable("yt-dlp"),
yt_dlp_runner: Pinchflat.MediaClient.Backends.YtDlp.CommandRunner,
# TODO: figure this out
media_directory: :not_implemented
media_directory: :not_implemented,
metadata_directory: Path.join([System.tmp_dir!(), "pinchflat", "metadata"])

# Configures the endpoint
config :pinchflat, PinchflatWeb.Endpoint,
Expand Down
3 changes: 2 additions & 1 deletion config/dev.exs
Original file line number Diff line number Diff line change
@@ -1,7 +1,8 @@
import Config

config :pinchflat,
media_directory: Path.join([System.tmp_dir!(), "yt-dlp"])
media_directory: Path.join([File.cwd!(), "tmp", "videos"]),
metadata_directory: Path.join([File.cwd!(), "tmp", "metadata"])

# Configure your database
config :pinchflat, Pinchflat.Repo,
Expand Down
3 changes: 2 additions & 1 deletion config/test.exs
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,8 @@ import Config
config :pinchflat,
# Specifying backend data here makes mocking and local testing SUPER easy
yt_dlp_executable: Path.join([File.cwd!(), "/test/support/scripts/yt-dlp-mocks/repeater.sh"]),
media_directory: Path.join([System.tmp_dir!(), "yt-dlp"])
media_directory: Path.join([System.tmp_dir!(), "videos"]),
metadata_directory: Path.join([System.tmp_dir!(), "metadata"])

config :pinchflat, Oban, testing: :manual

Expand Down
25 changes: 22 additions & 3 deletions lib/pinchflat/media.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,9 +4,11 @@ defmodule Pinchflat.Media do
"""

import Ecto.Query, warn: false
alias Pinchflat.Repo

alias Pinchflat.Repo
alias Pinchflat.Tasks
alias Pinchflat.Media.MediaItem
alias Pinchflat.MediaSource.Channel

@doc """
Returns the list of media_items. Returns [%MediaItem{}, ...].
Expand All @@ -15,6 +17,20 @@ defmodule Pinchflat.Media do
Repo.all(MediaItem)
end

@doc """
Returns a list of pending media_items for a given channel, where
pending means the `video_filepath` is `nil`.
Returns [%MediaItem{}, ...].
"""
def list_pending_media_items_for(%Channel{} = channel) do
from(
m in MediaItem,
where: m.channel_id == ^channel.id and is_nil(m.video_filepath)
)
|> Repo.all()
end

@doc """
Gets a single media_item.
Expand All @@ -25,7 +41,7 @@ defmodule Pinchflat.Media do
@doc """
Creates a media_item. Returns {:ok, %MediaItem{}} | {:error, %Ecto.Changeset{}}.
"""
def create_media_item(attrs \\ %{}) do
def create_media_item(attrs) do
%MediaItem{}
|> MediaItem.changeset(attrs)
|> Repo.insert()
Expand All @@ -41,9 +57,12 @@ defmodule Pinchflat.Media do
end

@doc """
Deletes a media_item. Returns {:ok, %MediaItem{}} | {:error, %Ecto.Changeset{}}.
Deletes a media_item and its associated tasks.
Returns {:ok, %MediaItem{}} | {:error, %Ecto.Changeset{}}.
"""
def delete_media_item(%MediaItem{} = media_item) do
Tasks.delete_tasks_for(media_item)
Repo.delete(media_item)
end

Expand Down
9 changes: 7 additions & 2 deletions lib/pinchflat/media/media_item.ex
Original file line number Diff line number Diff line change
Expand Up @@ -6,27 +6,32 @@ defmodule Pinchflat.Media.MediaItem do
use Ecto.Schema
import Ecto.Changeset

alias Pinchflat.Tasks.Task
alias Pinchflat.MediaSource.Channel
alias Pinchflat.Media.MediaMetadata

@required_fields ~w(media_id channel_id)a
@allowed_fields ~w(title media_id video_filepath channel_id)a

# IDEA: consider making an attached `metadata` model to store the JSON response from whatever backend is used

schema "media_items" do
field :title, :string
field :media_id, :string
field :video_filepath, :string

belongs_to :channel, Channel

has_one :metadata, MediaMetadata, on_replace: :update

has_many :tasks, Task

timestamps(type: :utc_datetime)
end

@doc false
def changeset(media_item, attrs) do
media_item
|> cast(attrs, @allowed_fields)
|> cast_assoc(:metadata, with: &MediaMetadata.changeset/2, required: false)
|> validate_required(@required_fields)
|> unique_constraint([:media_id, :channel_id])
end
Expand Down
28 changes: 28 additions & 0 deletions lib/pinchflat/media/media_metadata.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
defmodule Pinchflat.Media.MediaMetadata do
  @moduledoc """
  The MediaMetadata schema. Holds the `client_response` map for the
  media item it belongs to.

  Look. Don't @ me about Metadata vs. Metadatum. I'm very sensitive.
  """

  use Ecto.Schema
  import Ecto.Changeset

  alias Pinchflat.Media.MediaItem

  @required_fields ~w(client_response)a
  @allowed_fields ~w(client_response)a

  schema "media_metadata" do
    field :client_response, :map

    belongs_to :media_item, MediaItem

    timestamps(type: :utc_datetime)
  end

  @doc false
  def changeset(media_metadata, attrs) do
    media_metadata
    |> cast(attrs, @allowed_fields)
    |> validate_required(@required_fields)
    |> unique_constraint([:media_item_id])
  end
end
Original file line number Diff line number Diff line change
Expand Up @@ -3,5 +3,5 @@ defmodule Pinchflat.MediaClient.Backends.BackendCommandRunner do
A behaviour for running CLI commands against a downloader backend
"""

@callback run(binary(), keyword()) :: {:ok, binary()} | {:error, binary(), integer()}
@callback run(binary(), keyword(), binary()) :: {:ok, binary()} | {:error, binary(), integer()}
end
4 changes: 2 additions & 2 deletions lib/pinchflat/media_client/backends/yt_dlp/channel.ex
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ defmodule Pinchflat.MediaClient.Backends.YtDlp.Channel do
Returns {:ok, %ChannelDetails{}} | {:error, any, ...}.
"""
def get_channel_details(channel_url) do
opts = [print: "%(.{channel,channel_id})j", playlist_end: 1]
opts = [playlist_end: 1]

with {:ok, output} <- backend_runner().run(channel_url, opts),
with {:ok, output} <- backend_runner().run(channel_url, opts, "%(.{channel,channel_id})j"),
{:ok, parsed_json} <- Phoenix.json_library().decode(output) do
{:ok, ChannelDetails.new(parsed_json["channel_id"], parsed_json["channel"])}
else
Expand Down
44 changes: 35 additions & 9 deletions lib/pinchflat/media_client/backends/yt_dlp/command_runner.ex
Original file line number Diff line number Diff line change
Expand Up @@ -3,31 +3,57 @@ defmodule Pinchflat.MediaClient.Backends.YtDlp.CommandRunner do
Runs yt-dlp commands using the `System.cmd/3` function
"""

require Logger

alias Pinchflat.Utils.StringUtils
alias Pinchflat.MediaClient.Backends.BackendCommandRunner

@behaviour BackendCommandRunner

@doc """
Runs a yt-dlp command and returns the string output
Runs a yt-dlp command and returns the string output. Saves the output to
a file and then returns its contents because yt-dlp will return warnings
to stdout even if the command is successful, but these will break JSON parsing.
Returns {:ok, binary()} | {:error, output, status}.
# IDEA: deduplicate command opts, keeping the last one on conflict
although possibly not needed (and a LOT easier) if yt-dlp
just ignores duplicate options (ie: look into that)
IDEA: Indexing takes a long time, but the output is actually streamed to stdout.
Maybe we could listen to that stream instead so we can index videos as they're discovered.
See: https://stackoverflow.com/a/49061086/5665799
"""
@impl BackendCommandRunner
def run(url, command_opts) do
def run(url, command_opts, output_template) do
command = backend_executable()
formatted_command_opts = parse_options(command_opts) ++ [url]
# These must stay in exactly this order, hence why I'm giving it its own variable.
# Also, can't use RAM file since yt-dlp needs a concrete filepath.
json_output_path = generate_json_output_path()
print_to_file_opts = [{:print_to_file, output_template}, json_output_path]
formatted_command_opts = [url] ++ parse_options(command_opts ++ print_to_file_opts)

Logger.debug("[yt-dlp] called with: #{Enum.join(formatted_command_opts, " ")}")

case System.cmd(command, formatted_command_opts, stderr_to_stdout: true) do
{output, 0} -> {:ok, output}
{output, status} -> {:error, output, status}
{_, 0} ->
# IDEA: consider deleting the file after reading it
# (even on error? especially on error?)
File.read(json_output_path)

{output, status} ->
{:error, output, status}
end
end

# Builds a randomly-named `.json` filepath inside the configured metadata
# directory, creates the directory (if needed) and an empty file at that path,
# and returns the filepath.
#
# Raises if `:metadata_directory` isn't configured or if the directory/file
# can't be created.
defp generate_json_output_path do
  # `fetch_env!` fails fast with a descriptive error when the key is missing,
  # rather than letting a `nil` blow up inside `Path.join/1`.
  metadata_directory = Application.fetch_env!(:pinchflat, :metadata_directory)
  filepath = Path.join([metadata_directory, "#{StringUtils.random_string(64)}.json"])

  # Ensure the file can be created and written to BEFORE we run the `yt-dlp` command.
  # The bang variants raise a `File.Error` that names the path and the reason.
  File.mkdir_p!(Path.dirname(filepath))
  File.write!(filepath, "")

  filepath
end

# We want to satisfy the following behaviours:
#
# 1. If the key is an atom, convert it to a string and convert it to kebab case (for convenience)
Expand Down Expand Up @@ -55,7 +81,7 @@ defmodule Pinchflat.MediaClient.Backends.YtDlp.CommandRunner do
end

defp parse_option(arg, acc) when is_binary(arg) do
[arg | acc]
acc ++ [arg]
end

defp backend_executable do
Expand Down
27 changes: 27 additions & 0 deletions lib/pinchflat/media_client/backends/yt_dlp/metadata_parser.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
defmodule Pinchflat.MediaClient.Backends.YtDlp.MetadataParser do
  @moduledoc """
  yt-dlp returns a LOT of metadata in its JSON response. Some of it has
  to be pulled out and mapped onto various models.

  For the time being the whole response is also stashed in the
  `media_metadata` table. That may get revisited or trimmed down later,
  but it's better to have it and not need it.
  """

  @doc """
  Takes the parsed JSON response from yt-dlp and builds the attributes
  needed for a media_item, including the attributes for its
  associations.

  Returns map()
  """
  def parse_for_media_item(metadata) do
    metadata
    |> media_item_attrs()
    |> Map.put(:metadata, %{client_response: metadata})
  end

  # Attributes that live directly on the media_item itself
  defp media_item_attrs(metadata) do
    %{title: metadata["title"], video_filepath: metadata["filepath"]}
  end
end
6 changes: 3 additions & 3 deletions lib/pinchflat/media_client/backends/yt_dlp/video.ex
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,15 @@ defmodule Pinchflat.MediaClient.Backends.YtDlp.Video do
"""

@doc """
Downloads a single video (and possible metadata) directly to its
Downloads a single video (and possibly its metadata) directly to its
final destination. Returns the parsed JSON output from yt-dlp.
Returns {:ok, map()} | {:error, any, ...}.
"""
def download(url, command_opts \\ []) do
opts = [:no_simulate, print: "%()j"] ++ command_opts
opts = [:no_simulate] ++ command_opts

with {:ok, output} <- backend_runner().run(url, opts),
with {:ok, output} <- backend_runner().run(url, opts, "after_move:%()j"),
{:ok, parsed_json} <- Phoenix.json_library().decode(output) do
{:ok, parsed_json}
else
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,9 @@ defmodule Pinchflat.MediaClient.Backends.YtDlp.VideoCollection do
"""
def get_video_ids(url, command_opts \\ []) do
runner = Application.get_env(:pinchflat, :yt_dlp_runner)
opts = command_opts ++ [:simulate, :skip_download, print: :id]
opts = command_opts ++ [:simulate, :skip_download]

case runner.run(url, opts) do
case runner.run(url, opts, "%(id)s") do
{:ok, output} -> {:ok, String.split(output, "\n", trim: true)}
res -> res
end
Expand Down
36 changes: 33 additions & 3 deletions lib/pinchflat/media_client/video_downloader.ex
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,41 @@ defmodule Pinchflat.MediaClient.VideoDownloader do
it open-ish for future expansion (just in case).
"""

alias Pinchflat.Repo
alias Pinchflat.Media
alias Pinchflat.Media.MediaItem
alias Pinchflat.Profiles.MediaProfile

alias Pinchflat.MediaClient.Backends.YtDlp.Video, as: YtDlpVideo
alias Pinchflat.Profiles.Options.YtDlp.OptionBuilder, as: YtDlpOptionBuilder
alias Pinchflat.MediaClient.Backends.YtDlp.MetadataParser, as: YtDlpMetadataParser

@doc """
Downloads a single video based on the settings in the given media profile.
Downloads a video for a media item, updating the media item based on the metadata
returned by the backend. Also saves the entire metadata response to the associated
media_metadata record.
Returns {:ok, %ChannelDetails{}} | {:error, any, ...}.
Returns {:ok, %MediaItem{}} | {:error, any, ...any}
"""
def download_for_media_profile(url, %MediaProfile{} = media_profile, backend \\ :yt_dlp) do
def download_for_media_item(%MediaItem{} = media_item, backend \\ :yt_dlp) do
  item_with_preloads = Repo.preload(media_item, [:metadata, channel: :media_profile])
  profile = item_with_preloads.channel.media_profile

  # Anything other than `{:ok, json}` from the download falls straight
  # through `with` and is returned to the caller untouched.
  with {:ok, json} <- download_for_media_profile(media_item.media_id, profile, backend) do
    attrs = metadata_parser(backend).parse_for_media_item(json)

    # The update must receive the record WITH its preloaded associations,
    # otherwise changes to those associations won't be applied.
    Media.update_media_item(item_with_preloads, attrs)
  end
end

defp download_for_media_profile(url, %MediaProfile{} = media_profile, backend) do
option_builder = option_builder(backend)
video_backend = video_backend(backend)
{:ok, options} = option_builder.build(media_profile)
Expand All @@ -37,4 +61,10 @@ defmodule Pinchflat.MediaClient.VideoDownloader do
:yt_dlp -> YtDlpVideo
end
end

# Resolves the metadata parser module for the given backend. Only `:yt_dlp`
# is handled here; any other value raises a `CaseClauseError`.
defp metadata_parser(backend) do
case backend do
:yt_dlp -> YtDlpMetadataParser
end
end
end
Loading

0 comments on commit b202b3b

Please sign in to comment.