检测Elixir/OTP supervisor子生成和终止_Elixir_Otp

检测Elixir/OTP supervisor子生成和终止

elixir

检测Elixir/OTP supervisor子生成和终止,elixir,otp,Elixir,Otp,作为一项学术练习，我正在用长生不老药建立一个工作队列。当前，我的工作人员在创建时必须手动向队列注册（请参见MyQuestion.Worker.start\u link）我希望我的主管在创建/重新启动可用的工作人员时将其注册到队列中，因为这似乎有助于测试工作人员并最大限度地减少耦合有没有办法做到我在下面的代码中描述的MyQuestion.Supervisor defmodule MyQuestion.Supervisor do use Supervisor def start_lin

作为一项学术练习，我正在用长生不老药建立一个工作队列。当前，我的工作人员在创建时必须手动向队列注册（请参见

MyQuestion.Worker.start\u link

）

我希望我的主管在创建/重新启动可用的工作人员时将其注册到队列中，因为这似乎有助于测试工作人员并最大限度地减少耦合

有没有办法做到我在下面的代码中描述的

MyQuestion.Supervisor

defmodule MyQuestion.Supervisor do
  use Supervisor

  def start_link do
    supervisor = Supervisor.start_link(__MODULE__, :ok)
  end

  def init(:ok) do
    children = [
      worker(MyQuestion.JobQueue, []),
      worker(MyQuestion.Worker, [], id: :worker_0),
      worker(MyQuestion.Worker, [], id: :worker_1)]
    supervise(children, strategy: :rest_for_one)
  end

  # LOOKING FOR SOMETHING LIKE THIS
  # on worker spawn, I want to add the worker to the queue
  def child_spawned(pid, {MyQuestion.Worker, _, _}) do
    # add worker to queue
    MyQuestion.JobQueue.add_new_worker(pid)
  end

  # LOOKING FOR SOMETHING LIKE THIS
  # I want some way to do the following (imagine the callback existed)
  def child_terminated(pid, reason, state) 
    # with this information I could tell the job queue to mark 
    # the job associated with the pid as failed and to retry
    # or maybe extract the job id from the worker state, etc.
    MyQuestion.JobQueue.remove_worker(pid)
    MyQuestion.JobQueue.restart_job_for_failed_worker(pid)
  end

end

defmodule MyQuestion.JobQueue do
  def start_link do
    Agent.start_link(fn -> [] end, name: __MODULE__)
  end

  def new_worker(pid) do
    # register pid with agent state in available worker list, etc.
  end

  def add_job(job_description) do
    # find idle worker and run job
    <... snip ...>
  end

  <... snip ...>
end

defmodule MyQuestion.Worker do
  use GenServer
  def start_link do
    # start worker
    {:ok, worker} = GenServer.start_link(__MODULE__, [])

    # Now we have a worker pid, so we can register that pid with the queue
    # I wish this could be in the supervisor or else where.
    MyQuestion.JobQueue.add_new_worker(worker)

    # must return gen server's start link
    {:ok, worker}
  end

  <... snip ...>
end

defmodule MyQuestion.Supervisor do
使用主管
def启动链接do
supervisor=supervisor.start\u链接（\u\u模块\u，：ok）
结束
def init（：ok）do
儿童=[
worker（MyQuestion.JobQueue，[]），
工作者（MyQuestion.worker，[]，id:：worker_0），
worker（MyQuestion.worker，[]，id:：worker_1）]
监督（儿童，策略：：一人休息）
结束
#在找这样的东西吗
#在worker spawn上，我想将worker添加到队列中
def child_派生（pid，{MyQuestion.Worker，u，}）do
#将工作进程添加到队列
MyQuestion.JobQueue.add\u new\u worker（pid）
结束
#在找这样的东西吗
#我想用某种方法来做以下事情（假设回调已经存在）
def子项_已终止（pid、原因、状态）
#有了这些信息，我可以告诉作业队列进行标记
#与pid as关联的作业失败，无法重试
#或者从工作者状态中提取作业id，等等。
MyQuestion.JobQueue.remove_worker（pid）
MyQueue.JobQueue.restart\u job\u for\u failed\u worker（pid）
结束
结束
defmodule MyQuestion.JobQueue do
def启动链接do
Agent.start\u链接（fn->[]结束，名称：\模块\模块）
结束
def新工作人员（pid）do
#在可用工作人员列表等中使用代理状态注册pid。
结束
def添加作业（作业描述）do
#找到空闲的工人并运行作业
结束
结束
defmodulemyquestion.workerdo
使用GenServer
def启动链接do
#开始工作
{：好的，worker}=GenServer.start_链接（_模块，[]）
#现在我们有了一个工作pid，所以我们可以在队列中注册该pid
#我希望这可能是在主管或其他地方。
MyQuestion.JobQueue.add\u new\u worker（worker）
#必须返回发电机服务器的启动链接
{：好的，工人}
结束
结束

他们的关键是调用

过程。监视器（pid）

——然后您将收到调用

处理信息

——并手动调用

主管。启动子系统

，这将为您提供pid

我以前曾尝试使用

处理信息

，但始终无法调用它

Process.monitor（pid）

必须从您希望接收通知的同一进程调用，因此您必须从

handle\u call

函数内部调用它，以将监视器与服务器进程关联起来。可能有一个函数可以将代码作为另一个进程运行（即从进程运行（作业队列pid，fn->process.monitor（pid到监视器）end）），但我找不到任何内容

附件是作业队列的一个非常简单的实现。我对Elixir只了解了一天，所以代码既混乱又不惯用，但我附上它是因为似乎缺少关于这个主题的示例代码

看看HeavyIndustry.JobQueue，

处理信息

，

创建新工人

。这段代码有一个明显的问题：它能够在工作人员崩溃时重新启动工作人员，但无法从该代码启动下一个作业的队列（因为需要

GenServer.call

inside

handle\u info

，这会使我们陷入死锁）。我认为可以通过将启动作业的流程与跟踪作业的流程分离来解决这个问题。如果您运行示例代码，您将注意到，即使队列中还有一个作业，它最终也会停止运行作业（作业

：crash

）

defmodule HeavyIndustry.Supervisor do
  use Supervisor

  def start_link do
    Supervisor.start_link(__MODULE__, :ok)
  end

  def init(:ok) do
    # default to supervising nothing, we will add
    supervise([], strategy: :one_for_one)
  end

  def create_children(supervisor, worker_count) do
    # create the job queue. defaults to no workers
    Supervisor.start_child(supervisor, worker(HeavyIndustry.JobQueue, [[supervisor, worker_count]]))
  end
end

defmodule HeavyIndustry.JobQueue do
  use GenServer

  @job_queue_name __MODULE__

  def start_link(args, _) do
    GenServer.start_link(__MODULE__, args, name: @job_queue_name)
  end

  def init([supervisor, n]) do
    # set some default state
    state = %{
      supervisor: supervisor,
      max_workers: n,
      jobs: [],
      workers: %{
        idle: [],
        busy: []
      }
    }
    {:ok, state}
  end

  def setup() do
    # we want to be aware of worker failures. we hook into this by calling
    # Process.monitor(pid), but this links the calling process with the monitored
    # process. To make sure the calls come to US and not the process that called
    # setup, we create the workers by passing a message to our server process
    state = GenServer.call(@job_queue_name, :setup)

    # gross passing the whole state back here to monitor but the monitoring must
    # be started from the server process and we can't call GenServer.call from
    # inside the :setup call else we deadlock.
    workers = state.workers.idle
    GenServer.call(@job_queue_name, {:monitor_pids, workers})
  end

  def add_job(from, job) do
    # add job to queue
    {:ok, our_job_id} = GenServer.call(@job_queue_name, {:create_job, %{job: job, reply_to: from}})

    # try to run the next job
    case GenServer.call(@job_queue_name, :start_next_job) do
      # started our job
      {:ok, started_job_id = ^our_job_id} -> {:ok, :started}
      # started *a* job
      {:ok, _} -> {:ok, :pending}
      # couldnt start any job but its ok...
      {:error, :no_idle_workers} -> {:ok, :pending}
      # something fell over...
      {:error, e} -> {:error, e}
      # yeah I know this is bad.
      _ -> {:ok}
    end
  end

  def start_next_job do
    GenServer.call(@job_queue_name, :start_next_job)
  end

  ##
  # Internal API
  ##

  def handle_call(:setup, _, state) do
    workers = Enum.map(0..(state.max_workers-1), fn (n) ->
      {:ok, pid} = start_new_worker(state.supervisor)
      pid
    end)
    state = %{state | workers: %{state.workers | idle: workers}}
    {:reply, state, state}
  end

  defp start_new_worker(supervisor) do
    spec = Supervisor.Spec.worker(HeavyIndustry.Worker, [], id: :"Worker.#{:os.system_time}", restart: :temporary)
    # start worker
    Supervisor.start_child(supervisor, spec)
  end

  def handle_call({:monitor_pids, list}, _, state) do
    Enum.each(list, &Process.monitor(&1))
    {:reply, :ok, state}
  end

  def handle_call({:create_job, job}, from, state) do
    job = %{
      job: job.job,
      reply_to: job.reply_to,
      id: :os.system_time, # id for task
      status: :pending, # start pending, go active, then remove
      pid: nil
    }
    # add new job to jobs list
    state = %{state | jobs: state.jobs ++ [job]}
    {:reply, {:ok, job.id}, state}
  end

  def handle_call(:start_next_job, _, state) do
    IO.puts "==> Start Next Job"
    IO.inspect state
    IO.puts "=================="

    reply = case {find_idle_worker(state.workers), find_next_job(state.jobs)} do
      {{:error, :no_idle_workers}, _} ->
        # no workers for job, doesnt matter if we have a job
        {:error, :no_idle_workers}

      {_, nil} ->
        # no job, doesnt matter if we have a worker
        {:error, :no_more_jobs}

      {{:ok, worker}, job} ->
        # have worker, have job, do work

        # update state to set job active and worker busy
        jobs = state.jobs -- [job]
        job = %{job | status: :active, pid: worker}
        jobs = jobs ++ [job]

        idle = state.workers.idle -- [worker]
        busy = state.workers.busy ++ [worker]

        state = %{state | jobs: jobs, workers: %{idle: idle, busy: busy}}

        {:ok, task_id} = Task.start(fn ->
          result = GenServer.call(worker, job.job)

          remove_job(job)
          free_worker(worker)

          send job.reply_to, %{answer: result, job: job.job}

          start_next_job
        end)
        {:ok, job.id}
    end

    {:reply, reply, state}
  end

  defp find_idle_worker(workers) do
    case workers do
      %{idle: [], busy: _} -> {:error, :no_idle_workers}
      %{idle: [worker | idle], busy: busy} -> {:ok, worker}
    end
  end

  defp find_next_job(jobs) do
    jobs |> Enum.find(&(&1.status == :pending))
  end

  defp free_worker(worker) do
    GenServer.call(@job_queue_name, {:free_worker, worker})
  end
  defp remove_job(job) do
    GenServer.call(@job_queue_name, {:remove_job, job})
  end

  def handle_call({:free_worker, worker}, from, state) do
    idle = state.workers.idle ++ [worker]
    busy = state.workers.busy -- [worker]
    {:reply, :ok, %{state | workers: %{idle: idle, busy: busy}}}
  end

  def handle_call({:remove_job, job}, from, state) do
    jobs = state.jobs -- [job]
    {:reply, :ok, %{state | jobs: jobs}}
  end

  def handle_info(msg = {reason, ref, :process, pid, _reason}, state) do
    IO.puts "Worker collapsed: #{reason} #{inspect pid}, clear and restart job"

    # find job for collapsed worker
    # set job to pending again
    job = Enum.find(state.jobs, &(&1.pid == pid))
    fixed_job = %{job | status: :pending, pid: nil}
    jobs = (state.jobs -- [job]) ++ [fixed_job]

    # remote worker from lists
    idle = state.workers.idle -- [pid]
    busy = state.workers.busy -- [pid]

    # start new worker
    {:ok, pid} = start_new_worker(state.supervisor)

    # add worker from lists
    idle = state.workers.idle ++ [pid]

    # cant call GenServer.call from here to monitor pid,
    # so duplicate the code a bit...
    Process.monitor(pid)

    # update state
    state = %{state | jobs: jobs, workers: %{idle: idle, busy: busy}}

    {:noreply, state}
  end
end

defmodule HeavyIndustry.Worker do
  use GenServer

  def start_link do
    GenServer.start_link(__MODULE__, :ok)
  end

  def init(:ok) do
    # workers have no persistent state
    IO.puts "==> Worker up! #{inspect self}"
    {:ok, nil}
  end

  def handle_call({:sum, list}, from, _) do
    sum = Enum.reduce(list, fn (n, acc) -> acc + n end)
    {:reply, sum, nil}
  end

  def handle_call({:fib, n}, from, _) do
    sum = fib_calc(n)
    {:reply, sum, nil}
  end

  def handle_call({:stop}, from, state) do
    {:stop, "my-stop-reason", "my-stop-reply", state}
  end

  def handle_call({:crash}, from, _) do
    {:reply, "this will crash" ++ 1234, nil}
  end

  def handle_call({:timeout}, from, _) do
    :timer.sleep 10000
    {:reply, "this will timeout", nil}
  end

  # Slow fib
  defp fib_calc(0), do: 0
  defp fib_calc(1), do: 1
  defp fib_calc(n), do: fib_calc(n-1) + fib_calc(n-2)

end

defmodule Looper do
  def start do
    {:ok, pid} = HeavyIndustry.Supervisor.start_link
    {:ok, job_queue} = HeavyIndustry.Supervisor.create_children(pid, 2)
    HeavyIndustry.JobQueue.setup()
    add_jobs
    loop
  end

  def add_jobs do
    jobs = [
      {:sum, [100, 200, 300]},
      {:crash},
      {:fib, 35},
      {:fib, 35},
      {:sum, [88, 88, 99]},
      {:fib, 35},
      {:fib, 35},
      {:fib, 35},
      {:sum, 0..100},
      # {:stop}, # stop not really a failure

      {:sum, [88, 88, 99]},
      # {:timeout},
      {:sum, [-1]}
    ]
    Enum.each(jobs, fn (job) ->
      IO.puts "~~~~> Add job: #{inspect job}"
      case HeavyIndustry.JobQueue.add_job(self, job) do
        {:ok, :started} -> IO.puts "~~~~> Started job immediately"
        {:ok, :pending} -> IO.puts "~~~~> Job in queue"
        val -> IO.puts "~~~~> ... val: #{inspect val}"
      end
    end)
  end

  def loop do
    receive do
      value ->
        IO.puts "~~~~> Received: #{inspect value}"
        loop
    end
  end
end

Looper.start