Ruby on Rails: uploading a large file to S3 from a direct URL with the AWS SDK


In my application I currently support uploading from a direct download URL: the user can either enter a URL or generate one with the Box file picker widget. I do this with a Net::HTTP request, writing each segment to the filesystem as it arrives.

Now I want to store the file from the URL in S3 instead, because the files are too large to fit in memory.

Here is a snippet of what I'm currently working on:

queue = Queue.new
up_url = presigned_url_from_aws
down_uri = remote_download_url

producer = Thread.new do
  # stream the file from the url,
  # (code based on something currently working)
  Net::HTTP.start(down_uri.host, down_uri.port, :use_ssl => (down_uri.scheme == 'https')) {|http|
    http.request_get(down_uri.path) {|res|

      res.read_body {|seg|
        queue << seg
        update_progress()
      }
    }
  }
end

consumer = Thread.new do
  # turn queue input into body_stream ?
end

# Use presigned url to upload file to aws
Net::HTTP.start(up_url.host) do |http|
  http.send_request("PUT", up_url.request_uri, body_stream, {
      # This is required, or Net::HTTP will add a default unsigned content-type.
      "content-type" => "",
  })
end
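
For reference, Net::HTTP can stream a request body from any IO-like object through body_stream, and an IO.pipe is one way to connect a producer thread to it. Below is a minimal sketch of that idea, reusing up_url and down_uri from the snippet above and assuming the total size is known up front as total_size (a presigned S3 PUT generally needs a Content-Length). This is not the solution I ended up with, but it shows the plumbing:

require 'net/http'

rd, wr = IO.pipe

# Producer: stream the remote file into the write end of the pipe.
pipe_writer = Thread.new do
  begin
    Net::HTTP.start(down_uri.host, down_uri.port, :use_ssl => (down_uri.scheme == 'https')) {|http|
      http.request_get(down_uri.path) {|res|
        res.read_body {|seg| wr.write(seg) }
      }
    }
  ensure
    wr.close # signals EOF to the reader side
  end
end

# Consumer: PUT to the presigned URL, reading the body from the pipe.
Net::HTTP.start(up_url.host, up_url.port, :use_ssl => (up_url.scheme == 'https')) do |http|
  put = Net::HTTP::Put.new(up_url.request_uri, {"content-type" => ""})
  put.body_stream = rd
  put.content_length = total_size # hypothetical; without a length Net::HTTP needs chunked encoding, which S3 rejects
  http.request(put)
end

pipe_writer.join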
I eventually found a solution that works. As before, this code lives in a ProgressJob class. I use the AWS multipart upload API. I create a queue of segments, a producer thread that puts segments into the queue, a consumer thread that takes segments off the queue for further processing, and a controller thread that closes the queue at the right time. In the consumer thread I accumulate segments in a StringIO until each part (except the last) is at least 5 MB, the minimum part size for a multipart upload, and send each part to S3 as soon as it is ready, to avoid filling up disk or memory. There were a lot of pitfalls, but here is the working code I ended up with, in case it helps someone else:

require 'tempfile'
require 'open-uri'
require 'fileutils'
require 'net/http'
require 'aws-sdk-s3'

class CreateDatafileFromRemoteJob < ProgressJob::Base

  Thread.abort_on_exception=true

  FIVE_MB = 1024 * 1024 * 5

  def initialize(dataset_id, datafile, remote_url, filename, filesize)
    @remote_url = remote_url
    @dataset_id = dataset_id
    @datafile = datafile
    @filename = filename
    @filesize = filesize #string because it is used in display

    if filesize.to_f < 4000
      progress_max = 2
    else
      progress_max = (filesize.to_f / 4000).to_i + 1
    end

    super progress_max: progress_max
  end

  def perform

    more_segs_to_do = true
    upload_incomplete = true

    @datafile.binary_name = @filename
    @datafile.storage_root = Application.storage_manager.draft_root.name
    @datafile.storage_key = File.join(@datafile.web_id, @filename)
    @datafile.binary_size = @filesize
    @datafile.save!

    if IDB_CONFIG[:aws][:s3_mode]

      upload_key = @datafile.storage_key
      upload_bucket = Application.storage_manager.draft_root.bucket

      if Application.storage_manager.draft_root.prefix
        upload_key = "#{Application.storage_manager.draft_root.prefix}#{@datafile.storage_key}"
      end

      client = Application.aws_client

      if @filesize.to_f < FIVE_MB
        web_contents = URI.open(@remote_url) {|f| f.read}
        Application.storage_manager.draft_root.copy_io_to(@datafile.storage_key, web_contents, nil, @filesize.to_f)
        upload_incomplete = false

      else

        parts = []

        seg_queue = Queue.new

        mutex = Mutex.new

        segs_complete = false
        segs_todo = 0
        segs_done = 0

        begin

          upload_id = aws_multipart_start(client, upload_bucket, upload_key)

          seg_producer = Thread.new do

            uri = URI.parse(@remote_url)

            Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == 'https')) {|http|
              http.request_get(uri.path) {|res|

                res.read_body {|seg|
                  mutex.synchronize {
                    segs_todo = segs_todo + 1
                  }
                  seg_queue << seg
                  update_progress
                }
              }
            }
            mutex.synchronize {
              segs_complete = true
            }

          end

          seg_consumer = Thread.new do

            part_number = 1

            partio = StringIO.new("", 'wb+')

            while seg = seg_queue.deq # wait for queue to be closed in controller thread

              partio << seg

              if partio.size > FIVE_MB

                partio.rewind

                mutex.synchronize {

                  etag = aws_upload_part(client, partio, upload_bucket, upload_key, part_number, upload_id)

                  parts_hash = {etag: etag, part_number: part_number}

                  parts.push(parts_hash)

                }

                part_number = part_number + 1

                partio.close unless partio.closed?

                partio = StringIO.new("", 'wb+')

              end

              mutex.synchronize {
                segs_done = segs_done + 1
              }

            end

            # upload last part, less than 5 MB
            mutex.synchronize {

              partio.rewind

              etag = aws_upload_part(client, partio, upload_bucket, upload_key, part_number, upload_id)

              parts_hash = {etag: etag, part_number: part_number}

              parts.push(parts_hash)

              Rails.logger.warn("Another part bites the dust: #{part_number}")

              partio.close unless partio.closed?

              aws_complete_upload(client, upload_bucket, upload_key, parts, upload_id)

              upload_incomplete = false
            }

          end

          controller = Thread.new do

            while more_segs_to_do
              sleep 0.9
              mutex.synchronize {
                if segs_complete && ( segs_done == segs_todo)
                  more_segs_to_do = false
                end
              }
            end

            seg_queue.close

          end

        rescue Exception => ex
          # Something went wrong: log the details, then abort the multipart
          # upload so S3 does not keep the orphaned parts around.

          Rails.logger.warn("something went wrong during multipart upload")
          Rails.logger.warn(ex.class)
          Rails.logger.warn(ex.message)
          ex.backtrace.each do |line|
            Rails.logger.warn(line)
          end

          Application.aws_client.abort_multipart_upload({
                                                            bucket: upload_bucket,
                                                            key: upload_key,
                                                            upload_id: upload_id,
                                                        })
          raise ex

        end

      end

    else

      filepath = "#{Application.storage_manager.draft_root.path}/#{@datafile.storage_key}"

      dir_name = File.dirname(filepath)

      FileUtils.mkdir_p(dir_name) unless File.directory?(dir_name)

      File.open(filepath, 'wb+') do |outfile|
        uri = URI.parse(@remote_url)
        Net::HTTP.start(uri.host, uri.port, :use_ssl => (uri.scheme == 'https')) {|http|
          http.request_get(uri.path) {|res|

            res.read_body {|seg|
              outfile << seg
              update_progress
            }
          }
        }

      end

      upload_incomplete = false

    end

    while upload_incomplete
      sleep 1.3
    end

  end

  def aws_multipart_start(client, upload_bucket, upload_key)
    start_response = client.create_multipart_upload({
                                                        bucket: upload_bucket,
                                                        key: upload_key,
                                                    })

    start_response.upload_id

  end

  def aws_upload_part(client, partio, upload_bucket, upload_key, part_number, upload_id)

    part_response = client.upload_part({
                                           body: partio,
                                           bucket: upload_bucket,
                                           key: upload_key,
                                           part_number: part_number,
                                           upload_id: upload_id,
                                       })

    part_response.etag


  end

  def aws_complete_upload(client, upload_bucket, upload_key, parts, upload_id)

    response = client.complete_multipart_upload({
                                                    bucket: upload_bucket,
                                                    key: upload_key,
                                                    multipart_upload: {parts: parts, },
                                                    upload_id: upload_id,
                                                })
  end

end
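
For completeness: ProgressJob::Base comes from the progress_job gem, which sits on top of Delayed::Job, so the job is enqueued rather than run inline. Assuming a standard Delayed::Job setup, the call site looks roughly like this (the variable names are illustrative):

# Illustrative only: enqueue the job wherever the remote-URL upload is kicked off.
# datafile, remote_url, filename and filesize come from the request / Box picker.
job = CreateDatafileFromRemoteJob.new(dataset.id, datafile, remote_url, filename, filesize)
Delayed::Job.enqueue(job)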