Warning: file_get_contents(/data/phpspider/zhask/data//catemap/5/ruby/20.json): failed to open stream: No such file or directory in /data/phpspider/zhask/libs/function.php on line 167

Warning: Invalid argument supplied for foreach() in /data/phpspider/zhask/libs/tag.function.php on line 1116

Notice: Undefined index: in /data/phpspider/zhask/libs/function.php on line 180

Warning: array_chunk() expects parameter 1 to be array, null given in /data/phpspider/zhask/libs/function.php on line 181
Ruby on rails Nokogiri使用Ruby On Rails进行抓取,但未按预期工作_Ruby On Rails_Ruby_Web Scraping_Nokogiri - Fatal编程技术网

Ruby on rails Nokogiri使用Ruby On Rails进行抓取,但未按预期工作

Ruby on rails Nokogiri使用Ruby On Rails进行抓取,但未按预期工作,ruby-on-rails,ruby,web-scraping,nokogiri,Ruby On Rails,Ruby,Web Scraping,Nokogiri,我对RubyonRails完全陌生,但我想我可能遗漏了一些明显的东西。我目前正在开发一个网络应用程序,可以搜索拍卖网站。应用程序的骨骼是由其他人创建的。我目前正在尝试添加新的网站抓取,但它们似乎不起作用 我已经阅读了一些Nokogiri文档,检查了刮取的信息是否确实没有写入数据库(当我通过rails控制台检查时,目标的种子URL已经被删除),并使用chrome extension CSS Selector Tester检查了我是否针对正确的CSS选择器。当我通过rails控制台检查时,记录ID是

我对 Ruby on Rails 完全陌生,所以我想我可能遗漏了一些显而易见的东西。我目前正在开发一个可以搜索拍卖网站的网络应用程序。应用程序的骨架是由其他人创建的。我目前正在尝试添加对新网站的抓取,但它们似乎不起作用。

我已经阅读了一些Nokogiri文档,检查了刮取的信息是否确实没有写入数据库(当我通过rails控制台检查时,目标的种子URL已经被删除),并使用chrome extension CSS Selector Tester检查了我是否针对正确的CSS选择器。当我通过rails控制台检查时,记录ID是正确的

我已经在下面列出了我认为重要的代码部分,但是我可能遗漏了一些我没有意识到重要的东西

我遇到问题的网站是 lot-art.com 和 lot-tissimo.com

任何帮助都将不胜感激

种子URL

# Seed one Source row per supported auction site. Each query template carries
# a {query} placeholder and, for paginated sites, a {page} placeholder that the
# scheduler substitutes before fetching. Rows are created in the listed order.
[
  ["Auction.fr", "https://www.auction.fr/_en/lot/search/?contexte=futures&tri=date_debut%20ASC&query={query}&page={page}"],
  ["Invaluable.co.uk", "https://www.invaluable.co.uk/search/api/search-results?keyword={query}&size=1000"],
  ["Interencheres.com", "http://www.interencheres.com/en/recherche/lot?search%5Bkeyword%5D={query}&page={page}"],
  ["Gazette-drouot.com", "http://catalogue.gazette-drouot.com/html/g/recherche.jsp?numPage={page}&filterDate=1&query={query}&npp=100"],
  ["Lot-art.com", "http://www.lot-art.com/auction-search/?form_id=lot_search_form&page=1&mq=&q={query}&ord=recent"],
  ["Lot-tissimo.com", "https://lot-tissimo.com/en/cmd=s&lwr=&ww={query}&xw=&srt=SN&wg=EUR&page={page}"]
].each do |site_name, template|
  Source.create(name: site_name, query_template: template)
end
调度程序代码

require 'rufus-scheduler'

require 'nokogiri'
require 'mechanize'
require 'open-uri'
require "net/https"


# Scheduler instance on which the interval and cron jobs in this file are registered.
s = Rufus::Scheduler.singleton


# Every minute: for each list, run every configured action (one per auction
# source), build an item for each scraped result and try to save it, then send
# one Pushover notification per list for which at least one item was saved.
#
# Sources are dispatched on their database id (see the `case` below); an id
# without a matching branch is silently skipped.
s.interval '1m' do
  setting = Setting.find(1)
  agent = Mechanize.new
  agent.user_agent_alias = 'Windows Chrome'

  # Reuse cookies from the previous run. On a fresh checkout the file does not
  # exist yet and loading it would raise, so only load it when present.
  cookies_path = File.join(Rails.root, 'tmp/cookies.yaml')
  agent.cookie_jar.load(cookies_path) if File.exist?(cookies_path)

  List.all.each do |list|
    number_of_new_items = 0

    # Saves the item with ActiveRecord logging silenced and counts it when
    # `save` succeeds. Presumably model validations reject items that are
    # already stored -- verify against the Item model.
    save_item = lambda do |item|
      ActiveRecord::Base.logger.silence do
        number_of_new_items += 1 if item.save
      end
    end

    list.actions.each do |action|
      begin
        # NOTE(review): the query is inserted verbatim; queries containing
        # spaces or non-ASCII characters may need URL-encoding -- confirm.
        url = action.source.query_template.gsub('{query}', action.list.query)

        case action.source.id
        when 1 # Auction.fr
          20.downto(1) do |page|
            doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))

            doc.css("div.list-products > ul > li").reverse.each do |item_data|
              # Price is only present once a lot has been sold.
              price_node = item_data.at_css("h3.h4.adjucation.ft-blue")
              price_match = price_node && /Selling price : ([\d\s]+) €/.match(price_node.text)
              price = price_match ? price_match[1].gsub(" ", "") : 0

              save_item.call(
                action.items.new(
                  title: item_data.at_css("h2").text.strip,
                  url: item_data.at_css("h2 a")["href"],
                  picture: item_data.at_css("div.image-wrap.lazy div.image img")["src"],
                  price: price,
                  currency: "€"
                )
              )
            end
          end

        when 97 # Lot-Tissimo.com
          5.downto(1) do |page|
            doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))

            doc.css("#inhalt > .objektliste").reverse.each do |item_data|
              # Price extraction is not implemented for this source yet. The
              # variable must still be defined, otherwise building the item
              # raises NameError and the whole job aborts.
              price = 0

              save_item.call(
                action.items.new(
                  title: item_data.at_css("div.objli-desc").text.strip,
                  url: item_data.at_css("td.objektliste-foto a")["href"],
                  picture: item_data.at_css("td.objektliste-foto a#lot_link img")["src"],
                  price: price,
                  currency: "€"
                )
              )
            end
          end

        when 2 # Invaluable.co.uk (JSON API, single request)
          doc = JSON.parse(open(url).read)

          doc["itemViewList"].reverse.each do |item_data|
            view = item_data["itemView"]
            photos = view["photos"]
            Rails.logger.debug(photos.inspect)

            save_item.call(
              action.items.new(
                title: view["title"],
                url: "https://www.invaluable.co.uk/buy-now/#{view["title"].parameterize}-#{view["ref"]}",
                # `present?` also covers an empty photo array, not just nil.
                picture: photos.present? ? photos.first["_links"]["medium"]["href"] : nil,
                price: view["price"],
                currency: view["currencySymbol"]
              )
            )
          end

        when 3 # Interencheres.com
          5.downto(1) do |page|
            doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))

            doc.css("div#lots_0 div.ligne_vente").reverse.each do |item_data|
              link = item_data.at_css("div.ph_vente div.des_vente p a")

              save_item.call(
                action.items.new(
                  title: link.text.strip,
                  url: "http://www.interencheres.com" + link["href"],
                  picture: item_data.at_css("div.ph_vente div.gd_ph_vente img")["src"],
                  price: 0,
                  currency: "€"
                )
              )
            end
          end

        when 4 # Gazette-drouot.com (fetched through Mechanize)
          5.downto(1) do |page|
            doc = agent.get(url.gsub('{page}', page.to_s))

            doc.css("div#recherche_resultats div.lot_recherche").reverse.each do |item_data|
              thumb = item_data.at_css("img.image_thumb_recherche")

              save_item.call(
                action.items.new(
                  title: item_data.at_css("#des_recherche").text.strip.truncate(140),
                  url: "http://catalogue.gazette-drouot.com/html/g/" + item_data.at_css("a.lien_under")["href"],
                  picture: thumb ? thumb["src"] : nil,
                  price: 0,
                  currency: "€"
                )
              )
            end
          end

        when 69 # Lot-art.com (fetched through Mechanize, single page)
          doc = agent.get(url)

          doc.css("div.lot_list_holder").reverse.each do |item_data|
            # `at_css` returns a single node, so indexing it with [0] is an
            # attribute lookup, not "first element". Title and href are both
            # read from the anchor inside the lot body.
            # NOTE(review): selector taken from the original code -- confirm
            # against the live markup.
            link = item_data.at_css("div.lot_list_body a")

            save_item.call(
              action.items.new(
                title: link.text.strip.truncate(140),
                url: link["href"],
                picture: item_data.at_css("a.lot_list_thumb img")["src"],
                price: 0,
                currency: "€"
              )
            )
          end
        end
      rescue StandardError => e
        # A broken selector or an unreachable site must not stop the remaining
        # sources and lists from being processed; record the failure instead.
        Rails.logger.error(
          "Scraping #{action.source.name} for list #{list.id} failed: #{e.class}: #{e.message}"
        )
      end
    end

    # NOTE(review): nothing in this job increments notifications_this_hour, so
    # the hourly limit only works if the counter is bumped elsewhere -- confirm.
    if number_of_new_items > 0 && setting.notifications_per_hour > setting.notifications_this_hour && setting.pushover_app_token.present? && setting.pushover_user_key.present?
      pushover_uri = URI.parse("https://api.pushover.net/1/messages.json")
      req = Net::HTTP::Post.new(pushover_uri.path)
      req.set_form_data({
                          :token => setting.pushover_app_token,
                          :user => setting.pushover_user_key,
                          :message => "#{number_of_new_items} new items on #{list.name}!",
                          :url_title => "Check the list",
                          :url => "http://spottheauction.com/lists/#{list.id}"
      })
      res = Net::HTTP.new(pushover_uri.host, pushover_uri.port)
      res.use_ssl = true
      res.verify_mode = OpenSSL::SSL::VERIFY_PEER
      res.start { |http| http.request(req) }
    end
  end

  agent.cookie_jar.save(cookies_path)
end

# At minute 0 of every hour, set the notification counter on Setting #1 back
# to zero and persist it.
s.cron '0 * * * *' do
  Setting.find(1).tap { |hourly| hourly.notifications_this_hour = 0 }.save
end

new 只初始化实例,不保存实例。你真的在某处调用了 save 吗?

您有两个选择:

对项目调用
save

# Build the item in memory first, then persist it with an explicit save.
item = action.items.new(
  # ...
)
item.save
或者使用
create
而不是
new

# Build and persist the item in a single call.
item = action.items.create(
  # ...
)

万一有人也遇到这个问题:我已经让 lot-art.com 的抓取正常工作了。问题似乎在于我的CSS选择器不够具体,导致Nokogiri无法提取到正确的数据


我在 lot-tissimo 上仍然有持续的问题,不过这似乎是由其他原因引起的,因为其他抓取工具(例如 Scrapinghub 的 Portia spider)也有同样的问题。

抱歉,这些部分是为了便于阅读而编辑掉的。我已经编辑了原始代码片段,将所有内容都包含在scheduler.rb文件中。(感谢您的关注)您是否检查过(例如通过记录响应)这些站点是否实际返回HTML内容?在没有正确的用户代理设置的情况下调用页面时,可能会出现服务器错误或错误页面。您是否检查了选择器是否确实提取了预期的内容?您是否检查了新项目的数量是否发生了变化?我不知道如何记录响应。我已经使用 tail -f log/production.log 来查看运行日志,但是我没有看到任何能表明它是否返回了抓取到的HTML内容的信息。根据Chrome扩展 CSS Selector Tester,CSS选择器似乎是正确的。我不知道如何检查新项目的数量是否在变化,但是,应用程序应该给出一个数字,显示抓取到了多少新项目,而我没有收到任何东西。与此相关的一点是,原本正常工作的网站抓取器在我尝试使用新的抓取器时也不起作用了。不要同时执行太多步骤(例如
Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
同时在构建URL、下载页面并初始化Nokogiri;或者
doc.css("...").reverse.each
同时在解析HTML、更改顺序并迭代结果)。相反,将中间步骤分配给变量并将其记录到日志文件(
Rails.logger.debug "..."
)中,或者在响应意外时引发异常(
raise "No HTML" if open(...).blank?
)。或者记录
Rails.logger.debug "New items: #{number_of_new_items}"