Ruby on Rails:使用 Nokogiri 进行网页抓取,但未按预期工作
我对 Ruby on Rails 完全陌生,所以我想我可能遗漏了一些显而易见的东西。我目前正在开发一个可以搜索拍卖网站的 Web 应用程序。应用程序的骨架是由其他人创建的。我目前正在尝试添加新的网站抓取器,但它们似乎不起作用。
我已经阅读了一些 Nokogiri 文档,确认了抓取到的信息确实没有写入数据库(当我通过 rails 控制台检查时,目标的种子 URL 已经被删除),并使用 Chrome 扩展 CSS Selector Tester 检查了我是否使用了正确的 CSS 选择器。当我通过 rails 控制台检查时,记录 ID 是正确的。
我已经在下面列出了我认为重要的代码部分,但是我可能遗漏了一些我没有意识到重要的东西。
我遇到问题的网站是 Lot-art.com 和 Lot-tissimo.com。任何帮助都将不胜感激。
种子 URL
(页面元数据)Ruby on Rails:使用 Nokogiri 进行网页抓取,但未按预期工作;标签:ruby-on-rails、ruby、web-scraping、nokogiri(Ruby On Rails、Ruby、Web Scraping、Nokogiri)。摘要:我对 Ruby on Rails 完全陌生,所以我想我可能遗漏了一些显而易见的东西。我目前正在开发一个可以搜索拍卖网站的 Web 应用程序。应用程序的骨架是由其他人创建的。我目前正在尝试添加新的网站抓取器,但它们似乎不起作用。我已经阅读了一些 Nokogiri 文档,确认了抓取到的信息确实没有写入数据库(当我通过 rails 控制台检查时,目标的种子 URL 已经被删除),并使用 Chrome 扩展 CSS Selector Tester 检查了我是否使用了正确的 CSS 选择器。当我通过 rails 控制台检查时,记录 ID 是……
# Seed data: one Source row per auction site, created in the order listed.
# Each template carries a {query} placeholder; most also carry {page}.
[
  ["Auction.fr", "https://www.auction.fr/_en/lot/search/?contexte=futures&tri=date_debut%20ASC&query={query}&page={page}"],
  ["Invaluable.co.uk", "https://www.invaluable.co.uk/search/api/search-results?keyword={query}&size=1000"],
  ["Interencheres.com", "http://www.interencheres.com/en/recherche/lot?search%5Bkeyword%5D={query}&page={page}"],
  ["Gazette-drouot.com", "http://catalogue.gazette-drouot.com/html/g/recherche.jsp?numPage={page}&filterDate=1&query={query}&npp=100"],
  ["Lot-art.com", "http://www.lot-art.com/auction-search/?form_id=lot_search_form&page=1&mq=&q={query}&ord=recent"],
  ["Lot-tissimo.com", "https://lot-tissimo.com/en/cmd=s&lwr=&ww={query}&xw=&srt=SN&wg=EUR&page={page}"]
].each do |site_name, template|
  Source.create(name: site_name, query_template: template)
end
调度程序代码
require 'rufus-scheduler'
require 'nokogiri'
require 'mechanize'
require 'open-uri'
require "net/https"
# Scraping job. Every minute: for each List, run each of its Actions against the
# Action's Source (dispatched on the Source id), store the scraped lots as Items
# and send one Pushover notification per List that gained new Items.
#
# NOTE(review): `open(url)` relies on open-uri patching Kernel#open, which was
# removed in Ruby 3.0 -- switch to `URI.open` when upgrading (confirm the Ruby
# version this app runs on).
# NOTE(review): `action.list.query` is substituted into the URL unescaped --
# presumably it is stored URL-safe; verify against the List model.
# NOTE(review): `notifications_this_hour` is read below but never incremented in
# this block -- confirm it is updated elsewhere, otherwise the hourly limit
# never triggers.
s = Rufus::Scheduler.singleton
s.interval '1m' do
  setting = Setting.find(1)
  cookie_file = File.join(Rails.root, 'tmp/cookies.yaml')

  agent = Mechanize.new
  agent.user_agent_alias = 'Windows Chrome'
  # A missing cookie file (e.g. on the very first run) must not abort the job.
  agent.cookie_jar.load(cookie_file) if File.exist?(cookie_file)

  # Saves an item with ActiveRecord logging silenced and reports whether the
  # save succeeded.
  save_quietly = lambda do |item|
    saved = false
    ActiveRecord::Base.logger.silence { saved = item.save }
    saved
  end

  List.all.each do |list|
    number_of_new_items = 0

    list.actions.each do |action|
      # Each action is isolated: a failing scraper (changed markup, network
      # error, ...) is logged and skipped instead of aborting every other
      # scraper, the notification and the cookie save.
      begin
        url = action.source.query_template.gsub('{query}', action.list.query)

        case action.source.id
        when 1 # Auction.fr
          20.downto(1) do |page|
            doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
            doc.css("div.list-products > ul > li").reverse.each do |item_data|
              # Price stays 0 unless the lot shows a "Selling price" heading.
              price = 0
              price_node = item_data.at_css("h3.h4.adjucation.ft-blue")
              price_match = price_node && /Selling price : ([\d\s]+) €/.match(price_node.text)
              price = price_match[1].gsub(" ", "") if price_match

              item = action.items.new(
                title: item_data.at_css("h2").text.strip,
                url: item_data.at_css("h2 a")["href"],
                picture: item_data.at_css("div.image-wrap.lazy div.image img")["src"],
                price: price,
                currency: "€"
              )
              number_of_new_items += 1 if save_quietly.call(item)
            end
          end
        when 97 # Lot-Tissimo.com
          5.downto(1) do |page|
            doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
            doc.css("#inhalt > .objektliste").reverse.each do |item_data|
              # No price is parsed for this site; it must still be defined,
              # otherwise building the item raises NameError.
              price = 0
              item = action.items.new(
                title: item_data.at_css("div.objli-desc").text.strip,
                url: item_data.at_css("td.objektliste-foto a")["href"],
                picture: item_data.at_css("td.objektliste-foto a#lot_link img")["src"],
                price: price,
                currency: "€"
              )
              number_of_new_items += 1 if save_quietly.call(item)
            end
          end
        when 2 # Invaluable.co.uk
          doc = JSON.parse(open(url).read)
          doc["itemViewList"].reverse.each do |item_data|
            view = item_data["itemView"]
            puts view["photos"]
            # Guard against both a missing and an empty photo list.
            photos = view["photos"]
            first_photo = photos && photos.first
            item = action.items.new(
              title: view["title"],
              url: "https://www.invaluable.co.uk/buy-now/" + view["title"].parameterize + "-" + view["ref"],
              picture: first_photo ? first_photo["_links"]["medium"]["href"] : nil,
              price: view["price"],
              currency: view["currencySymbol"]
            )
            number_of_new_items += 1 if save_quietly.call(item)
          end
        when 3 # Interencheres.com
          5.downto(1) do |page|
            doc = Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))
            doc.css("div#lots_0 div.ligne_vente").reverse.each do |item_data|
              price = 0
              link = item_data.at_css("div.ph_vente div.des_vente p a")
              item = action.items.new(
                title: link.text.strip,
                url: "http://www.interencheres.com" + link["href"],
                picture: item_data.at_css("div.ph_vente div.gd_ph_vente img")["src"],
                price: price,
                currency: "€"
              )
              number_of_new_items += 1 if save_quietly.call(item)
            end
          end
        when 4 # Gazette-drouot.com
          5.downto(1) do |page|
            doc = agent.get(url.gsub('{page}', page.to_s))
            doc.css("div#recherche_resultats div.lot_recherche").reverse.each do |item_data|
              price = 0
              thumbnail = item_data.at_css("img.image_thumb_recherche")
              item = action.items.new(
                title: item_data.at_css("#des_recherche").text.strip.truncate(140),
                url: "http://catalogue.gazette-drouot.com/html/g/" + item_data.at_css("a.lien_under")["href"],
                picture: thumbnail ? thumbnail["src"] : nil,
                price: price,
                currency: "€"
              )
              number_of_new_items += 1 if save_quietly.call(item)
            end
          end
        when 69 # Lot-art.com
          doc = agent.get(url)
          doc.css("div.lot_list_holder").reverse.each do |item_data|
            price = 0
            # at_css already returns a single node, so indexing it with [0] is
            # an attribute lookup that yields nil. The href is read from the
            # same anchor as the title, not from the surrounding div.
            # NOTE(review): confirm this anchor is the lot link on the live page.
            link = item_data.at_css("div.lot_list_body a")
            item = action.items.new(
              title: link.text.strip.truncate(140),
              url: link["href"],
              picture: item_data.at_css("a.lot_list_thumb img")["src"],
              price: price,
              currency: "€"
            )
            number_of_new_items += 1 if save_quietly.call(item)
          end
        end
      rescue StandardError => e
        Rails.logger.error("Scraping failed for action #{action.id} (list #{list.id}): #{e.class}: #{e.message}")
      end
    end

    # Notify once per list, only when something new was stored, the hourly
    # limit is not reached and Pushover credentials are configured.
    if number_of_new_items > 0 && setting.notifications_per_hour > setting.notifications_this_hour && setting.pushover_app_token.present? && setting.pushover_user_key.present?
      pushover_uri = URI.parse("https://api.pushover.net/1/messages.json")
      req = Net::HTTP::Post.new(pushover_uri.path)
      req.set_form_data({
        :token => setting.pushover_app_token,
        :user => setting.pushover_user_key,
        :message => "#{number_of_new_items} new items on #{list.name}!",
        :url_title => "Check the list",
        :url => "http://spottheauction.com/lists/#{list.id}"
      })
      res = Net::HTTP.new(pushover_uri.host, pushover_uri.port)
      res.use_ssl = true
      res.verify_mode = OpenSSL::SSL::VERIFY_PEER
      res.start { |http| http.request(req) }
    end
  end

  agent.cookie_jar.save(cookie_file)
end
# At minute 0 of every hour, reset the notification counter on Setting #1.
s.cron '0 * * * *' do
  Setting.find(1).tap { |hourly| hourly.notifications_this_hour = 0 }.save
end
`new` 只初始化实例,并不保存它。你确定在某处调用了 `save` 吗?
您有两个选择:
对该项目调用 `save`:
# Option 1: `new` only initializes the instance; it is not saved yet.
item = action.items.new(
# ...
)
# Save the instance explicitly.
item.save
或者使用 `create` 而不是 `new`:
# Option 2: call `create` instead of `new`, so no separate `save` call is needed.
item = action.items.create(
# ...
)
以防有人遇到同样的问题:我已经让 lot-art.com 的抓取正常工作了。问题似乎在于我的 CSS 选择器不够具体,Nokogiri 无法提取到正确的数据。
我在 lot-tissimo 上仍然有持续的问题,不过这似乎是其他原因造成的,因为其他抓取工具(例如 Scrapinghub 的 Portia spider)也有同样的问题。
抱歉,这些部分是为了便于阅读而编辑掉的。我已经编辑了原始代码片段,将 scheduler.rb 文件中的所有内容都包含进来。(感谢您的关注)
您是否检查过(例如通过记录响应)这些站点是否实际返回了 HTML 内容?在没有正确设置 User-Agent 的情况下请求页面时,可能会得到服务器错误或错误页面。您是否检查了选择器是否确实提取了预期的内容?您是否检查了新项目的数量是否发生了变化?
我不知道如何记录响应。我已经使用 `tail -f log/production.log` 来查看运行日志,但是没有看到任何能表明是否返回了抓取到的 HTML 内容的信息。根据 Chrome 扩展 CSS Selector Tester,CSS 选择器似乎是正确的。我不知道如何检查新项目的数量是否在变化,不过应用程序应该给出一个数字,显示抓取到了多少新项目,而我没有收到任何东西。与此相关的一点是,原本正常工作的网站抓取器在我尝试加入新的抓取器时也不工作了。
不要在一步中同时做太多事情(例如 `Nokogiri::HTML(open(url.gsub('{page}', page.to_s)))` 同时构建 URL、下载页面并初始化 Nokogiri;或者 `doc.css("...").reverse.each` 同时解析 HTML、更改顺序并迭代结果)。相反,应将中间步骤赋值给变量并将其记录到日志文件中(`Rails.logger.debug "..."`),或者在响应不符合预期时引发异常(`raise "No HTML" if open(...).blank?`)。或者记录 `Rails.logger.debug "New items: #{number_of_new_items}"`。