Scrapy、Python + Selenium:从页面收集到的信息有误
大家好,我需要你们的帮助!我正在尝试抓取一个页面。页面上有一个下拉菜单,切换其中的选项会改变页面上显示的信息。我使用 Selenium 来切换选项:参照找到的一个示例,依次遍历下拉菜单中的各个选项,并在页面更新后收集信息。但在输出文件中我得到了 4 个相同的值,就好像选项根本没有切换一样。我亲眼看到 Selenium 确实正确地遍历了各个选项,所以不知道自己哪里做错了。(标签:python、selenium、drop-down-menu、scrapy)这是我的代码:
from colombo.items import ColomboItem
from scrapy.contrib.spiders import CrawlSpider
from selenium import webdriver
from scrapy.selector import Selector
from selenium.webdriver.support.ui import Select
import time
class ColomboSpider(CrawlSpider):
    """Spider that loads a product page in Firefox and walks its option dropdown.

    For every ``<option>`` of the ``jshop_attr_id13`` dropdown the browser
    selects that option, after which one item (price and name) is collected
    for each product form matched in the Scrapy response.
    """

    name = 'ColomboSpider'
    allowed_domains = ["http://colombo.in.ua"]
    start_urls = [
        "http://colombo.in.ua/colombo-design/ruchka-colombo-gira-jm11.html",
    ]

    def __init__(self):
        """Initialise the base spider, then start a Firefox WebDriver."""
        CrawlSpider.__init__(self)
        # Any other WebDriver browser could be substituted here.
        self.browser = webdriver.Firefox()

    def __del__(self):
        """Call ``close()`` on the browser when the spider object is destroyed."""
        self.browser.close()

    def parse(self, response):
        """Select each dropdown option in the browser and collect items.

        Args:
            response: the Scrapy response for the product page; its URL is
                opened in the browser and its body is queried for data.

        Returns:
            An iterator over the collected ``ColomboItem`` objects.
        """
        driver = self.browser
        driver.get(response.url)
        # Give the page's JavaScript time to finish loading.
        time.sleep(3)

        # Snapshot the ``value`` attribute of every option up front.
        option_nodes = driver.find_element_by_id(
            "jshop_attr_id13").find_elements_by_tag_name('option')
        option_values = [node.get_attribute("value") for node in option_nodes]

        # NOTE: this selector wraps the Scrapy response passed in, so the
        # XPath queries below read that response rather than the page state
        # currently shown in the Selenium browser.
        page = Selector(response)
        collected = []
        for value in option_values:
            # Look the dropdown up again on each pass and pick the option.
            Select(driver.find_element_by_id(
                "jshop_attr_id13")).select_by_value(value)
            time.sleep(2)
            for form in page.xpath('.//div[@class="jshop productfull"]/form[1]'):
                entry = ColomboItem()
                entry['price'] = form.xpath(
                    './/span[@id="block_price"]/text()').extract()
                entry['name'] = form.xpath('.//h1/text()').extract()
                collected.append(entry)
        return iter(collected)
输出文件中的内容:
1929.61 грн Ручка Colombo GIRA JM11
1929.61 грн Ручка Colombo GIRA JM11
1929.61 грн Ручка Colombo GIRA JM11
1929.61 грн Ручка Colombo GIRA JM11
但是,正确的输出应该是:
1929.61 грн Ручка Colombo GIRA JM11
2275.21 грн Ручка Colombo GIRA JM11
2456.66 грн Ручка Colombo GIRA JM11
2966.42 грн Ручка Colombo GIRA JM11
感谢您的回答。我跳过了文本部分,但下面这段代码可以正确获取金额部分:
import unittest
from selenium import webdriver
import datetime
import os
from selenium.webdriver.firefox.firefox_profile import FirefoxProfile
from random import randint
import time
from selenium.webdriver.common.action_chains import ActionChains
from selenium.webdriver.common.keys import Keys
from sshtunnel import SSHTunnelForwarder
import MySQLdb
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
import autoit
class SprintTests(unittest.TestCase):
    """Print the price shown on the product page for each dropdown option."""

    def setUp(self):
        """Open the product page in a maximised Firefox window."""
        browser = webdriver.Firefox()
        self.driver = browser
        browser.get("http://colombo.in.ua/colombo-design/ruchka-colombo-gira-jm11.html")
        browser.implicitly_wait(30)
        browser.maximize_window()

    def test_input(self):
        """Print the current price, then step the dropdown to the next option.

        One pass is made per ``<option>`` element of the dropdown. The
        option elements themselves are not used; the selection is advanced
        by sending DOWN and ENTER key presses through AutoIt.
        """
        browser = self.driver
        option_nodes = browser.find_element_by_id(
            "jshop_attr_id13").find_elements_by_tag_name('option')
        for _ in option_nodes:
            # Read the price before the selection is changed.
            price_text = browser.find_element_by_xpath('//*[@id="block_price"]').text
            # Click the dropdown so the key presses below are sent to it.
            browser.find_element_by_xpath('//*[@id="jshop_attr_id13"]').click()
            print(price_text)
            autoit.send("{DOWN}{ENTER}")
            # Pause before the next pass reads the price again.
            time.sleep(2)

    def tearDown(self):
        """Quit the browser after each test."""
        self.driver.quit()


if __name__ == '__main__':
    unittest.main(verbosity=2)
输出为:
1929.61 грн
2275.21 грн
2456.66 грн
2966.42 грн
很抱歉,现在回答:
from colombo.items import ColomboItem
from scrapy.contrib.spiders import CrawlSpider
from selenium import webdriver
from selenium.webdriver.support.ui import Select
import time
class ColomboSpider(CrawlSpider):
    """Spider that reads per-option prices from a product page via Selenium.

    The page is opened in Firefox. When the ``jshop_attr_id13`` dropdown is
    present, each of its options is selected in turn and one item (price,
    name and option label as ``color``) is read from the live browser DOM.
    When the dropdown is absent, a single item (price and name) is returned.
    """

    name = 'ColomboSpider'
    # Scrapy expects bare domain names here, not URLs with a scheme.
    allowed_domains = ["colombo.in.ua"]
    # Left empty in the original snippet; add the product page URLs to crawl.
    start_urls = [
    ]

    _DROPDOWN_ID = "jshop_attr_id13"
    _PRICE_XPATH = '//*[@id="block_price"]'
    _NAME_XPATH = './/h1'
    # Fixed pauses (seconds) that give the page's JavaScript time to run.
    _PAGE_LOAD_WAIT = 3
    _OPTION_CHANGE_WAIT = 2

    def __init__(self, *args, **kwargs):
        """Initialise the base spider, then start a Firefox WebDriver.

        Positional and keyword arguments are forwarded to the base class so
        that spider arguments supplied by Scrapy are not rejected.
        """
        super(ColomboSpider, self).__init__(*args, **kwargs)
        self.driver = webdriver.Firefox()

    def closed(self, reason):
        """Shut the browser down when Scrapy closes the spider."""
        self._quit_driver()

    def __del__(self):
        """Fallback cleanup in case ``closed`` was never invoked."""
        self._quit_driver()

    def _quit_driver(self):
        """Quit the WebDriver session once; later calls are no-ops."""
        driver = getattr(self, 'driver', None)
        if driver is not None:
            self.driver = None
            # quit() ends the whole session; close() only closes a window.
            driver.quit()

    def _build_item(self, color=None):
        """Read price and name from the browser's current page into an item.

        Args:
            color: label of the selected option, or ``None`` to leave the
                ``color`` field unset.

        Returns:
            A populated ``ColomboItem``.
        """
        item = ColomboItem()
        item['price'] = self.driver.find_element_by_xpath(self._PRICE_XPATH).text
        item['name'] = self.driver.find_element_by_xpath(self._NAME_XPATH).text
        if color is not None:
            item['color'] = color
        return item

    def parse(self, response):
        """Collect one item per dropdown option, or one item if none exists.

        Args:
            response: the Scrapy response whose URL is opened in the browser.

        Returns:
            An iterator over the collected ``ColomboItem`` objects.
        """
        from selenium.common.exceptions import NoSuchElementException

        self.driver.get(response.url)
        # Let the page's JavaScript load.
        time.sleep(self._PAGE_LOAD_WAIT)

        try:
            dropdown = self.driver.find_element_by_id(self._DROPDOWN_ID)
        except NoSuchElementException:
            # Page without the option dropdown: report the single price shown.
            return iter([self._build_item()])

        # Snapshot (value, label) pairs as plain strings before any selection
        # is made, so the option elements need not be looked up again later.
        choices = [
            (option.get_attribute("value"), option.get_attribute("innerHTML"))
            for option in dropdown.find_elements_by_tag_name('option')
        ]

        items = []
        for value, label in choices:
            # Look the dropdown up again on each pass and pick the option.
            Select(self.driver.find_element_by_id(
                self._DROPDOWN_ID)).select_by_value(value)
            # Wait for the page to refresh the displayed price.
            time.sleep(self._OPTION_CHANGE_WAIT)
            items.append(self._build_item(color=label))
        return iter(items)
输出为
Chromo - Хром 3333.53 грн Ручка Colombo DEA FF21 (Код: FF21)
Chromat - Матовый хром 3817.33 грн Ручка Colombo DEA FF21 (Код: FF21)
抱歉,Ubuntu 上没有 AutoIt(即前一段代码中使用的 autoit 模块)。