Python 3.x 将Scrapy响应URL传递给Selenium,然后将Selenium响应返回给Scrapy

Python 3.x 将Scrapy响应URL传递给Selenium,然后将Selenium响应返回给Scrapy。标签:python-3.x、selenium、selenium-webdriver、scrapy、scrapy-spider。

如何将Scrapy响应url传递给Selenium,然后再将Selenium响应传递回Scrapy

首先,我有这个 Scrapy 爬虫 first.py:

# -*- coding: utf-8 -*-
import scrapy
import re
import json


class FirstSpider(scrapy.Spider):
    """Crawl a car-sales search-results page, follow each car's detail
    link, and yield one item per car (the page's ``utag_data`` JSON plus
    the seller's phone number).

    NOTE(review): ``parse_quotations`` appears truncated in this excerpt --
    only the import of the Selenium module ``filldata2`` is visible.
    """
    name = "first"
    allowed_domains = ["someautosite.co.uk"]
    # Single entry point: a pre-filtered search-results page.
    start_urls = (
        'http://www.someautosite.co.uk/some_specific_search_results',
    )

    def parse(self, response):
        """Extract every car detail-page href from the results page and
        schedule a request per car, handled by :meth:`parse_car`."""
        for car_url in response.xpath('//article[contains(@class, "standard")]/div/div[2]/div[1]/h1/a/@href').extract():
            # hrefs may be relative; resolve against the current page URL.
            absoluteurl = response.urljoin(car_url)
            # yield {'URL': absoluteurl}
            yield scrapy.Request(absoluteurl, callback=self.parse_car)

    def parse_car(self, response):
        """Scrape one car detail page: parse the inline ``var utag_data``
        JS object, merge in the telephone number, yield the combined dict,
        then follow the insurance-quotations link."""
        # Capture the JSON object literal assigned to `var utag_data` in a
        # <script> tag; DOTALL lets `.` span newlines inside the object.
        pattern = re.compile(r"var utag_data = ({.*?});", re.MULTILINE | re.DOTALL)
        # NOTE(review): [0] raises IndexError when the script is absent.
        utag_data = response.xpath('//script[contains(.,"var utag")]/text()').re(pattern)[0]
        utag_data_obj = json.loads(utag_data)
        # make = utag_data_obj['make']
        # model = utag_data_obj['model']
        # yield {'Make':utag_data_obj['make'],
        #        'model':utag_data_obj['model'],
        #        }
        # yield utag_data
        # Seller phone number, marked up with schema.org itemprop.
        tel = response.xpath('//article/div[3]/section/div/div[@itemprop="telephone"]/text()').extract_first()
        # tel_json_str = '{"tel":"' + str(tel) + '"}'
        # tel_json_obj = json.loads(tel_json_str)
        # Combine 2 JSON objects into one:
        car_json = utag_data_obj.copy()
        car_json.update({"tel": tel})
        yield car_json
        # Link to the partner insurance-quotation site (JS-heavy page).
        quotations_url = response.xpath('/html/body/article/section/ul/li[2]/a/@href').extract_first()
        yield scrapy.Request(quotations_url, callback=self.parse_quotations)

    def parse_quotations(self, response):  # parse insurance quotation website link with selenium
        # Hands off to the Selenium module; body truncated in this excerpt.
        import filldata2
然后我有一个 Selenium 模块 filldata2.py,它试图为汽车获取保险报价,汽车的 URL 链接是在上面 Scrapy 爬虫代码的 parse_car 方法中提取的。

现在,Selenium 模块的开头如下:

from selenium import webdriver

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait  # available since 2.4.0
from selenium.webdriver.support import expected_conditions as EC  # available since 2.26.0
from selenium.webdriver.common.keys import Keys
import time
import six
from six.moves.configparser import SafeConfigParser

# Hard-coded test data used to fill in the insurance quotation form.
regno = 'AA00AAA'  # vehicle registration number typed into the form
mile = '15000'     # annual mileage (kept as a string for send_keys)
firstname = 'John'
lastname = 'Smith'
# (further constants elided from the question)
[...]

def yesno(idul):
    """Return the XPath expression selecting the element whose id is *idul*."""
    return '//*[@id="{}"]'.format(idul)


def findid(idul):
    """Locate and return the page element with id *idul* via the global driver."""
    element = driver.find_element_by_id(idul)
    return element


def clickyes(idul):
    """Click the "Yes" option span inside the widget whose id is *idul*."""
    xpath = '//*[@id="{}"]//span[contains(text(), "Yes")]'.format(idul)
    return driver.find_element_by_xpath(xpath).click()


def clickno(idul):
    """Click the "No" option span inside the widget whose id is *idul*."""
    xpath = '//*[@id="{}"]//span[contains(text(), "No")]'.format(idul)
    return driver.find_element_by_xpath(xpath).click()


def clickspan(idul):
    """Click the first <span> inside the element whose id is *idul*."""
    target = '//*[@id="{}"]//span[1]'.format(idul)
    driver.find_element_by_xpath(target).click()


class DivSelect(object):
    """Click helper for <div>-based option widgets identified by a CSS id.

    Parameters
    ----------
    idul : str
        The id of the container element.
    divtext : str
        Either the visible text of the option div (length >= 2, not all
        digits), or a 1-based positional index given as a digit string.
    """

    def __init__(self, idul, divtext):
        self.idul = idul
        self.divtext = divtext
        # exemplu: '//div[contains(text(), "Right Hand")]'
        # self.divulxpath = '//div[contains(text(), "{}")]'.format(self.divtext)
        self.idxpath = '//*[@id="{}"]'.format(self.idul)

    def findid(self):
        """Return the container element itself.

        BUG FIX: the original returned the *source string*
        'driver.find_element_by_id(...)' instead of calling the driver.
        """
        return driver.find_element_by_id(self.idul)

    @property
    def clicky(self):  # only works for selecting divs
        """Click the option div, selected either by visible text or by
        positional index, depending on the shape of ``divtext``."""
        if len(str(self.divtext)) >= 2 and not self.divtext.isdigit():
            # Select by (sufficiently long, non-numeric) visible text.
            arg = '{}//div[contains(text(), "{}")]'.format(self.idxpath, self.divtext)
        else:
            # Select by 1-based positional index.
            arg = '{}//div[{}]/label/div'.format(self.idxpath, self.divtext)
            print('driver.find_element_by_xpath("{}").click()'.format(arg))
        driver.find_element_by_xpath(arg).click()


def printval(cee, cssid):
    """Read the current value of the form field *cssid*, re-fill it, and
    print a summary line labelled *cee*.

    If the field is empty, the operator is prompted on stdin for a value
    ("Care e valoarea masinii" = "What is the value of the car").
    """
    def getval():
        # Read the field's current value attribute.
        val = driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).get_attribute('value')
        if not val:
            val = input('Care e valoarea masinii:\n')
        driver.find_element_by_xpath('//*[@id="{}"]'.format(cssid)).click()
        fillin(cssid, val)
        time.sleep(2)
        return val

    valoare = getval()
    # BUG FIX: the original used `print(...) if valoare else '<msg>'`,
    # which evaluated the "not found" message string and silently
    # discarded it -- it was never printed.
    # BUG FIX: the 'pret' ("price") check compared the field *value*
    # instead of the label `cee`; the call site is printval('Pret', ...).
    if valoare:
        if cee.lower() == 'pret':
            print('{} estimat este : £ {} '.format(cee, valoare))
        else:
            print('{} estimat/a/e este : {} '.format(cee, valoare))
    else:
        print('Nu era nici un {}({}) estimat'.format(cee, cssid))


def clickbutton(cssid):
    """Click the (button) element whose id is *cssid*."""
    selector = '//*[@id="{}"]'.format(cssid)
    driver.find_element_by_xpath(selector).click()


def fillin(cssid, var):
    """Type *var* into the input whose id is *cssid*."""
    field = driver.find_element_by_id(str(cssid))
    return field.send_keys(var)


def fillinsugestionbox(cssid, var):
    """Type *var* into the suggestion box *cssid*, then press Enter to
    accept the highlighted suggestion."""
    box = driver.find_element_by_id(str(cssid))
    box.send_keys(var)
    return driver.find_element_by_xpath('//*[@id=\"{0}\"]'.format(cssid)).send_keys(Keys.RETURN)


# ============================================
# 1. Vehicle lookup
# ============================================
# NOTE(review): `Yesno` (capital Y) is used throughout but only the
# function `yesno` is defined above -- presumably a helper class is
# missing from this excerpt; confirm before running.
# 1.1 Do you know the registration number? -> Yes
knowsRegistrationNumber = Yesno('knows-registration-number').clickyes

# 1.2 Then please enter it here to get started:
registrationNumber = driver.find_element_by_id('registration-number')
registrationNumber.send_keys(regno)

# 1.3 Find your vehicle find-vehicle-by-reg
findVehicleByReg = driver.find_element_by_id('find-vehicle-by-reg')
findVehicleByReg.click()
time.sleep(1)

# TODO : if no other variants
# 1.3.1 multiple-vehicles-section : a select list with more options
# multipleVehiclesSection = driver.find_element_by_id('multiple-vehicles-section')
# multipleVehiclesSection.click()
#     possible-vehicles : the select list id
# If the registration matches several vehicles, pick the first real
# option from the drop-down (option[0] is the placeholder).
try:
    element = WebDriverWait(driver, 10).until(EC.presence_of_element_located((By.ID, "possible-vehicles")))
    possibleVehicles = driver.find_element_by_id('possible-vehicles')
    possibleVehicles.click()
    # "am asteptat destul" = "waited long enough" (Romanian)
    print('am asteptat destul')
    dropdown = possibleVehicles.find_elements_by_tag_name('option')
    # "Am selectat ... dintre urmatoarele" = "Selected ... among the following"
    print('Am selectat :\n      {} \n dintre urmatoarele:'.format(dropdown[1].text))
    for option in dropdown[1:]:
        print(option.text)

    if dropdown:
        dropdown[1].click()
# NOTE(review): bare `except:` swallows every error (even KeyboardInterrupt);
# narrowing to selenium's TimeoutException would be safer.
except:
    # "Elementul ... nu e prezent" = "The element ... is not present"
    print('Elementul possible-vehicles nu e prezent')
# finally:

time.sleep(2)

# //*[@id="has-tracker"] Yes/No
hasTracker = Yesno('has-tracker').clickno

# //*[@id="imported"] Yes/No
imported = Yesno('imported').clickno

# //*[@id="steering"] - 2 Divs
# Choose from options :
# Left Hand or # Right Hand
steering = DivSelect('steering', 'Right Hand').clicky

# TODO: decide what to do when no price is set -- supply one yourself
# //*[@id="current-value"] - read the value
# driver.find_element_by_xpath('//*[@id="current-value"]')

# "Pret" = "Price" (Romanian); prints/refills the estimated value field.
printval('Pret', 'current-value')
# print('Pretul estimat este : £ {} '.format(currentValue)) if  currentValue else 'Nu era nici un pret estimat'

# "scaune" = "seats"
printval('scaune', 'numberOfSeats-dropdown')

# //*[@id="has-modifications"]
hasModifications = Yesno('has-modifications').clickno

# click next button
# //*[@id="vehicle-lookup-next"]
clickbutton('vehicle-lookup-next')
time.sleep(1)
# ============================================
# 2. Vehicle usage                           |
# ============================================
# 2.1 When did you buy the car?
# //*[@id="vehicle-usage"]//span[1]
vehicleUsage = Yesno('vehicle-usage').clickspan  # I haven't bought this car yet

# 2.2 What do you use your car for?
# //*[@id="use-of-vehicle"]/ol/li[2]/div[2]/label/div/div[2]
# //*[@id="use-of-vehicle"]//div[2]
useOfVehicle = DivSelect('use-of-vehicle', '2').clicky  # Social, Domestic, Pleasure and Commuting (SDPC)

# 2.3 What would you say your annual personal mileage is?
# //*[@id="annual-mileage"]
annualMileage = driver.find_element_by_id('annual-mileage')
annualMileage.send_keys(mile)
[...much more...]
...
...

# NOTE(review): `email` is not defined in this excerpt -- presumably one
# of the constants elided above; confirm.
fillin('email', email)
# Main telephone number
# Let the insurance providers answer your queries
# Let us keep you up to date
# //*[@id="communication-options"]/ol/li[2]/div[4]/label/div/div[2]
DivSelect('communication-options', 'Post').clicky
# Please tick this box to confirm you have read and understood our website Terms and Conditions, \
#   any assumptions we may have made and Your Rewards Terms and Conditions. \
#   If you do not understand any items within this document please contact us.
# //*[@id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span
# Yesno('contact-details').clickspan - does not work ("nu merge")
driver.find_element_by_xpath('//*[@id="contact-details"]/div/ol/li[6]/ol/li[2]/div[2]/label/span').click()
# //*[@id="contact-details-next"]
clickbutton('contact-details-next')
driver.implicitly_wait(10)
# Wait for the quotes table, then for its loading bar to disappear.
# NOTE(review): bare `except:` clauses below swallow every error;
# narrowing to selenium's TimeoutException would be safer.
try:
    element = WebDriverWait(driver, 10).until(EC.visibility_of_element_located((By.ID, "quotes")))
    print('element = ', element)
    try:
        """ 
        wait for loading bar to go away:
        """
        element2 = WebDriverWait(driver, 60).until(EC.invisibility_of_element_located((By.XPATH, '//*[@id="quotes-loading-container"]/div/div[1]')))
        print('element2 = ', element2)
    except:
        # "the loading bar is still active; the more-details button with
        # quotes is not visible" (Romanian)
        print('bara de loading inca este activa. butonul more details cu cotatii nu e vizibil')
except:
    # "the quotes table is not visible" (Romanian)
    print('tabelul cu cotatii nu e vizibil')

# Dump the rendered quotes markup to a per-registration HTML file.
# NOTE(review): a `with open(...)` block would guarantee the file closes.
source_code = driver.find_element_by_id('quotes').get_attribute('innerHTML')
# element.get_attribute('innerHTML')
f = open('C:\\Users\\ZZZ\\PycharmProjects\\selenscrapy\\'+str(regno)+'.html', 'wb')
f.write(source_code.encode('utf-8'))
f.close()
我知道代码很乱。我是一名 Python 初学者,我正在使用这段代码从一个汽车销售网站上抓取一些汽车信息,并尝试从另一个网站上获取这些汽车的保险报价。外部保险报价网站的链接(充满 JavaScript,这就是为什么我需要 Selenium WebDriver)是来自汽车销售网站的重定向链接,因为这两个网站相互协作。 现在,正如我前面所说的,这个报价 URL 需要由 Selenium 解析,我想将这部分逻辑保存在单独的模块文件中,甚至可能是两个单独的文件,一个带有配置,另一个带有要执行的操作

如何将 Scrapy FirstSpider 的 parse_quotations() 方法获得的保险报价 URL 传递给 Selenium 模块,并在 FirstSpider 的 parse_quotations() 方法中将 Selenium 脚本的响应(即上面第二个模块中的 source_code)返回给 Scrapy


谢谢大家!

您是否可以不在 first.py 中对 quotations_url 生成请求,而是直接创建 Selenium WebDriver 并在 WebDriver 中继续抓取

def parse_car(self, response):
    """Answer sketch: instead of yielding a scrapy.Request for the
    quotations URL, open it directly in a Selenium webdriver from
    within parse_car and yield the item from here."""
    ...
    quotations_url = response.xpath('/html/body/article/section/ul/li[2]/a/@href').extract_first()
    # Start to work in a webdriver
    browser = webdriver.Chrome()
    browser.get(quotations_url)
    # ... do whatever you want in the webdriver ...
    # yield your item

我还想知道是否可以将scrapy的响应直接传递给SeleniumWebDriver以继续抓取。我的情况是,我的爬行器偶尔会点击一个用户验证页面,这需要手动干预来解决一些Capcha问题,以便爬行器可以继续工作。