Python: Trouble running a parser created using Scrapy with Selenium


I've written a scraper in Python Scrapy in combination with Selenium to scrape some titles from a website. The CSS selectors defined within my scraper are flawless. I wish my scraper to keep on clicking on the next page and parse the information embedded in each page. It does the job for the first page, but when it comes to the role Selenium plays, the scraper keeps clicking on the same link over and over again.

As this is my first time working with Selenium alongside Scrapy, I don't have any idea how to make this succeed. Any fix will be highly appreciated.

If I try it like this, it works smoothly (there is nothing wrong with the selectors):
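(A minimal sketch of what that Scrapy-only attempt might look like, assuming the same start URL and selectors; the snippet itself did not survive in this post:)

import scrapy

class TitleSpider(scrapy.Spider):
    # Hypothetical spider name; the point is that the selectors work fine
    # when Scrapy parses the first page on its own
    name = "titlespider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def parse(self, response):
        for item in response.css("h1.faqsno-heading"):
            name = item.css("div[id^='arrowex']::text").extract_first()
            yield {"Name": name}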

But my intention is to get my script going this way:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self,link):
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # This is where it keeps clicking on the same link over and over again

        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()  
        self.wait.until(EC.staleness_of(elem))


    def parse(self,response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
            except TimeoutException:
                break
These are the titles visible on the landing page (to give you an idea of what I'm after):


Getting the data from that site is not my goal in itself, so any approach other than the one I tried above is useless to me. My only intention is to get a solution related to the way my script tries to work in the second approach.

If you need a pure Selenium solution:

driver.get("https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx")

while True:
    for item in wait(driver, 10).until(EC.presence_of_all_elements_located((By.CSS_SELECTOR, "div[id^='arrowex']"))):
        print(item.text)
    try:
        driver.find_element_by_xpath("//input[@text='Next' and not(contains(@class, 'disabledImageButton'))]").click()
    except NoSuchElementException:
        break
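Note the not(contains(@class, 'disabledImageButton')) guard in the XPath: presumably the "Next" arrow carries that class once the last page is reached, so find_element_by_xpath raises NoSuchElementException and the loop ends cleanly.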

Whenever the page was loaded via the "Next Page" arrow (using Selenium), it got reset back to page 1. Not sure of the reason (it may be the JavaScript). Hence changed the approach to navigate via the input field, entering the required page number and pressing the Enter key.

Here is the modified code. Hope this works for you:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.keys import Keys

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"
    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]
    def __init__(self):
        self.driver = webdriver.Firefox()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link, number):
        self.driver.get(link)
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # Instead of clicking "Next" (which reset the pager to page 1),
        # type the target page number into the pager input and press Enter
        inputElement = self.driver.find_element_by_xpath("//input[@id='ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_txtPageNumber']")
        inputElement.clear()
        inputElement.send_keys(number)
        inputElement.send_keys(Keys.ENTER)
        self.wait.until(EC.staleness_of(elem))


    def parse(self,response):
        number = 1
        while number < 10412: #Website shows it has 10411 pages.
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}
                print (name)

            try:
                number += 1
                self.click_nextpage(response.url, number)  # initiate the method to do the clicking
            except TimeoutException:
                break

Create a self.page_num or something similar:

def parse(self, response):
    # Read the total page count from the results footer (text like "... of 10411]")
    self.pages = self.driver.find_element_by_css_selector("#ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_totalRecordsDiv.act_search_footer span")
    self.pages = int(self.pages.text.split('of ')[1].split(']')[0])

    self.page_num = 1

    while self.page_num <= self.pages:
        for item in response.css("h1.faqsno-heading"):
            name = item.css("div[id^='arrowex']::text").extract_first()
            yield {"Name": name}

        try:
            self.click_nextpage(response.url)  # initiate the method to do the clicking
        except TimeoutException:
            break

def click_nextpage(self, link):
    self.driver.get(link)
    elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

    # Id of the numbered pager link for the upcoming page (the click below
    # still uses the generic "Next" arrow)
    page_link = 'ctl00_SPWebPartManager1_g_d6877ff2_42a8_4804_8802_6d49230dae8a_ctl00_lnkBtn_' + str(self.page_num)
    self.page_num = self.page_num + 1

    self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
    self.wait.until(EC.staleness_of(elem))
Your initial code was almost correct, with one key part missing: you always used the same response object. The response object needs to be rebuilt from the latest page source.

Also, you were navigating to the link over and over again in click_nextpage, resetting it to page 1 every time. That is why you got page 1 and page 2 (at most). You only need to get the URL once, in the parse stage, and then let the Next-page clicks take over.

Below is the final code, which works fine:

import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

    def click_nextpage(self, link):
        # self.driver.get(link)  # re-loading the URL here reset the pager to page 1

        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # Click the "Next" arrow and wait for the old content to go stale
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))


    def parse(self, response):
        self.driver.get(response.url)

        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage(response.url)  # initiate the method to do the clicking
                # Rebuild the response from the browser's current page source
                response = response.replace(body=self.driver.page_source)
            except TimeoutException:
                break
After that change, it worked perfectly.
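A note on the design: response.replace(body=...) returns a new response object with the given body while keeping the original URL, so the response.css(...) selectors in the loop always run against the HTML that Selenium currently has loaded.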


Thank you for your input, sir. I wish to get the project done with Scrapy, though. Is the "Next" button generated dynamically? If not, why not use Scrapy itself to go from one page to another?

Thanks for your answer @Krishna. Although your script seems to change the page number by entering that hardcoded number into the input box, it still gives the same output (the output of the first page) over and over again.

Thanks for your solution @Maxwell77. It is very close to what I expected. The script you provided clicks the links incrementally. However, I still get the same data from the first page over and over again. I think self.driver.page_source can only be passed on to the self.parse method from the click_nextpage(self) method after a time.sleep(4).

Sorry, but I have tested the script and it is working on my machine. I don't think it is necessary to pass the page source, since it is always the same URL; only the content gets reloaded with new information.

You didn't get my point. When the driver clicks on the next-page link, driver.page_source will definitely be different. I'm not talking about driver.url, I'm talking about driver.page_source. If you take a close look at the spider, you will notice that the self.parse() method is never updated with a new response, which is the only reason the content I get is always the same. Thanks. By the way, I tested again to be sure, and the results are the same as I mentioned earlier.

When I created this post I was very much hoping for your intervention, as I always find your solutions very helpful. However, about your answer: it gives me the content of the first page, then the second page, then the second page again, and so on. It never goes (or cannot go) any further.
import scrapy
import time
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from scrapy.crawler import CrawlerProcess

class IncomeTaxSpider(scrapy.Spider):
    name = "taxspider"

    start_urls = [
        'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx',
    ]

    def __init__(self):
        self.driver = webdriver.Chrome()
        self.wait = WebDriverWait(self.driver, 10)

        link = 'https://www.incometaxindia.gov.in/Pages/utilities/exempted-institutions.aspx'
        self.driver.get(link)

    def click_nextpage(self):
        elem = self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "div[id^='arrowex']")))

        # Click the "Next" arrow, wait for the old content to go stale,
        # then give the table a moment to re-render
        self.wait.until(EC.visibility_of_element_located((By.CSS_SELECTOR, "input[id$='_imgbtnNext']"))).click()
        self.wait.until(EC.staleness_of(elem))
        time.sleep(4)

    def parse(self,response):
        while True:
            for item in response.css("h1.faqsno-heading"):
                name = item.css("div[id^='arrowex']::text").extract_first()
                yield {"Name": name}

            try:
                self.click_nextpage()  # initiate the method to do the clicking
            except TimeoutException:
                break

process = CrawlerProcess()

process.crawl(IncomeTaxSpider)
process.start()
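One more note: since the spider opens a browser in __init__, it is worth shutting it down when the crawl finishes. A minimal sketch (my addition, not part of the original answers) using Scrapy's closed() hook, which is called when the spider closes:

    # Hypothetical addition inside IncomeTaxSpider: Scrapy calls closed()
    # when the spider finishes, a convenient place to quit the browser
    def closed(self, reason):
        self.driver.quit()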