How to make a Scrapy crawler work properly when crawling multi-level pages in Python

I am learning web crawling and I want to do the following:

  • Log in to a specific web page (done)
  • Go to a page that contains the links I need
  • For each of those links, crawl its content

The problem is that I have tested my code on a single link and it works, but the multi-level crawl fails in a way I cannot understand: it only crawls part of each link. I would like to know whether there is a logic error in my code; please help. The code is below.

    import scrapy
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support.wait import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
    import time
    
    class BaiduSpider(scrapy.Spider):
        name = 'baidu'
        allowed_domains = ['baidu.com']
        start_urls = ['http://tieba.baidu.com']
        main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
        username = ""
        password = ""
    
        def __init__(self, username=username, password=password):
            #options = webdriver.ChromeOptions()
            #options.add_argument('headless')
            #options.add_argument('window-size=1200x600')
            self.driver = webdriver.Chrome()#chrome_options=options)
            self.username = username
            self.password = password
        # checked
        def logIn(self):
            elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
            elem.click()
            wait = WebDriverWait(self.driver,10).until(EC.presence_of_element_located((By.CSS_SELECTOR,'#TANGRAM__PSP_10__footerULoginBtn')))
            elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
            elem.click()
            elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
            elem.send_keys(self.username)
            elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
            elem.send_keys(self.password)
            self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()
        # basic checked
        def parse(self, response):
            self.driver.get(response.url)
            self.logIn()
            # wait for hand input verify code
            time.sleep(20)
            self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
            # try first page first
            for url in self.driver.find_elements_by_css_selector('a.j_th_tit'):
                #new_url = response.urljoin(url)
                new_url = url.get_attribute("href")
                yield scrapy.Request(url=new_url, callback=self.parse_sub)
    
        # checked
        def pageScroll(self, url):
            self.log('I am scrolling' + url)
            self.driver.get(url)
            SCROLL_PAUSE_TIME = 0.5
            SCROLL_LENGTH = 1200
            page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
            scrollPosition = 0
            while scrollPosition < page_height:
                scrollPosition = scrollPosition + SCROLL_LENGTH
                self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
                time.sleep(SCROLL_PAUSE_TIME)
            time.sleep(1.2)
    
        def parse_sub(self, response):
            self.log('I visited ' + response.url)
            self.pageScroll(response.url)

            for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
                name = sel.find_element_by_css_selector('.d_name').text
                try:
                    content = sel.find_element_by_css_selector('.j_d_post_content').text
                except: content = ''
                replys = []
                for i in sel.find_elements_by_xpath('.//div[@class="lzl_cnt"]'):
                    user1 = i.find_element_by_xpath('.//a[@username]')
                    user1 = self.driver.execute_script("return arguments[0].firstChild.textContent", user1)
                    try:
                        user2 = i.find_element_by_xpath('.//span[@class="lzl_content_main"]/a[@username]')
                        user2 = self.driver.execute_script("return arguments[0].firstChild.textContent", user2)
                    except: user2 = name
                    span = i.find_element_by_xpath('.//span[@class="lzl_content_main"]')
                    reply = self.driver.execute_script('return arguments[0].lastChild.textContent;', span)

                    # use a tuple literal; tuple(user1, user2, reply) raises a TypeError
                    replys.append((user1, user2, reply))
                yield {"topic": response.css(".core_title_txt::text").extract(), "name":name, "content":content, "replys":replys}
    
            #follow to next page

            #next_sel = self.driver.find_element_by_css_selector('#thread_theme_7 a:nth-child(3)')
            #next_url_name = next_sel.text

            #if next_sel and next_url_name == '下一页':
            #    next_url = next_sel.get_attribute('href')

            #    yield scrapy.Request(url=next_url, callback=self.parse_sub)
    
It looks like you are using a hard-coded container class instead of a generic one, which is why only one link is returned by

    for url in self.driver.find_elements_by_css_selector('a.j_th_tit')

This j_th_tit looks like a dynamically generated class name and may not be the same for all anchor (a) tags.

You can try

    for url in self.driver.find_elements_by_css_selector('a')

to get all of the links on the page.
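
Putting that suggestion together, here is a minimal sketch of a revised parse method (not the original code). It assumes that Tieba thread URLs contain "/p/" in their href (an assumption about the page markup, so adjust the filter to whatever the page source actually shows) and it deduplicates the hrefs before yielding requests:

    # Minimal sketch, assuming thread links contain "/p/" in their href.
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for hand input verify code, as in the original
        time.sleep(20)
        self.driver.get(self.main_url)

        seen = set()
        for a in self.driver.find_elements_by_css_selector('a'):
            href = a.get_attribute('href')
            # keep only thread links (assumed to contain "/p/") and skip duplicates
            if href and '/p/' in href and href not in seen:
                seen.add(href)
                yield scrapy.Request(url=href, callback=self.parse_sub)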