How to make a Scrapy spider work correctly when crawling multi-level web pages in Python
I am learning web crawling and I want to do the following:

1. Log in to a specific web page (done)
2. Go to a page that contains the links I need
3. For each link on that page, crawl its content

The problem is that I have tested my code on a single link and it works, but when I try the multi-level crawl it fails in a way I cannot understand: it only crawls part of each link. I would like to know whether there is a logic error in my code. Please help. The code is below.
import scrapy
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
import time


class BaiduSpider(scrapy.Spider):
    name = 'baidu'
    allowed_domains = ['baidu.com']
    start_urls = ['http://tieba.baidu.com']
    main_url = 'http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8'
    username = ""
    password = ""

    def __init__(self, username=username, password=password):
        # options = webdriver.ChromeOptions()
        # options.add_argument('headless')
        # options.add_argument('window-size=1200x600')
        self.driver = webdriver.Chrome()  # chrome_options=options
        self.username = username
        self.password = password

    # checked
    def logIn(self):
        elem = self.driver.find_element_by_css_selector('#com_userbar > ul > li.u_login > div > a')
        elem.click()
        wait = WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.CSS_SELECTOR, '#TANGRAM__PSP_10__footerULoginBtn')))
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__footerULoginBtn')
        elem.click()
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__userName')
        elem.send_keys(self.username)
        elem = self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__password')
        elem.send_keys(self.password)
        self.driver.find_element_by_css_selector('#TANGRAM__PSP_10__submit').click()

    # basic checked
    def parse(self, response):
        self.driver.get(response.url)
        self.logIn()
        # wait for manual input of the verification code
        time.sleep(20)
        self.driver.get('http://tieba.baidu.com/f?kw=%E5%B4%94%E6%B0%B8%E5%85%83&ie=utf-8')
        # try the first page first
        for url in self.driver.find_elements_by_css_selector('a.j_th_tit'):
            # new_url = response.urljoin(url)
            new_url = url.get_attribute("href")
            yield scrapy.Request(url=new_url, callback=self.parse_sub)

    # checked
    def pageScroll(self, url):
        self.log('I am scrolling ' + url)
        self.driver.get(url)
        SCROLL_PAUSE_TIME = 0.5
        SCROLL_LENGTH = 1200
        page_height = int(self.driver.execute_script("return document.body.scrollHeight"))
        scrollPosition = 0
        while scrollPosition < page_height:
            scrollPosition = scrollPosition + SCROLL_LENGTH
            self.driver.execute_script("window.scrollTo(0, " + str(scrollPosition) + ");")
            time.sleep(SCROLL_PAUSE_TIME)
        time.sleep(1.2)

    def parse_sub(self, response):
        self.log('I visited ' + response.url)
        self.pageScroll(response.url)
        for sel in self.driver.find_elements_by_css_selector('div.l_post.j_l_post.l_post_bright'):
            name = sel.find_element_by_css_selector('.d_name').text
            try:
                content = sel.find_element_by_css_selector('.j_d_post_content').text
            except:
                content = ''
            replys = []
            for i in sel.find_elements_by_xpath('.//div[@class="lzl_cnt"]'):
                user1 = i.find_element_by_xpath('.//a[@username]')
                user1 = self.driver.execute_script("return arguments[0].firstChild.textContent", user1)
                try:
                    user2 = i.find_element_by_xpath('.//span[@class="lzl_content_main"]/a[@username]')
                    user2 = self.driver.execute_script("return arguments[0].firstChild.textContent", user2)
                except:
                    user2 = name
                span = i.find_element_by_xpath('.//span[@class="lzl_content_main"]')
                reply = self.driver.execute_script('return arguments[0].lastChild.textContent;', span)
                # note: tuple(user1, user2, reply) raises TypeError (tuple takes
                # one iterable argument); build the tuple with parentheses instead
                replys.append((user1, user2, reply))
            yield {"topic": response.css(".core_title_txt::text").extract(), "name": name,
                   "content": content, "replys": replys}
        # follow to next page ('下一页' means "next page")
        # next_sel = self.driver.find_element_by_css_selector('#thread_theme_7 a:nth-child(3)')
        # next_url_name = next_sel.text
        # if next_sel and next_url_name == '下一页':
        #     next_url = next_sel.get_attribute('href')
        #     yield scrapy.Request(url=next_url, callback=self.parse_sub)
It looks like you are matching a hard-coded container rather than a generic one, so only one link is returned:

for url in self.driver.find_elements_by_css_selector('a.j_th_tit')

This j_th_tit appears to be a dynamically generated class name that may not be the same for all anchor (a) tags. You could try

for url in self.driver.find_elements_by_css_selector('a')

to get all the links on the page.
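If you do match every a tag, you will also pick up navigation and ad links, so it is worth filtering the hrefs before yielding requests. A rough sketch of parse along those lines (the '/p/' substring test is an assumption about Tieba's thread-URL scheme, so verify it against the real markup):

def parse(self, response):
    self.driver.get(response.url)
    self.logIn()
    time.sleep(20)  # wait for manual verification-code input
    self.driver.get(self.main_url)
    seen = set()
    for a in self.driver.find_elements_by_css_selector('a'):
        href = a.get_attribute('href')
        # keep only thread links and drop duplicates; '/p/' is an
        # assumption about Tieba's URL scheme, not something verified here
        if href and '/p/' in href and href not in seen:
            seen.add(href)
            yield scrapy.Request(url=href, callback=self.parse_sub)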