Python 脚本突然停止爬行，没有错误或异常_Python_Selenium_Python Requests_Geckodriver_Urllib3

Python 脚本突然停止爬行，没有错误或异常

python selenium

Python 脚本突然停止爬行，没有错误或异常,python,selenium,python-requests,geckodriver,urllib3,Python,Selenium,Python Requests,Geckodriver,Urllib3,我不知道为什么，但我的脚本一旦点击就会停止爬行。没有错误、异常或警告，所以我有点不知所措有人能帮我吗附言打印项的长度也会引发一些奇怪的行为。它不总是返回32（与每页上的项目数相对应），而是打印第一页的32，第二页的64，第三页的96，依此类推。我通过使用//div[contains（@id，“100\u dealView”）]/div[contains（@class，“dealContainer”）]而不是//div[contains（@id，“100\u dealView”）]作为项变量

我不知道为什么，但我的脚本一旦点击就会停止爬行。没有错误、异常或警告，所以我有点不知所措

有人能帮我吗

附言

打印

项的长度

也会引发一些奇怪的行为。它不总是返回 32（与每页上的项目数相对应），而是第一页打印 32，第二页打印 64，第三页打印 96，依此类推。我通过使用

//div[contains(@id, "100_dealView_")]/div[contains(@class, "dealContainer")]

而不是

//div[contains(@id, "100_dealView_")]

作为

项

变量的XPath解决了这个问题。我希望这就是它在第9页出现问题的原因。我现在正在做测试更新：现在正在抓取第10页及以后的内容，因此问题已得到解决。

根据您的问题，错误消息

HTTPConnectionPool(host='127.0.0.1', port=58992): Max retries exceeded with url: /session/e8beed9b-4faa-4e91-a659-56761cb604d7/element (Caused by NewConnectionError('<urllib3.connection.HTTPConnection object at 0x0000022D31378A58>: Failed to establish a new connection: [WinError 10061] No connection could be made because the target machine actively refused it'))

您将在中找到详细的解释

解决方案根据Selenium 3.14.1的发行说明：

* Fix ability to set timeout for urllib3 (#6286)

合并是：

结论升级到Selenium 3.14.1后，您将能够设置超时并查看规范回溯，并能够采取所需的操作

工具书类一些相关参考资料：

这个用例我已经从你的网站上下载了你的完整脚本。我不得不对您现有的代码进行一些调整，如下所示：

正如您所使用的：

  ua_string = random.choice(ua_strings)

您必须强制导入

random

作为：

    import random

您已经创建了变量 next_button，但尚未使用它。我将以下四行合并：

  next_button = WebDriverWait(ff, 15).until(
                  EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')
              )
  ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()

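改写为：

  WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→'))
  ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()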

您修改的代码块将是：

  # -*- coding: utf-8 -*-
  from selenium import webdriver
  from selenium.webdriver.firefox.options import Options
  from selenium.webdriver.common.by import By
  from selenium.webdriver.support import expected_conditions as EC
  from selenium.webdriver.support.ui import WebDriverWait
  import time
  import random


  """ Set Global Variables
  """
  # Pool of User-Agent strings; create_webdriver_instance() picks one at random.
  # It starts with a single desktop Chrome entry so random.choice() has something
  # to pick before fetch_ua_strings() has appended more.
  ua_strings = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36']
  # Titles of deals that were already printed, so a reloaded page skips them.
  already_scraped_product_titles = []



  """ Create Instances of WebDriver
  """
  def create_webdriver_instance():
      """Start a headless Firefox session with a randomly chosen User-Agent.

      The User-Agent is drawn from the module-level ``ua_strings`` pool and
      applied through the ``general.useragent.override`` profile preference.

      Returns:
          A new ``webdriver.Firefox`` instance. The caller owns it and must
          call ``quit()`` when done.
      """
      profile = webdriver.FirefoxProfile()
      profile.set_preference('general.useragent.override', random.choice(ua_strings))
      options = Options()
      options.add_argument('--headless')
      # The options object has to be handed to the driver explicitly;
      # otherwise the --headless argument is never applied.
      return webdriver.Firefox(firefox_profile=profile, options=options)



  """ Construct List of UA Strings
  """
  def fetch_ua_strings():
      """Extend the global ``ua_strings`` pool with User-Agents scraped from the web.

      Opens the "most common user agents" page, reads every
      ``<td class="useragent">`` cell and appends its text to ``ua_strings``
      unless it names a mobile browser or the Trident (Internet Explorer) engine.
      The browser is always closed, even when loading or parsing fails.
      """
      ff = create_webdriver_instance()
      try:
          ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/')
          for cell in ff.find_elements(By.XPATH, '//td[@class="useragent"]'):
              # Read the element text once and reuse it for both checks and the append.
              text = cell.text
              # Compare case-insensitively: User-Agent strings spell it "Mobile".
              if 'mobile' not in text.lower() and 'Trident' not in text:
                  ua_strings.append(text)
      finally:
          ff.quit()



  """ Log in to Amazon to Use SiteStripe in order to Generate Affiliate Links
  """
  def log_in(ff, email=None, password=None):
      """Sign in to Amazon through the account link in the navigation bar.

      Credentials are no longer embedded in the source. They are taken from the
      arguments or, when omitted, from the ``AMAZON_EMAIL`` and
      ``AMAZON_PASSWORD`` environment variables.

      Args:
          ff: WebDriver instance currently showing an Amazon page.
          email: Account e-mail address; defaults to ``$AMAZON_EMAIL``.
          password: Account password; defaults to ``$AMAZON_PASSWORD``.

      Raises:
          ValueError: If either credential is missing. Raised before the
              browser is touched.
      """
      import os

      email = email or os.environ.get('AMAZON_EMAIL')
      password = password or os.environ.get('AMAZON_PASSWORD')
      if not email or not password:
          raise ValueError(
              'Amazon credentials missing: pass email/password or set '
              'AMAZON_EMAIL and AMAZON_PASSWORD'
          )

      # The account link id differs between page layouts, hence the XPath union.
      ff.find_element(By.XPATH, '//a[@id="nav-link-yourAccount"] | //a[@id="nav-link-accountList"]').click()
      ff.find_element(By.ID, 'ap_email').send_keys(email)
      ff.find_element(By.ID, 'continue').click()
      ff.find_element(By.ID, 'ap_password').send_keys(password)
      ff.find_element(By.NAME, 'rememberMe').click()
      ff.find_element(By.ID, 'signInSubmit').click()



  """ Build Lists of Product Page URLs
  """
  def initiate_crawl():
      """Crawl the Amazon deals listing page by page, printing each new deal title.

      Every step opens a fresh browser, sorts the listing by
      "Discount - High to Low" and scans the deal tiles. The first not-yet-seen
      deal that has both a struck-through price and an "Add to Cart" control is
      recorded in ``already_scraped_product_titles`` and its first ten characters
      are printed; the same page is then reloaded. When a page holds nothing new,
      the "Next→" link is followed. The crawl ends when that link cannot be used.
      """
      def scrape_page(url):
          """Process one page load and return the URL to load next, or None to stop."""
          ff = create_webdriver_instance()
          try:
              ff.get(url)
              ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click()
              ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click()
              items = WebDriverWait(ff, 15).until(
                  EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]'))
              )
              for item in items:
                  slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]')
                  active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]')
                  # For Groups of Items on Sale
                  # active_deals = //*[contains(text(), "Add to Cart") or contains(text(), "View Deal")]
                  if not (slashed_price and active_deals):
                      continue
                  product_title = item.find_element(By.ID, 'dealTitle').text
                  if product_title in already_scraped_product_titles:
                      continue
                  already_scraped_product_titles.append(product_title)
                  # Scrape Details of Each Deal
                  #extract(ff, item.find_element(By.ID, 'dealImage').get_attribute('href'))
                  print(product_title[:10])
                  # Reload the same listing in a new browser to look for the next deal.
                  return ff.current_url

              # Every tile was inspected and nothing new was found: go to the next page.
              try:
                  print('')
                  print('new page')
                  WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→'))
                  ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click()
                  # Give the listing time to navigate before reading the new URL.
                  time.sleep(10)
                  next_url = ff.current_url
                  print(next_url)
                  print('')
                  return next_url
              except Exception as error:
                  # Only pagination failures are handled here; they end the crawl.
                  print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next→")')
                  print('Because of... {}'.format(error))
                  return None
          finally:
              # Single place where the browser is closed, so it is never quit twice
              # and never leaked when an exception propagates.
              ff.quit()

      def refresh_page(url):
          """Keep loading pages, starting at ``url``, until scrape_page() signals the end."""
          # A loop instead of recursion: the call depth no longer grows with the
          # number of deals and pages visited.
          while url is not None:
              url = scrape_page(url)

      refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8')

  #def extract_info(ff, url):
  # Script entry point: first add scraped User-Agents to the ua_strings pool,
  # then start crawling the deals listing.
  fetch_ua_strings()
  initiate_crawl()

控制台输出：使用Selenium v3.14.0和Firefox Quantum v62.0.3，我可以在控制台上提取以下输出：

  J.Rosée Si
  B.Catcher 
  Bluetooth4
  FRAM G4164
  Major Crim
  20% off Oh
  True Blood
  Prime-Line
  Marathon 3
  True Blood
  B.Catcher 
  4 Film Fav
  True Blood
  Texture Pa
  Westinghou
  True Blood
  ThermoPro 
  ...
  ...
  ...

注意：我本可以优化您的代码并执行相同的web抓取操作，只初始化Firefox浏览器客户端一次，浏览各种产品及其详细信息。但是为了保持你的逻辑和创新，我建议你通过所需的最小更改

我稍微调整了代码，它似乎起作用了。变化：

import random
语句，因为它已被使用，没有它将无法运行
在
product_title
循环中，这些行被删除：

ff.quit()
，
refresh_page(url)
和
break

ff.quit()
语句将导致致命（连接）错误，导致脚本中断
另外，将 is 改为 ==（is 比较的是对象身份，== 比较的是值）：
if count+1 == len(items):

你监控了爬行过程吗？第9页还有像“更多”这样的按钮吗？@jihan1008一切都被监控着。我检查了xpath，所有内容，似乎没有任何内容被破坏。您可以使用不同的浏览器版本进行检查。我无法让您的脚本运行，但似乎在某个点上，您会得到长度为0的项，因此枚举循环不会发生。尝试在循环之前打印项目的长度，看看代码结束之前会发生什么。@AndrewMcDowell好主意！我在pt，我相信它一定在剧本的其他地方。我目前已经设置了一组
time.sleep(n)
s，并用它运行了一个测试。之后我会打印长度！感谢您的输入，我的问题中的
HTTPConnectionPool
错误是一个异常值。脚本工作得非常好，直到它在第9页突然停止，没有错误或异常。我设置但未使用 next_button 的唯一原因是，我试图对此进行故障排除，认为可能与此有关，但从未重置它。这里的问题是，为什么它在完成第9页后会停止爬行/抓取？哦，也许这就是为什么我从未重置它：WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')).click() 返回一个不可单击的
布尔对象。您的code@Anthony根据你的观察，我对我的解决方案做了一些小的修改。我本可以优化你的代码，并执行相同的web废弃操作，只打开一次Firefox浏览器客户端并遍历各种产品。但是请注意您的逻辑和创新我建议您通过所需的最小更改。请您尝试更新的解决方案并告诉我状态好吗？事实上，几天前我自己解决了这个问题。为了清晰起见，我已经更新了我的问题，并再次更新了它。是的，我不知道有什么具体原因使用单个实例遍历网站，而不是创建多个实例。尽管现在我考虑到了这一点，使用ff.back（）创建新实例甚至可能不是必需的功能，但它肯定更直截了当。我有空的时候会给你的答案看一看！谢谢你尝试解决我的问题 WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')) ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click() # -*- coding: utf-8 -*- from selenium import webdriver from selenium.webdriver.firefox.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait import time import random """ Set Global Variables """ ua_strings = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'] already_scraped_product_titles = [] """ Create Instances of WebDriver """ def create_webdriver_instance(): ua_string = random.choice(ua_strings) profile = webdriver.FirefoxProfile() profile.set_preference('general.useragent.override', ua_string) options = Options() options.add_argument('--headless') return webdriver.Firefox(profile) """ Construct List of UA Strings """ def fetch_ua_strings(): ff = create_webdriver_instance() ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/') ua_strings_ff_eles = ff.find_elements_by_xpath('//td[@class="useragent"]') for ua_string in ua_strings_ff_eles: if 'mobile' not in ua_string.text and 'Trident' not in ua_string.text: ua_strings.append(ua_string.text) ff.quit() """ Log in to Amazon to Use SiteStripe in order to Generate Affiliate Links """ def log_in(ff): ff.find_element(By.XPATH, '//a[@id="nav-link-yourAccount"] | //a[@id="nav-link-accountList"]').click() ff.find_element(By.ID, 'ap_email').send_keys('anthony_falez@hotmail.com') ff.find_element(By.ID, 'continue').click() ff.find_element(By.ID, 
'ap_password').send_keys('lo0kyLoOkYig0t4h') ff.find_element(By.NAME, 'rememberMe').click() ff.find_element(By.ID, 'signInSubmit').click() """ Build Lists of Product Page URLs """ def initiate_crawl(): def refresh_page(url): ff = create_webdriver_instance() ff.get(url) ff.find_element(By.XPATH, '//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click() ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click() items = WebDriverWait(ff, 15).until( EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]')) ) for count, item in enumerate(items): slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]') active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]') # For Groups of Items on Sale # active_deals = //*[contains(text(), "Add to Cart") or contains(text(), "View Deal")] if len(slashed_price) > 0 and len(active_deals) > 0: product_title = item.find_element(By.ID, 'dealTitle').text if product_title not in already_scraped_product_titles: already_scraped_product_titles.append(product_title) url = ff.current_url # Scrape Details of Each Deal #extract(ff, item.find_element(By.ID, 'dealImage').get_attribute('href')) print(product_title[:10]) ff.quit() refresh_page(url) break if count+1 is len(items): try: print('') print('new page') WebDriverWait(ff, 15).until(EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→')) ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click() time.sleep(10) url = ff.current_url print(url) print('') ff.quit() refresh_page(url) except Exception as error: """ ff.find_element(By.XPATH, '//*[@id="pagination-both-004143081429407891"]/ul/li[9]/a').click() url = ff.current_url ff.quit() refresh_page(url) """ print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next?")') print('Because of... 
{}'.format(error)) ff.quit() refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8') #def extract_info(ff, url): fetch_ua_strings() initiate_crawl() J.Rosée Si B.Catcher Bluetooth4 FRAM G4164 Major Crim 20% off Oh True Blood Prime-Line Marathon 3 True Blood B.Catcher 4 Film Fav True Blood Texture Pa Westinghou True Blood ThermoPro ... ... ... # -*- coding: utf-8 -*- from selenium import webdriver from selenium.webdriver import Firefox from selenium.webdriver.firefox.options import Options from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.support.ui import WebDriverWait import time import random """ Set Global Variables """ ua_strings = ['Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'] already_scraped_product_titles = [] """ Create Instances of WebDriver """ def create_webdriver_instance(): ua_string = random.choice(ua_strings) profile = webdriver.FirefoxProfile() profile.set_preference('general.useragent.override', ua_string) options = Options() options.add_argument('--headless') return webdriver.Firefox(profile) """ Construct List of UA Strings """ def fetch_ua_strings(): ff = create_webdriver_instance() ff.get('https://techblog.willshouse.com/2012/01/03/most-common-user-agents/') ua_strings_ff_eles = ff.find_elements_by_xpath('//td[@class="useragent"]') for ua_string in ua_strings_ff_eles: if 'mobile' not in ua_string.text and 'Trident' not in ua_string.text: ua_strings.append(ua_string.text) ff.quit() """ Build Lists of Product Page URLs """ def initiate_crawl(): def refresh_page(url): ff = create_webdriver_instance() ff.get(url) ff.find_element(By.XPATH, 
'//*[@id="FilterItemView_sortOrder_dropdown"]/div/span[2]/span/span/span/span').click() ff.find_element(By.XPATH, '//a[contains(text(), "Discount - High to Low")]').click() items = WebDriverWait(ff, 15).until( EC.visibility_of_all_elements_located((By.XPATH, '//div[contains(@id, "100_dealView_")]')) ) print(items) for count, item in enumerate(items): slashed_price = item.find_elements(By.XPATH, './/span[contains(@class, "a-text-strike")]') active_deals = item.find_elements(By.XPATH, './/*[contains(text(), "Add to Cart")]') # For Groups of Items on Sale # active_deals = //*[contains(text(), "Add to Cart") or contains(text(), "View Deal")] if len(slashed_price) > 0 and len(active_deals) > 0: product_title = item.find_element(By.ID, 'dealTitle').text if product_title not in already_scraped_product_titles: already_scraped_product_titles.append(product_title) url = ff.current_url # Scrape Details of Each Deal #extract(ff, item.find_element(By.ID, 'dealImage').get_attribute('href')) print(product_title[:10]) # This ff.quit()-line breaks connection which breaks things.: #ff.quit() # And why #refresh_page(url) #break # 'is' tests for object equality; == tests for value equality: if count+1 == len(items): try: print('') print('new page') next_button = WebDriverWait(ff, 15).until( EC.text_to_be_present_in_element((By.PARTIAL_LINK_TEXT, 'Next→'), 'Next→') ) ff.find_element(By.PARTIAL_LINK_TEXT, 'Next→').click() time.sleep(3) url = ff.current_url print(url) print('') ff.quit() refresh_page(url) except Exception as error: """ ff.find_element(By.XPATH, '//*[@id="pagination-both-004143081429407891"]/ul/li[9]/a').click() url = ff.current_url ff.quit() refresh_page(url) """ print('cannot find ff.find_element(By.PARTIAL_LINK_TEXT, "Next→")') print('Because of... 
{}'.format(error)) ff.quit() refresh_page('https://www.amazon.ca/gp/goldbox/ref=gbps_ftr_s-3_4bc8_dct_10-?gb_f_c2xvdC0z=sortOrder:BY_SCORE,discountRanges:10-25%252C25-50%252C50-70%252C70-&pf_rd_p=f5836aee-0969-4c39-9720-4f0cacf64bc8&pf_rd_s=slot-3&pf_rd_t=701&pf_rd_i=gb_main&pf_rd_m=A3DWYIK6Y9EEQB&pf_rd_r=CQ7KBNXT36G95190QJB1&ie=UTF8') #def extract_info(ff, url): fetch_ua_strings() initiate_crawl()