Connection closed during Python scraping


I'm teaching myself web scraping and wrote a script that retrieves the Facebook names of the people who liked a post:

from selenium import webdriver
import os
from bs4 import BeautifulSoup
from time import sleep
import random
import csv
import logging

logging.basicConfig(level=logging.DEBUG)

# CONFIGURATION VARIABLES
LIMIT = 500 # Number of likers in each page
CSV_FILENAME = 'results' # Name of the resulting CSV file (DO NOT ADD the file extension)
LOADING_PAGE_TIME = 10 # Waiting time to load the new page
MIN_WAITING_TIME = 5 # Min waiting time to load the next page
MAX_WAITING_TIME = 10 # Max waiting time to load the next page
PAGE_LINK = 'https://mbasic.facebook.com/ufi/reaction/profile/browser/?ft_ent_identifier=10158113586853155&refid=52&_ft_=mf_story_key.10158113586853155%3Atop_level_post_id.10158113586853155%3Atl_objid.10158113586853155%3Acontent_owner_id_new.252306033154%3Athrowback_story_fbid.10158113586853155%3Apage_id.252306033154%3Astory_location.4%3Astory_attachment_style.share%3Atds_flgs.3%3Aott.AX_3mehvhnuA70tT%3Apage_insights.%7B%22252306033154%22%3A%7B%22page_id%22%3A252306033154%2C%22page_id_type%22%3A%22page%22%2C%22actor_id%22%3A252306033154%2C%22dm%22%3A%7B%22isShare%22%3A1%2C%22originalPostOwnerID%22%3A0%7D%2C%22psn%22%3A%22EntStatusCreationStory%22%2C%22post_context%22%3A%7B%22object_fbtype%22%3A266%2C%22publish_time%22%3A1601470895%2C%22story_name%22%3A%22EntStatusCreationStory%22%2C%22story_fbid%22%3A%5B10158113586853155%5D%7D%2C%22role%22%3A1%2C%22sl%22%3A4%2C%22targets%22%3A%5B%7B%22actor_id%22%3A252306033154%2C%22page_id%22%3A252306033154%2C%22post_id%22%3A10158113586853155%2C%22role%22%3A1%2C%22share_id%22%3A0%7D%5D%7D%7D%3Athid.252306033154'
logging.debug('Driver initialization')

# Driver initialization
driver_path = os.path.abspath('chromedriver')
chrome_options = webdriver.ChromeOptions()
chrome_options.add_argument("user-data-dir=selenium")
browser = webdriver.Chrome(executable_path=driver_path, options=chrome_options)

# CSV initialization
csvfile = open(CSV_FILENAME + '.csv', 'w')
csvfile.close()

# Scraping process
logging.debug('Scraping initialization')
browser.get(PAGE_LINK.replace("limit=10&", "limit=" + str(LIMIT) + '&'))

cycles = 0
name_scraped = 0
last_url = ''

while True:
    logging.debug('Starting %s cycle', cycles)
    cycles += 1
    # Wait for the page to load before parsing it
    sleep(LOADING_PAGE_TIME)
    soup = BeautifulSoup(browser.page_source, "html.parser")
    # Extracting the h3 elements that contain all the info about the likers
    person_link = soup.find_all('h3', class_='bj')
    name_scraped += len(person_link)
    logging.debug('Names scraped: %s', len(person_link))
    logging.debug('Writing CSV file')
    with open(CSV_FILENAME + '.csv', 'a') as csvfile:
        writer = csv.writer(csvfile, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
        for data in person_link:
            writer.writerow([data.find('a')['href']])
    # Extracting, if it exists, the div element that contains the "Show more" link
    next_link = soup.find('div', class_='bs')
    if next_link:
        # The "Show more" link defaults to limit=10, so it has to be raised to LIMIT
        next_link = next_link.find('a')['href'].replace("limit=10", "limit=" + str(LIMIT))
        last_url = "https://mbasic.facebook.com" + next_link
        sleep(random.randint(MIN_WAITING_TIME, MAX_WAITING_TIME))
        browser.get("https://mbasic.facebook.com" + next_link)
    else:
        break


logging.debug('Total names scraped: %s', name_scraped)
logging.debug('Last url: %s', last_url)

#browser.quit()

The problem is that, after scraping about 4500 names, I get this error:

Traceback (most recent call last):
  File "/Users/degiorgilorenzo/Desktop/facebook-scraping/main.py", line 58, in <module>
    browser.get("https://mbasic.facebook.com" + next_link)
  File "/Users/degiorgilorenzo/Desktop/facebook-scraping/venv/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 333, in get
    self.execute(Command.GET, {'url': url})
  File "/Users/degiorgilorenzo/Desktop/facebook-scraping/venv/lib/python3.7/site-packages/selenium/webdriver/remote/webdriver.py", line 321, in execute
    self.error_handler.check_response(response)
  File "/Users/degiorgilorenzo/Desktop/facebook-scraping/venv/lib/python3.7/site-packages/selenium/webdriver/remote/errorhandler.py", line 242, in check_response
    raise exception_class(message, screen, stacktrace)
selenium.common.exceptions.WebDriverException: Message: unknown error: net::ERR_CONNECTION_CLOSED
  (Session info: chrome=85.0.4183.121)
and the link to that page becomes unreachable. I thought it was a problem of sending too many requests to the server, but the link is still unreachable after waiting several hours and switching Facebook accounts. Raising the waiting time to around 10-20 minutes doesn't solve the problem either.
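
For reference, a minimal retry-with-backoff sketch around the failing browser.get call, assuming the connection error is transient; the get_with_retries helper and its max_retries/backoff values are illustrative assumptions, not part of the script above:

import logging
from time import sleep
from selenium.common.exceptions import WebDriverException

def get_with_retries(browser, url, max_retries=3, backoff=60):
    # Hypothetical helper: retry a page load, sleeping longer after each failure
    for attempt in range(1, max_retries + 1):
        try:
            browser.get(url)
            return
        except WebDriverException as e:
            logging.warning('Attempt %s/%s failed: %s', attempt, max_retries, e)
            if attempt == max_retries:
                raise  # give up and surface the original error
            sleep(backoff * attempt)  # linear backoff: 60s, 120s, ...

Inside the loop, browser.get("https://mbasic.facebook.com" + next_link) would become get_with_retries(browser, "https://mbasic.facebook.com" + next_link). If Facebook is deliberately closing the connection because it has flagged the session, though, retrying alone would presumably not help.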