Scraping multiple JavaScript-rendered pages at once into JSON


I am trying to web scrape a website that has multiple pages rendered by JavaScript. I am using BeautifulSoup and Selenium. I have a script, but it only works for the first page of the site. Is it possible to scrape multiple JavaScript-rendered pages, or do I need to do them individually? Here is my script:

import time
from bs4 import BeautifulSoup as soup
import requests
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json

# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/rawlins/Downloads/chromedriver'

# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')
#chrome_options.add_argument('--window-size=1920x1080')

# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path, chrome_options = chrome_options)

# Load webpage
url = "https://cnx.org/search?q=subject:Arts"
browser.get(url)

# to ensure that the page has loaded completely.
time.sleep(3)

data = [] 
n = 2
for i in range(1, n+1):
    if (i == 1):
        # handle first page
        response = requests.get(url)
    response = requests.get(url + "&page=" + str(i))
    #response = requests.get(url + "&page=" + str(i),headers={'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.47 Safari/537.36'})

    # Parse HTML, close browser
    page_soup = soup(browser.page_source, 'lxml')
    containers = page_soup.findAll("tr")
    browser.quit()

    for container in containers:
        item = {}
        item['type'] = "Course Material"
        if container.find('td', {'class' : 'title'}):
            item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
        else:
            item['title'] = ""
        if container.find('td', {'class' : 'authors'}):
            item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
        else:
            item['author'] = ""
        if container.find('td', {'class' : 'title'}):
            item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
        else: 
            item['link'] = ""
        if container.find('td', {'class' : 'title'}):
            item['description'] = container.find('td', {'class' : 'title'}).span.text
        else: 
            item['description'] = ""
        item['subject'] = "Arts"
        item['source'] = "OpenStax CNX"
        item['base_url'] = "https://cnx.org/browse"
        item['license'] = "Attribution"
        data.append(item) # add the item to the list

    with open("js-webscrape.json", "w") as writeJSON:
        json.dump(data, writeJSON, ensure_ascii=False)
Thanks in advance.

There are a few issues here:

  • You are mixing requests.get() and browser.get(). The requests module is not needed here at all, since you are fetching the pages through the headless browser.
  • The first page does not need a special case; requesting it with &page=1 works fine.
  • time.sleep() should sit between browser.get() and the parsing, so the page has fully loaded before it is fed to BeautifulSoup (see the explicit-wait sketch after this list for a more robust alternative).
  • Once all pages have been scraped, write data to the JSON file outside the for loop.
  • Quit the browser outside the for loop as well, not after the first iteration.
  • To avoid encoding errors, specify the encoding when writing the JSON file: use open("js-webscrape.json", "w", encoding="utf-8").
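If the fixed time.sleep() ever proves unreliable, Selenium's explicit waits are a more robust option. A minimal sketch, assuming the td.title cells the script already parses are a reasonable readiness signal:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Inside the page loop, replace time.sleep(5) with an explicit wait
# that polls for up to 10 seconds until a result row is present.
browser.get(url + "&page=" + str(i))
WebDriverWait(browser, 10).until(
    EC.presence_of_element_located((By.CSS_SELECTOR, "td.title"))
)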
Here is a working implementation that scrapes all 7 pages:

import time
from bs4 import BeautifulSoup as soup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
import json

# The path to where you have your chrome webdriver stored:
webdriver_path = '/Users/Gebruiker/Downloads/chromedriver_win32/chromedriver'

# Add arguments telling Selenium to not actually open a window
chrome_options = Options()
chrome_options.add_argument('--headless')

# Fire up the headless browser
browser = webdriver.Chrome(executable_path = webdriver_path, options = chrome_options)

# Load webpage
url = "https://cnx.org/search?q=subject:Arts"

data = []
n = 7
for i in range(1, n+1):
    browser.get(url + "&page=" + str(i))
    time.sleep(5)

    # Parse HTML
    page_soup = soup(browser.page_source,'lxml')
    containers = page_soup.findAll("tr")

    for container in containers:
        item = dict()
        item['type'] = "Course Material"
        if container.find('td', {'class' : 'title'}):
            item['title'] = container.find('td', {'class' : 'title'}).h4.text.strip()
        else:
            item['title'] = ""
        if container.find('td', {'class' : 'authors'}):
            item['author'] = container.find('td', {'class' : 'authors'}).text.strip()
        else:
            item['author'] = ""
        if container.find('td', {'class' : 'title'}):
            item['link'] = "https://cnx.org/" + container.find('td', {'class' : 'title'}).a["href"]
        else:
            item['link'] = ""
        if container.find('td', {'class' : 'title'}):
            item['description'] = container.find('td', {'class' : 'title'}).span.text
        else:
            item['description'] = ""
        item['subject'] = "Arts"
        item['source'] = "OpenStax CNX"
        item['base_url'] = "https://cnx.org/browse"
        item['license'] = "Attribution"
        data.append(item) # add the item to the list

# write data to file and quit browser when done
print(data)
with open("js-webscrape.json", "w", encoding="utf-8") as writeJSON:
    json.dump(data, writeJSON, ensure_ascii=False)

browser.quit()
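One caveat if you are on a newer Selenium release: Selenium 4 deprecates (and later removes) the executable_path keyword used above. A minimal sketch of the equivalent setup under that assumption, keeping the same chromedriver path and headless option:

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

chrome_options = Options()
chrome_options.add_argument('--headless')

# Selenium 4 takes the driver path through a Service object instead of executable_path
browser = webdriver.Chrome(service=Service('/Users/Gebruiker/Downloads/chromedriver_win32/chromedriver'),
                           options=chrome_options)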