Python HTML Parser Pagination

I'm new to Python and have made good progress experimenting with an HTML parser, but I'm stuck on how to get pagination working for the reviews at the bottom of the page for this site.

The URL is in the PasteBin code; I've left it out of this thread for privacy reasons.

Any help is greatly appreciated.

# Reviews Scrape

import csv
from urllib.request import urlopen as uReq
from bs4 import BeautifulSoup as soup

my_url = 'EXAMPLE.COM'

# open the connection, grab the page, then close the connection
uClient = uReq(my_url)
page_html = uClient.read()
uClient.close()

# HTML parsing
page_soup = soup(page_html, "html.parser")

# grabs each review container
reviews = page_soup.findAll("div", {"class": "jdgm-rev jdgm-divider-top"})

# use the csv module so fields containing commas are quoted correctly
filename = "compreviews.csv"
f = open(filename, "w", newline="", encoding="utf-8")
writer = csv.writer(f)
writer.writerow(["Score", "Title", "Content"])

# look up each field per review and strip surrounding whitespace
for container in reviews:
    # the numeric rating is stored in the span's data-score attribute
    score = container.findAll("span", {"data-score": True})
    user_score = score[0]["data-score"]

    title_review = container.findAll("b", {"class": "jdgm-rev__title"})
    user_title = title_review[0].text.strip()

    content_review = container.findAll("div", {"class": "jdgm-rev__body"})
    user_content = content_review[0].text.strip()

    print("user_score:" + user_score)
    print("user_title:" + user_title)
    print("user_content:" + user_content)

    writer.writerow([user_score, user_title, user_content])

f.close()

The page performs an XHR GET request with a query string to retrieve the results. That query string has parameters for reviews per page and for the page number. You can make an initial request using the maximum of 31 reviews per page, extract the HTML from the JSON that comes back, and grab the page count; then write a loop that runs over all the pages to collect the results. An example construct is shown below:

import requests
from bs4 import BeautifulSoup as bs

start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'

with requests.Session() as s:
    # initial request: the returned json holds the review markup under the 'html' key
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    print([i.text for i in soup.select('.jdgm-rev__author')])
    print([i.text for i in soup.select('.jdgm-rev__title')])
    # the total page count comes from the paginator's last-page element
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    # loop over the remaining pages, re-using the same session
    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        print([i.text for i in soup.select('.jdgm-rev__author')])
        print([i.text for i in soup.select('.jdgm-rev__title')])  # etc.
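
The same request can also be built by passing the query-string values as a params dict, which keeps the per-page and page-number parameters visible instead of burying them in an f-string. A minimal sketch, using the same placeholder endpoint and product id as above:

import requests

base_url = 'https://urlpart'  # placeholder endpoint, as above

with requests.Session() as s:
    # requests encodes the dict into the query string for us
    r = s.get(base_url, params={'page': 1, 'per_page': 31, 'product_id': 'someid'}).json()
    print(list(r.keys()))  # the review markup lives under the 'html' key
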
Example: DataFrame to csv

import requests
from bs4 import BeautifulSoup as bs
import pandas as pd

start_url = 'https://urlpart&page=1&per_page=31&product_id=someid'

authors = []
titles = []

with requests.Session() as s:
    r = s.get(start_url).json()
    soup = bs(r['html'], 'lxml')
    authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
    titles.extend([i.text for i in soup.select('.jdgm-rev__title')])
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    # accumulate results from the remaining pages into the lists
    for page in range(2, total_pages + 1):
        r = s.get(f'https://urlpart&page={page}&per_page=31&product_id=someid').json()
        soup = bs(r['html'], 'lxml')
        authors.extend([i.text for i in soup.select('.jdgm-rev__author')])
        titles.extend([i.text for i in soup.select('.jdgm-rev__title')])  # etc.

# build a DataFrame from the collected lists and write it out
headers = ['Author', 'Title']
df = pd.DataFrame(zip(authors, titles), columns=headers)
df.to_csv(r'C:\Users\User\Desktop\data.csv', sep=',', encoding='utf-8', index=False)

You can integrate your code into this. I would stick with the requests library and a Session, and you will need the loop structure. You can use your own code to extract the results for author and so on, since you will have the soup object. Consider appending to lists inside the loop so you can construct the data file at the end and write it to CSV. You can use pandas, my preference, or stick with the csv module and write out row by row (which feels a bit slower). Basically, replace my print statements with whatever code you want in order to extract the required info and build the final output.

OK, thank you. I'm new to Python, so I'll do some more research with your code and see if I can get it working. Thanks again.

No problem. Quite possibly someone will answer and bring it closer to your code; the above just shows you another way. I did notice that the total number of reviews returned is actually slightly lower than the count shown on the page, but I believe the number it returns is correct.

Could you tell me how you found the URL containing the XHR GET request?
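
As a minimal sketch of that integration, assuming the same placeholder endpoint as the answer and re-using the selectors from the question's code (the .jdgm-rev container, the data-score attribute, .jdgm-rev__title, and .jdgm-rev__body), here is the csv row-by-row variant; csv.writer quotes fields automatically, so review text containing commas won't break the output:

import csv
import requests
from bs4 import BeautifulSoup as bs

# placeholder endpoint, as in the answer above
url_template = 'https://urlpart&page={}&per_page=31&product_id=someid'

with requests.Session() as s, open('compreviews.csv', 'w', newline='', encoding='utf-8') as f:
    writer = csv.writer(f)
    writer.writerow(['Score', 'Title', 'Content'])

    # the first request also gives us the total page count
    r = s.get(url_template.format(1)).json()
    soup = bs(r['html'], 'lxml')
    total_pages = int(soup.select_one('.jdgm-paginate__last-page')['data-page'])

    for page in range(1, total_pages + 1):
        if page > 1:  # page 1 was already fetched and parsed above
            r = s.get(url_template.format(page)).json()
            soup = bs(r['html'], 'lxml')
        # selectors taken from the question's code
        for review in soup.select('.jdgm-rev'):
            user_score = review.select_one('span[data-score]')['data-score']
            user_title = review.select_one('.jdgm-rev__title').text.strip()
            user_content = review.select_one('.jdgm-rev__body').text.strip()
            writer.writerow([user_score, user_title, user_content])

Collecting into lists and building a DataFrame, as in the answer, works just as well; the row-by-row version simply writes each review as it goes.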