
Python - scraping reviews from TripAdvisor


Suppose I am scraping reviews from this url.

It does not contain the pages with the reviews that I want to scrape. So, how can I scrape the reviews from all the subsequent pages?

I used the code below, but it still only shows the reviews from the first page.

from bs4 import BeautifulSoup
import requests

URL_BASE = "https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html"
MAX_PAGES = 30

for i in range(1, MAX_PAGES):

    if i > 1:
        url = "%spage/%d/" % (URL_BASE, i)
    else:
        url = URL_BASE

    req = requests.get(url)
    statusCode = req.status_code
    if statusCode == 200:
        html = BeautifulSoup(req.text, "html.parser")
        # note: tag name must be lowercase 'p' - html.parser lowercases tags
        resultsoup = html.find_all('p', {'class': 'partial_entry'})
    else:
        break

    for review in resultsoup:
        review_list = review.get_text()
        print(review_list)
Based on the URLs the site serves, the server adds a marker to the url (in any place before .html):

  • -or5 to get the second page
  • -or10 to get the third page

and so on.

You can even skip the words (they are only there for SEO) and use just

https://www.tripadvisor.com/g562819-d289642-or5.html
https://www.tripadvisor.com/g562819-d289642-or10.html

to get the reviews of the next pages.
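
As a minimal sketch of just this URL pattern (assuming 5 reviews per page, as above; the full code below builds the same template from the original url):

URL_BASE = "https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html"

# insert "-or<offset>" before ".html"; offsets step by 5 because each page shows 5 reviews
template = URL_BASE.replace('.html', '-or{}.html')

for offset in range(0, 15, 5):  # offsets 0, 5, 10 -> the first three pages
    print(template.format(offset))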

from bs4 import BeautifulSoup
import requests
import re
#import webbrowser

def get_soup(url):

    headers = {'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0'}

    r = s.get(url, headers=headers)

    #with open('temp.html', 'wb') as f:
    #    f.write(r.content)
    #    webbrowser.open('temp.html')

    if r.status_code != 200:
        print('status code:', r.status_code)
    else:
        return BeautifulSoup(r.text, 'html.parser')

def parse(url, response):

    if not response:
        print('no response:', url)
        return

    # get number of reviews
    num_reviews = response.find('span', class_='reviews_header_count').text
    num_reviews = num_reviews[1:-1] # remove `( )`
    num_reviews = num_reviews.replace(',', '') # remove `,`
    num_reviews = int(num_reviews)
    print('num_reviews:', num_reviews, type(num_reviews))

    # create template for urls to pages with reviews
    url = url.replace('.html', '-or{}.html')
    print('template:', url)

    # load pages with reviews
    for offset in range(0, num_reviews, 5):
        print('url:', url.format(offset))
        url_ = url.format(offset)
        parse_reviews(url_, get_soup(url_))
        return # for test only - to stop after first page

def parse_reviews(url, response):
    print('review:', url)

    if not response:
        print('no response:', url)
        return

    # get every review
    for idx, review in enumerate(response.find_all('div', class_='review-container')):
        item = {
            'hotel_name': response.find('h1', class_='heading_title').text,
            'review_title': review.find('span', class_='noQuotes').text,
            'review_body': review.find('p', class_='partial_entry').text,
            'review_date': review.find('span', class_='relativeDate')['title'],#.text,#[idx],
            'num_reviews_reviewer': review.find('span', class_='badgetext').text,
            'reviewer_name': review.find('span', class_='scrname').text,
            'bubble_rating': review.select_one('div.reviewItemInline span.ui_bubble_rating')['class'][1][7:],
        }

        results.append(item) # <--- add to global list

        #~ yield item
        for key,val in item.items():
            print(key, ':', val)
        print('----')
        #return # for test only - to stop after first review


# --- main ---

s = requests.Session()

start_urls = [
    'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d102542-Reviews-Courtyard_Philadelphia_Airport-Philadelphia_Pennsylvania.html',
    #'https://www.tripadvisor.com/Hotel_Review-g60795-d122332-Reviews-The_Ritz_Carlton_Philadelphia-Philadelphia_Pennsylvania.html',
]

results = [] # <--- global list for items

for url in start_urls:
    parse(url, get_soup(url))

import pandas as pd

df = pd.DataFrame(results) # <--- convert list to DataFrame
df.to_csv('output.csv')    # <--- save in file
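
Note: the return marked "for test only" in parse() stops the crawl after the first page of reviews; remove it to fetch all the pages.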
What have you tried? The URLs you tried to create for the pages after page 1 don't seem to work...

A solution for the reviews on this page was posted a few days ago - but it may have been for scrapy or python requests; I don't remember. This page uses JavaScript to load data, and BS doesn't run JS. You may need Selenium to control a web browser that will load the page and run the JS - a minimal sketch follows below. Or use DevTools in Chrome/Firefox (tab Network -> XHR) to find the URL that the JS uses to fetch the data.
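
A minimal sketch of that Selenium approach (my own assumption, not the commenter's code; it needs a local Firefox with geckodriver available):

from bs4 import BeautifulSoup
from selenium import webdriver

url = 'https://www.tripadvisor.com/Hotel_Review-g562819-d289642-Reviews-Hotel_Caserio-Playa_del_Ingles_Maspalomas_Gran_Canaria_Canary_Islands.html'

driver = webdriver.Firefox()  # a real browser, so the page's JavaScript runs
driver.get(url)               # load the page and execute its JS
soup = BeautifulSoup(driver.page_source, 'html.parser')  # hand the rendered HTML to BS
driver.quit()

for review in soup.find_all('p', class_='partial_entry'):
    print(review.get_text())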
There is a solution for scrapy - you can read the code. And I created the solution above for requests + beautifulsoup.
The code you provided above shows perfectly how to get all the pages! How can I export or save these results to CSV?

Python has the modules csv and pandas, which work with tables. Keep all the items in a global list and, after parsing all the pages, you can save it with csv or pandas - a sketch with the csv module follows below. Or you can write with csv while parsing the reviews. BTW: if you use scrapy, then it can save to csv, XML or JSON automatically.
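
A minimal sketch of the csv variant (assuming the global results list of dicts built by parse_reviews() above):

import csv

if results:  # results is the global list filled by parse_reviews()
    with open('output.csv', 'w', newline='', encoding='utf-8') as f:
        writer = csv.DictWriter(f, fieldnames=results[0].keys())
        writer.writeheader()        # column names taken from the first item's keys
        writer.writerows(results)   # one row per review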
@Lachie I added the code that uses pandas to save to a file. @Lachie I will also put this code on GitHub -