Python 使用BeautifulSoup刮取Tripadvisor进行审阅时出现值错误
作为一个完全的新手,我正试图搜集一些Tripadvisor评论 我使用的代码来自 它对一个链接有效(尽管删除了属性“language”),但对其他链接无效(例如) 我收到错误信息:Python 使用BeautifulSoup刮取Tripadvisor进行审阅时出现值错误,python,web-scraping,beautifulsoup,valueerror,Python,Web Scraping,Beautifulsoup,Valueerror,作为一个完全的新手,我正试图搜集一些Tripadvisor评论 我使用的代码来自 它对一个链接有效(尽管删除了属性“language”),但对其他链接无效(例如) 我收到错误信息: Traceback (most recent call last): File "<pyshell#37>", line 4, in <module> items = scrape(url) File "<pyshell#13>
Traceback (most recent call last):
File "<pyshell#37>", line 4, in <module>
items = scrape(url)
File "<pyshell#13>", line 11, in scrape
items = parse(session, url + '?filterLang=' + lang)
File "<pyshell#18>", line 15, in parse
num_reviews = int(num_reviews) # convert text into integer
ValueError: invalid literal for int() with base 10: '5.695'
回溯(最近一次呼叫最后一次):
文件“”,第4行,在
items=scrape(url)
文件“”,第11行,在草稿中
items=parse(会话,url+'?filterLang='+lang)
解析中第15行的文件“”
num_reviews=int(num_reviews)#将文本转换为整数
ValueError:基数为10的int()的文本无效:“5.695”
(其中5695是页面中的评论数)
我在这里附上代码,以防有人能帮我
非常感谢你!
西尔维亚
--
在此填写完整的代码:
import requests
from bs4 import BeautifulSoup
import csv
import webbrowser
import io
def display(content, filename='output.html'):
with open(filename, 'wb') as f:
f.write(content)
webbrowser.open(filename)
def get_soup(session, url, show=False):
r = session.get(url)
if show:
display(r.content, 'temp.html')
if r.status_code != 200: # not OK
print('[get_soup] status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def post_soup(session, url, params, show=False):
'''Read HTML from server and convert to Soup'''
r = session.post(url, data=params)
if show:
display(r.content, 'temp.html')
if r.status_code != 200: # not OK
print('[post_soup] status code:', r.status_code)
else:
return BeautifulSoup(r.text, 'html.parser')
def scrape(url, lang='ALL'):
# create session to keep all cookies (etc.) between requests
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:57.0) Gecko/20100101 Firefox/57.0',
})
items = parse(session, url + '?filterLang=' + lang)
return items
def parse(session, url):
'''Get number of reviews and start getting subpages with reviews'''
print('[parse] url:', url)
soup = get_soup(session, url)
if not soup:
print('[parse] no soup:', url)
return
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '')
num_reviews = int(num_reviews) # convert text into integer
print('[parse] num_reviews ALL:', num_reviews)
url_template = url.replace('.html', '-or{}.html')
print('[parse] url_template:', url_template)
items = []
offset = 0
while(True):
subpage_url = url_template.format(offset)
subpage_items = parse_reviews(session, subpage_url)
if not subpage_items:
break
items += subpage_items
if len(subpage_items) < 5:
break
offset += 5
return items
def get_reviews_ids(soup):
items = soup.find_all('div', attrs={'data-reviewid': True})
if items:
reviews_ids = [x.attrs['data-reviewid'] for x in items][::2]
print('[get_reviews_ids] data-reviewid:', reviews_ids)
return reviews_ids
def get_more(session, reviews_ids):
url = 'https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
payload = {
'reviews': ','.join(reviews_ids), # ie. "577882734,577547902,577300887",
#'contextChoice': 'DETAIL_HR', # ???
'widgetChoice': 'EXPANDED_HOTEL_REVIEW_HSX', # ???
'haveJses': 'earlyRequireDefine,amdearly,global_error,long_lived_global,apg-Hotel_Review,apg-Hotel_Review-in,bootstrap,desktop-rooms-guests-dust-en_US,responsive-calendar-templates-dust-en_US,taevents',
'haveCsses': 'apg-Hotel_Review-in',
'Action': 'install',
}
soup = post_soup(session, url, payload)
return soup
def parse_reviews(session, url):
'''Get all reviews from one page'''
print('[parse_reviews] url:', url)
soup = get_soup(session, url)
if not soup:
print('[parse_reviews] no soup:', url)
return
hotel_name = soup.find('h1', id='HEADING').text
reviews_ids = get_reviews_ids(soup)
if not reviews_ids:
return
soup = get_more(session, reviews_ids)
if not soup:
print('[parse_reviews] no soup:', url)
return
items = []
for idx, review in enumerate(soup.find_all('div', class_='reviewSelector')):
badgets = review.find_all('span', class_='badgetext')
if len(badgets) > 0:
contributions = badgets[0].text
else:
contributions = '0'
if len(badgets) > 1:
helpful_vote = badgets[1].text
else:
helpful_vote = '0'
user_loc = review.select_one('div.userLoc strong')
if user_loc:
user_loc = user_loc.text
else:
user_loc = ''
bubble_rating = review.select_one('span.ui_bubble_rating')['class']
bubble_rating = bubble_rating[1].split('_')[-1]
item = {
'review_body': review.find('p', class_='partial_entry').text,
'review_date': review.find('span', class_='ratingDate')['title'], # 'ratingDate' instead of 'relativeDate'
}
items.append(item)
print('\n--- review ---\n')
for key,val in item.items():
print(' ', key, ':', val)
print()
return items
def write_in_csv(items, filename='results.csv',
headers=['hotel name', 'review title', 'review body',
'review date', 'contributions', 'helpful vote',
'user name' , 'user location', 'rating'],
mode='w'):
print('--- CSV ---')
with io.open(filename, mode, encoding="utf-8") as csvfile:
csv_file = csv.DictWriter(csvfile, headers)
if mode == 'w':
csv_file.writeheader()
csv_file.writerows(items)
DB_COLUMN = 'review_body'
DB_COLUMN1 = 'review_date'
start_urls = [
'https://www.tripadvisor.com/Restaurant_Review-g187823-d2101904-Reviews-Eataly_Genova-Genoa_Italian_Riviera_Liguria.html',
]
headers = [
DB_COLUMN,
DB_COLUMN1,
]
lang = 'it'
for url in start_urls:
# get all reviews for 'url' and 'lang'
items = scrape(url)
if not items:
print('No reviews')
else:
# write in CSV
filename = url.split('Reviews-')[1][:-5]
print('filename:', filename)
write_in_csv(items, filename + '.csv', headers, mode='w')
导入请求
从bs4导入BeautifulSoup
导入csv
导入网络浏览器
输入io
def显示(内容,filename='output.html'):
将open(filename,'wb')作为f:
f、 写作(内容)
webbrowser.open(文件名)
def get_soup(会话、url、show=False):
r=session.get(url)
如果显示:
显示(r.content,'temp.html')
如果r.status_代码!=200:#不好
打印(“[get_soup]状态代码:”,r.status_代码)
其他:
返回BeautifulSoup(r.text,'html.parser')
def post_汤(会话、url、参数、show=False):
''从服务器读取HTML并转换为Soup''
r=session.post(url,data=params)
如果显示:
显示(r.content,'temp.html')
如果r.status_代码!=200:#不好
打印(“[post\u soup]状态代码:”,r.status\u代码)
其他:
返回BeautifulSoup(r.text,'html.parser')
def刮取(url,lang='ALL'):
#创建会话以保留请求之间的所有cookie(等)
会话=请求。会话()
session.headers.update({
“用户代理”:“Mozilla/5.0(X11;Ubuntu;Linux x86_64;rv:57.0)Gecko/20100101 Firefox/57.0”,
})
items=parse(会话,url+'?filterLang='+lang)
退货项目
def解析(会话,url):
''获取评论数量并开始获取包含评论的子页面''
打印(“[parse]url:”,url)
soup=get_soup(会话,url)
如果不是汤:
打印(“[parse]无汤:”,url)
返回
num_reviews=soup.find('span',class='reviews_header_count').text#get text
num_reviews=num_reviews[1:-1]
num_reviews=num_reviews.replace(',','')
num_reviews=int(num_reviews)#将文本转换为整数
打印(“[parse]num\u reviews ALL:”,num\u reviews)
url_template=url.replace('.html'、'-或{}.html')
打印(“[parse]url\u模板:”,url\u模板)
项目=[]
偏移量=0
虽然(正确):
子页面url=url\u模板格式(偏移量)
子页面\u项=解析\u评论(会话,子页面\u url)
如果不是子页面_项:
打破
项目+=子页面\项目
如果len(子页面项)<5:
打破
偏移量+=5
退货项目
def get_reviews_ID(汤):
items=soup.find_all('div',attrs={'data-reviewid':True})
如有项目:
检查项目中x的ID=[x.attrs['data-reviewid'][::2]
打印(“[get_reviews_ID]data reviewid:”,reviews_ID)
退货审核单
def获取更多信息(会话、审核ID):
url='1〕https://www.tripadvisor.com/OverlayWidgetAjax?Mode=EXPANDED_HOTEL_REVIEWS_RESP&metaReferer=Hotel_Review'
有效载荷={
“评论”:“加入(评论ID)”,即“577882734577547902577300887”,
#“上下文选择”:“详细信息”???
“widgetChoice”:“扩展酒店评论”???
“haveJses”:“早期要求重新定义全局错误、长寿全局、apg-Hotel-Review、apg-Hotel-Review-in、引导、桌面-房间-客人-dust-en-US、响应式-日历-模板-dust-en-US、taevents”,
“haveCsses”:“apg-Hotel_Review-in”,
“操作”:“安装”,
}
soup=post_soup(会话、url、有效负载)
返汤
def parse_评论(会话、url):
''从一个页面获取所有评论''
打印(“[parse_reviews]url:”,url)
soup=get_soup(会话,url)
如果不是汤:
打印(“[parse_reviews]no soup:”,url)
返回
hotel_name=soup.find('h1',id='HEADING')。文本
评论\u ID=获取评论\u ID(汤)
如果没有,请检查\u ID:
返回
soup=获取更多(会话、评论\u ID)
如果不是汤:
打印(“[parse_reviews]no soup:”,url)
返回
项目=[]
对于idx,在enumerate(soup.find_all('div',class='reviewSelector')中查看:
badgets=review.find_all('span',class='badgetext')
如果len(徽章)>0:
贡献=徽章[0]。文本
其他:
贡献='0'
如果len(徽章)>1:
有帮助的投票=徽章[1]。文本
其他:
有帮助的投票='0'
user\u loc=review。选择一个('div.userLoc strong')
如果用户锁定:
user\u loc=user\u loc.text
其他:
用户位置=“”
bubble\u rating=查看。选择一个('span.ui\u bubble\u rating')['class']
泡泡评级=泡泡评级[1]。拆分(“”“)[-1]
项目={
“review_body”:review.find('p',class='partial_entry')。text,
“review_date”:review.find('span',class='ratingDate')['title'],#'ratingDate'而不是'relativeDate'
}
items.append(项目)
打印('\n---审阅---\n')
对于键,在item.items()中使用val:
打印(“”,键“”:“”,val)
打印()
退货项目
def write_in_csv(项目,filename='results.csv',
标题=['酒店名称','评审标题','评审主体',
“审核日期”、“贡献”、“有用投票”,
“用户名”、“用户位置”、“评级”],
模式class='w'):
打印('--CSV---')
将io.open(文件名、模式、编码=“utf-8”)作为csvfile:
csv_file=csv.DictWriter(csvfile,头文件)
num_reviews = num_reviews.replace('.', '')
num_reviews = int(num_reviews)
num_reviews = soup.find('span', class_='reviews_header_count').text # get text
num_reviews = num_reviews[1:-1]
num_reviews = num_reviews.replace(',', '').replace('.','')
num_reviews = int(num_reviews)