Python Selenium:在抓取网站时被拒绝访问
嗨,我正试图从 Tokopedia 网站上提取信息,特别是这个类别:https://www.tokopedia.com/p/handphone-tablet/handphone?page=1 。但当我访问该类别下的具体商品页面时,会看到一个空白页面,其中包含以下消息:"Access Denied — 您无权访问此服务器上的该商品链接。参考 #18.60fe5e6f.1622187209.55a7d604"。因此,数据框中的某些项是空的。这是我的代码:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium.webdriver.chrome.options import Options
import pandas as pd
import requests
from fake_useragent import UserAgent
def getTokopediaContent(link):
    """Scrape one Tokopedia product page and return its details.

    Parameters
    ----------
    link : str
        Absolute URL of the product page.

    Returns
    -------
    dict
        Keys 'Description', 'Price', 'Specification', 'Rating', 'Merchant',
        each mapping to a list of scraped text values (possibly empty when
        the page layout does not match the expected selectors).
    """
    ua = UserAgent()
    user_agent = ua.random
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument('--no-sandbox')
    options.add_argument(f'user-agent={user_agent}')
    # BUG FIX: the original code created this configured driver and then
    # immediately overwrote it with a bare `webdriver.Chrome()`, throwing away
    # the randomized user-agent. The default Selenium fingerprint is what
    # triggered the site's "Access Denied" page. Create the driver once, with
    # the options applied.
    driver = webdriver.Chrome(options=options)
    try:
        print(link)
        driver.get(link)
        time.sleep(2)  # allow the page to start rendering

        # Scroll one screen height at a time so lazily-loaded content appears.
        scroll_pause_time = 1
        screen_height = driver.execute_script("return window.screen.height;")
        i = 1
        while True:
            driver.execute_script(
                "window.scrollTo(0, {screen_height}*{i});".format(
                    screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # Re-read the total height: it can grow as content lazy-loads.
            scroll_height = driver.execute_script(
                "return document.body.scrollHeight;")
            if screen_height * i > scroll_height:
                break

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser process, even if scraping fails.
        driver.quit()

    # Product description (data-testid is more stable than CSS class names).
    product_desc = [
        a.get_text(separator=" ").strip()
        for a in soup.find_all('div', {'data-testid': 'lblPDPDescriptionProduk'})
    ]

    # Price — NOTE(review): these css-* class names are build artifacts and
    # may change whenever Tokopedia redeploys; verify selectors still match.
    price = [
        b.text.strip()
        for a in soup.find_all('div', {'class': 'css-4u82jy'})
        for b in a.find_all('ul', {'class': 'css-o5uqvq'})
    ]

    # Specification list ("product description 1" in the original code).
    description = [
        b.get_text(separator=" ").strip()
        for a in soup.find_all('div', {'role': 'tabpanel'})
        for b in a.find_all('ul', {'class': 'css-1ijyj3z e1iszlzh2'})
    ]

    # Rating number.
    rating = [
        b.text.strip()
        for a in soup.find_all('div', {'class': 'css-gbp2g2'})
        for b in a.find_all('span', {'data-testid': 'lblPDPDetailProductRatingNumber'})
    ]

    # Merchant (shop) name.
    merchant = [
        b.text.strip()
        for a in soup.find_all('a', {'data-testid': 'llbPDPFooterShopName'})
        for b in a.find_all('h2')
    ]

    return {
        'Description': product_desc,
        'Price': price,
        'Specification': description,
        'Rating': rating,
        'Merchant': merchant,
    }
知道发生了什么吗?我能做些什么来修复它?
def getTokopedia():
    """Scrape the Tokopedia handphone category listing page.

    Collects product names, image links, and then visits each product page
    via ``getTokopediaContent`` for details.

    Returns
    -------
    pandas.DataFrame
        Columns: Product, Image Link, Description, Price, Specification,
        Rating, Merchant. Per-product detail columns hold lists of scraped
        strings.
    """
    base_url = "https://www.tokopedia.com/p/handphone-tablet/handphone?page=1"
    ua = UserAgent()
    user_agent = ua.random
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument('--no-sandbox')
    options.add_argument(f'user-agent={user_agent}')
    # BUG FIX: the original created this configured driver and then replaced
    # it on the very next line with a bare `webdriver.Chrome()`, losing the
    # randomized user-agent and causing the server's "Access Denied" response
    # on subsequent requests. Create the configured driver exactly once.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(base_url)
        time.sleep(2)  # allow the page to start rendering

        # Scroll one screen height at a time so lazily-loaded products appear.
        scroll_pause_time = 1
        screen_height = driver.execute_script("return window.screen.height;")
        i = 1
        while True:
            driver.execute_script(
                "window.scrollTo(0, {screen_height}*{i});".format(
                    screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # Re-read total height: it grows as more products lazy-load.
            scroll_height = driver.execute_script(
                "return document.body.scrollHeight;")
            if screen_height * i > scroll_height:
                break

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser process, even on error.
        driver.quit()

    # Product names — NOTE(review): css-* class names are build artifacts and
    # may change on any Tokopedia redeploy; verify selectors still match.
    product = [
        b.text.strip()
        for a in soup.find_all('div', {'class': 'css-11s9vse'})
        for b in a.find_all('span', {'class': 'css-1bjwylw'})
    ]

    # Image links.
    image_link = [
        b.attrs['src'].strip()
        for a in soup.find_all('div', {'class': 'css-jo3xxj'})
        for b in a.find_all('img')
    ]

    tokped = {
        'Product': product,
        'Image Link': image_link,
        'Description': [],
        'Price': [],
        'Specification': [],
        'Rating': [],
        'Merchant': [],
    }

    # Visit every product page and collect its details.
    for anchor in soup.find_all('a', attrs={'class': 'css-89jnbj'}):
        # Resolve against the category URL so relative hrefs also work
        # (urljoin is a no-op for absolute URLs).
        product_link = urljoin(base_url, anchor['href'])
        details = getTokopediaContent(product_link)
        tokped['Description'].append(details['Description'])
        tokped['Price'].append(details['Price'])
        tokped['Specification'].append(details['Specification'])
        tokped['Rating'].append(details['Rating'])
        tokped['Merchant'].append(details['Merchant'])

    # Build the DataFrame directly from the collected dict (the empty
    # DataFrame the original code created first was dead code).
    # NOTE(review): pandas requires all columns to have equal lengths here;
    # if a selector misses on some pages the lists can diverge — verify.
    return pd.DataFrame.from_dict(tokped)