Python Selenium:在抓取网站时被拒绝访问
嗨,我正试图从 Tokopedia 网站上提取信息,特别是这个类别:https://www.tokopedia.com/p/handphone-tablet/handphone?page=1 。但当我访问该类别下的具体商品页面时,会看到一个空白页面,其中包含以下消息:"Access Denied — 您无权访问此服务器上的该商品链接。参考 #18.60fe5e6f.1622187209.55a7d604"。因此,数据框中的某些项是空的。这是我的代码:
import time
from selenium import webdriver
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from selenium.webdriver.chrome.options import Options
import pandas as pd
import requests
from fake_useragent import UserAgent
def getTokopediaContent(link):
    """Scrape one Tokopedia product page and return its details.

    Parameters
    ----------
    link : str
        Absolute URL of the product page.

    Returns
    -------
    dict
        Keys 'Description', 'Price', 'Specification', 'Rating', 'Merchant',
        each mapping to a list of scraped text values (possibly empty when
        the page layout does not match the expected selectors).
    """
    ua = UserAgent()
    user_agent = ua.random
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument('--no-sandbox')
    options.add_argument(f'user-agent={user_agent}')
    # BUG FIX: the original code created this configured driver and then
    # immediately overwrote it with a bare `webdriver.Chrome()`, throwing away
    # the randomized user-agent. The default Selenium fingerprint is what
    # triggered the site's "Access Denied" page. Create the driver once, with
    # the options applied.
    driver = webdriver.Chrome(options=options)
    try:
        print(link)
        driver.get(link)
        time.sleep(2)  # allow the page to start rendering

        # Scroll one screen height at a time so lazily-loaded content appears.
        scroll_pause_time = 1
        screen_height = driver.execute_script("return window.screen.height;")
        i = 1
        while True:
            driver.execute_script(
                "window.scrollTo(0, {screen_height}*{i});".format(
                    screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # Re-read the total height: it can grow as content lazy-loads.
            scroll_height = driver.execute_script(
                "return document.body.scrollHeight;")
            if screen_height * i > scroll_height:
                break

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser process, even if scraping fails.
        driver.quit()

    # Product description (data-testid is more stable than CSS class names).
    product_desc = [
        a.get_text(separator=" ").strip()
        for a in soup.find_all('div', {'data-testid': 'lblPDPDescriptionProduk'})
    ]

    # Price — NOTE(review): these css-* class names are build artifacts and
    # may change whenever Tokopedia redeploys; verify selectors still match.
    price = [
        b.text.strip()
        for a in soup.find_all('div', {'class': 'css-4u82jy'})
        for b in a.find_all('ul', {'class': 'css-o5uqvq'})
    ]

    # Specification list ("product description 1" in the original code).
    description = [
        b.get_text(separator=" ").strip()
        for a in soup.find_all('div', {'role': 'tabpanel'})
        for b in a.find_all('ul', {'class': 'css-1ijyj3z e1iszlzh2'})
    ]

    # Rating number.
    rating = [
        b.text.strip()
        for a in soup.find_all('div', {'class': 'css-gbp2g2'})
        for b in a.find_all('span', {'data-testid': 'lblPDPDetailProductRatingNumber'})
    ]

    # Merchant (shop) name.
    merchant = [
        b.text.strip()
        for a in soup.find_all('a', {'data-testid': 'llbPDPFooterShopName'})
        for b in a.find_all('h2')
    ]

    return {
        'Description': product_desc,
        'Price': price,
        'Specification': description,
        'Rating': rating,
        'Merchant': merchant,
    }
知道发生了什么吗?我能做些什么来修复它?
def getTokopedia():
    """Scrape the Tokopedia handphone category listing page.

    Collects product names, image links, and then visits each product page
    via ``getTokopediaContent`` for details.

    Returns
    -------
    pandas.DataFrame
        Columns: Product, Image Link, Description, Price, Specification,
        Rating, Merchant. Per-product detail columns hold lists of scraped
        strings.
    """
    base_url = "https://www.tokopedia.com/p/handphone-tablet/handphone?page=1"
    ua = UserAgent()
    user_agent = ua.random
    options = Options()
    options.add_argument("--window-size=1920,1080")
    options.add_argument('--no-sandbox')
    options.add_argument(f'user-agent={user_agent}')
    # BUG FIX: the original created this configured driver and then replaced
    # it on the very next line with a bare `webdriver.Chrome()`, losing the
    # randomized user-agent and causing the server's "Access Denied" response
    # on subsequent requests. Create the configured driver exactly once.
    driver = webdriver.Chrome(options=options)
    try:
        driver.get(base_url)
        time.sleep(2)  # allow the page to start rendering

        # Scroll one screen height at a time so lazily-loaded products appear.
        scroll_pause_time = 1
        screen_height = driver.execute_script("return window.screen.height;")
        i = 1
        while True:
            driver.execute_script(
                "window.scrollTo(0, {screen_height}*{i});".format(
                    screen_height=screen_height, i=i))
            i += 1
            time.sleep(scroll_pause_time)
            # Re-read total height: it grows as more products lazy-load.
            scroll_height = driver.execute_script(
                "return document.body.scrollHeight;")
            if screen_height * i > scroll_height:
                break

        soup = BeautifulSoup(driver.page_source, "html.parser")
    finally:
        # Always release the browser process, even on error.
        driver.quit()

    # Product names — NOTE(review): css-* class names are build artifacts and
    # may change on any Tokopedia redeploy; verify selectors still match.
    product = [
        b.text.strip()
        for a in soup.find_all('div', {'class': 'css-11s9vse'})
        for b in a.find_all('span', {'class': 'css-1bjwylw'})
    ]

    # Image links.
    image_link = [
        b.attrs['src'].strip()
        for a in soup.find_all('div', {'class': 'css-jo3xxj'})
        for b in a.find_all('img')
    ]

    tokped = {
        'Product': product,
        'Image Link': image_link,
        'Description': [],
        'Price': [],
        'Specification': [],
        'Rating': [],
        'Merchant': [],
    }

    # Visit every product page and collect its details.
    for anchor in soup.find_all('a', attrs={'class': 'css-89jnbj'}):
        # Resolve against the category URL so relative hrefs also work
        # (urljoin is a no-op for absolute URLs).
        product_link = urljoin(base_url, anchor['href'])
        details = getTokopediaContent(product_link)
        tokped['Description'].append(details['Description'])
        tokped['Price'].append(details['Price'])
        tokped['Specification'].append(details['Specification'])
        tokped['Rating'].append(details['Rating'])
        tokped['Merchant'].append(details['Merchant'])

    # Build the DataFrame directly from the collected dict (the empty
    # DataFrame the original code created first was dead code).
    # NOTE(review): pandas requires all columns to have equal lengths here;
    # if a selector misses on some pages the lists can diverge — verify.
    return pd.DataFrame.from_dict(tokped)