Python selenium.common.exceptions.TimeoutException-谷歌图像抓取_Python_Selenium_Web Scraping

Python selenium.common.exceptions.TimeoutException-谷歌图像抓取

python selenium web-scraping

Python selenium.common.exceptions.TimeoutException-谷歌图像抓取,python,selenium,web-scraping,Python,Selenium,Web Scraping,我正在使用这段代码从谷歌图像搜索结果中抓取一些图像。但是，有一个 try-except 块，其中我的代码给出了此错误： selenium.common.exceptions.TimeoutException D:\Downloads\google_images_downloader-master\google_images_downloader-master>[19680:952:0326/213123.951:ERROR:ssl_client_socket_impl.cc(941)] handshake

我正在使用这段代码从谷歌图像搜索结果中抓取一些图像。但是，有一个

try-except

块，其中我的代码给出了此错误：

selenium.common.exceptions.TimeoutException

D:\Downloads\google_images_downloader-master\google_images_downloader-master>[19680:952:0326/213123.951:ERROR:ssl_client_socket_impl.cc(941)] handshake failed; returned -1, SSL error code 1, net_error -100

以下是代码块：

number_of_scrolls = int((num_requested / 400) + 10)
for _round in range(number_of_scrolls):
    # Scroll in ten short bursts so the page can load further results.
    for _burst in range(10):
        driver.execute_script("window.scrollBy(0, 1000000)")
        time.sleep(0.2)
    time.sleep(0.5)
    try:
        # Wait up to 10 seconds for the "Show more results" button to become
        # clickable, then press it.
        show_more = WebDriverWait(driver, 10).until(
            EC.element_to_be_clickable(
                (By.XPATH, "//input[@value='Show more results']")
            )
        )
        show_more.click()
    except Exception as e:
        # Any failure (including the wait timing out) ends the scrolling.
        print("Less images found:", e)
        break

另外，我正在使用Chrome作为我的网络驱动程序。

你确定你遇到了这个问题吗？因为我从你提供的链接复制了相同的代码，它在我这里运行正常，只是我做了一些改动。

我用的不是firefox，而是chrome

无法在

https://www.google.com/search?q=Poecile%20montanus%20bird&source=lnms&tbm=isch

。出现语法错误：由于 driver.find_element_by_xpath("//input[@value='Więcej wyników']").click() 中包含非ASCII字符，报出“Non-ASCII character '\xc4'”。为避免此错误，我已将xpath更改为

driver.find_element_by_xpath("//input[@value='W']").click()

，因此我们得到

('Less images found:', NoSuchElementException())

。请查看屏幕截图以了解更多详细信息

from selenium import webdriver
import os
import json
import urllib3
import time
import shutil

# Search queries to scrape, one Google Images search per entry.
# Entries that are commented out are skipped.
searched_test_array = [
    # "Parus major bird",
    "Poecile montanus bird",
    # "Carduelis flammea bird",
    # "Parus cristatus bird",
    # "Carduelis spinus bird",
    # "Turdus iliacus bird",
    # "Dryocopus martius bird",
    # "Dendrocopos major bird",
    # "Picus canus bird",
    # "Picus viridis bird",
    # "Dendrocopos medius bird",
    # "Dendrocopos minor bird",    #NOK
    # "Carduelis chloris bird",
    # "Pyrrhula pyrrhula bird",
    # "Columba livia bird",
    "Coccothraustes coccothraustes bird",
    # "Carduelis cannabina bird",
    # "Passer montanus bird",
    # "Larus canus bird",
    # "Larus argentatus bird",
    "Parus caeruleus bird",
    # "Regulus regulus bird",
    # "Buteo buteo bird",
    # "Certhia familiaris bird",
    # "Certhia brachydactyla bird",
    # "Emberiza calandra bird bird",
    # "Corvus frugilegus bird",
    # "Accipiter gentilis bird",
    # "Bombycilla garrulus bird",
    # "Fringilla montifringilla bird",
    # "Corvus monedula bird",
    # "Turdus merula bird",
    # "Sitta europaea bird",
    # "Accipiter nisus bird",
    # "Corvus corax bird",
    # "Turdus pilaris bird",
    # "Emberiza schoeniclus bird",
    # "Aegithalos caudatus bird",
    # "Erithacus rubicola bird",
    # "Carduelis flavirostris bird",
    # "Streptopelia decaocto bird",
    # "Parus palustris bird",
    # "Parus ater bird",
    # "Pica pica bird",
    # "Lanius excubitor bird",
    # "Troglodytes troglodytes bird",
    "Carduelis carduelis bird",
    # "Sturnus vulgaris bird",

    # "Garrulus glandarius bird",
    # "Emberiza citrinella bird",

    # "Corvus corone bird",
    # "Passer domesticus bird",
    # "Panurus biarmicus bird",
    # "Fringilla coelebs bird",
    # "Larus ridibundus bird"
]

# Upper bound on the number of images saved per search term.
num_requested = 1000

# Append the current working directory to PATH so that executables placed
# there can be located.
os.environ["PATH"] = os.environ["PATH"] + os.pathsep + os.getcwd()
# Root folder for all downloads, directly below the working directory.
download_path = "/".join((os.getcwd(), "Downloads"))


def main():
    """Download Google Images results for every query in ``searched_test_array``.

    For each search term this creates a sub-directory of ``download_path``
    (spaces replaced by underscores), opens the image-search page in Chrome,
    scrolls to load more results, and saves up to ``num_requested`` images
    as ``<index>.<extension>``.

    Reads the module-level names ``searched_test_array``, ``download_path``
    and ``num_requested``. Returns nothing.
    """
    print ("Scrapping started")

    # Create the download root if it is missing; an existing one is reused.
    if not os.path.exists(download_path):
        os.makedirs(download_path)

    extensions = {"jpg", "jpeg", "png", "gif"}
    # One connection pool serves every download instead of one per image.
    http = urllib3.PoolManager()

    for searchtext in searched_test_array:

        # One folder per search term; an existing folder is reused as is.
        searchedTextDir = os.path.join(download_path, searchtext.replace(" ", "_"))
        if not os.path.exists(searchedTextDir):
            os.makedirs(searchedTextDir)

        # Build the image-search URL for this term.
        url = "https://www.google.com/search?q=" + searchtext + "&source=lnms&tbm=isch"

        # Start Chrome and make sure it is closed again even if scraping fails.
        driver = webdriver.Chrome(executable_path=r"C:\New folder\chromedriver.exe")
        try:
            driver.get(url)

            img_count = 0
            downloaded_img_count = 0

            # Heuristic number of scroll rounds derived from the requested count.
            number_of_scrolls = int((num_requested / 400) + 10)
            for _ in range(number_of_scrolls):
                for __ in range(10):
                    # Keep scrolling so the page can load more image data.
                    driver.execute_script("window.scrollBy(0, 1000000)")
                    time.sleep(0.2)
                # Pause before requesting the next batch of results.
                time.sleep(0.5)
                try:
                    # Press the "more results" button at the bottom of the page.
                    # The label is locale-specific; for the English version use
                    # //input[@value='Show more results'].
                    driver.find_element_by_xpath("//input[@value='W']").click()
                except Exception as e:
                    # Best effort: if the button cannot be clicked, stop scrolling
                    # and continue with whatever has been loaded.
                    print ("Less images found:", e)
                    break

            # Collect the metadata elements of all images on the page.
            imges = driver.find_elements_by_xpath('//div[contains(@class,"rg_meta")]')
            print ("Total images:", len(imges), "\n")

            for img in imges:
                img_count += 1
                # Parse the embedded JSON once and read both fields from it.
                meta = json.loads(img.get_attribute('innerHTML'))
                img_url = meta["ou"]
                img_type = meta["ity"]
                print ("Downloading image", img_count, ": ", img_url, img_type)
                try:
                    # Fall back to jpg for unknown or missing extensions.
                    if img_type not in extensions:
                        img_type = "jpg"

                    # The timeout keeps a single slow host from stalling the run.
                    response = http.request('GET', img_url, timeout=2)
                    target = searchedTextDir + "/" + str(downloaded_img_count) + "." + img_type
                    # The context manager guarantees the file is closed.
                    with open(target, "wb") as f:
                        f.write(response.data)
                    downloaded_img_count += 1
                except Exception as e:
                    print ("Download failed:", e)
                if downloaded_img_count >= num_requested:
                    break

            print ("Total downloaded: ", downloaded_img_count, "/", img_count)
        finally:
            driver.quit()
        time.sleep(0.5)

    print ("Scrapping done")


if __name__ == "__main__":
    main()

点击显示更多

显示更多结果

# Add this loop:
for _ in range(number_of_scrolls):
    for __ in range(10):
        # And scroll scroll scroll to let Google Json load  images
        driver.execute_script("window.scrollBy(0, 1000000)")
        time.sleep(0.2)
    # to load next 400 images
    time.sleep(0.5)
    # Click the button that is located by its class attribute.
    driver.find_element_by_xpath("//input[@class='mye4qd']").click()
    # Wait before the next scroll round.
    time.sleep(5)

您无法通过以下方式捕获TimeoutException：

except Exception as e:

你应使用：

except TimeoutException:

您还必须从Selenium导入该异常：

from selenium.common.exceptions import TimeoutException

对我来说，英文版必须使用

find_element_by_xpath("//input[@value='Show more results']")。此外，在屏幕截图中，我看到所有情况下“下载的图像总数”都显示为0/0。错误消息：元素不可交互
，这就是为什么它会得到0/0图像。正如屏幕截图和输出中所示，你得到0张图像，是因为你在代码中使用的定位器匹配到0个元素。你需要确保传入的是正确的定位器。git上的程序使用的是旧定位器，现在已经失效。

知道如何让用户获得更新后的正确定位器吗？我对Selenium没有太多经验。

不确定，但我认为可能值得向该git仓库的所有者核实到底发生了什么变化。