Python 浏览谷歌图片不会产生第一个图片结果
我编写了这个webscraper脚本,它可以抓取谷歌图片(有一些在线帮助)。这是:Python 浏览谷歌图片不会产生第一个图片结果,python,web-scraping,google-image-search,Python,Web Scraping,Google Image Search,我编写了这个webscraper脚本,它可以抓取谷歌图片(有一些在线帮助)。这是: import os import requests from bs4 import BeautifulSoup import csv # Base URL for Google Search google_image = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
import os
import requests
from bs4 import BeautifulSoup
import csv
# Base URL for Google Search
google_image = 'https://www.google.com/search?site=&tbm=isch&source=hp&biw=1873&bih=990&'
# CSV Directory
csv_dir = '../main/data/activities.csv'
# Neccesary strings for Python to access browser network
usr_agent = {
'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 '
'Safari/537.11',
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
'Accept-Encoding': 'none',
'Accept-Language': 'en-US,en;q=0.8',
'Connection': 'keep-alive',
}
def description_to_query(activity_description):
line = activity_description.lower()
line = line.replace(' ', '+')
return line
def download_images(activity_description):
"""Takes as input the activity description and writes corresponding Google Images result to activitiy_img folder,
adding that path to the activities csv as well """
# Image directory
img_dir = '../main/data/activitiy_img'
if not os.path.exists(img_dir):
os.mkdir(img_dir)
# Create URL for web
searchurl = google_image + 'q=' + description_to_query(activity_description)
print(f'{activity_description}: {searchurl}')
# Get content from URL
response = requests.get(searchurl, headers=usr_agent)
# Find all divs containing images
soup = BeautifulSoup(response.text, 'html.parser')
results = soup.findAll('img', {'class': 'rg_i Q4LuWd'})
# Unpacking div and retrieving data-src content. If key not found, continue.
link = ''
for res in results:
try:
link = res['data-src']
except KeyError:
continue
# Getting image data from just retrieved data-src and declaring img_name based on dir and description
response = requests.get(link)
img_path = img_dir + '/' + activity_description + '.jpg'
# Writing file
with open(img_path, 'wb') as img:
img.write(response.content)
print(f'Downloading image {img_path}...')
# Updating activities CSV file
write_img_to_csv(activity_description, img_path)
def write_img_to_csv(activity_description, img_path):
"""Writes image path to CSV line corresponding with activity description"""
# Reading CSV and copying to directory
csv_read = csv.reader(open(csv_dir, newline=''))
lines = list(csv_read)
# Changing CSV values
for row in lines:
if row[1] == activity_description.lower():
# Converting to int to add, then back to string to store updated value in CSV
row[3] = img_path
print(f'{img_path} added to CSV')
# Changing CSV file to locally changed CSV
csv_write = csv.writer(open(csv_dir, 'w', newline=''))
csv_write.writerows(lines)
现在,问题是,它很有效(万岁!),但它似乎不是第一个结果(通常是最好的结果),而是一个非常“晚”的结果,可能与搜索描述活动描述
相差很远,而且往往是非常低的分辨率
我想知道为什么会这样。我已经检查了Google Images HTML源代码,用于识别图像类的dict{'class':'rg_I Q4LuWd'}
似乎也存在于第一个图像中。我假设soup.findAll
最初会找到第一个结果,但我可能错了,我想知道我是不是错了,如果不是,错在哪里
提前谢谢