Python 网页抓取（Web Scraping）
大家好，我是 Python 初学者，想做一个小工具来练手。我写了下面这段小代码，用来抓取页面并筛选其中的 img 标签：
"""Search themoviedb.org for a movie and print every <img> tag on the results page."""
import requests
from bs4 import BeautifulSoup

t = input('Digite o Nome do Filme:')
ano = int(input('Digite o Ano do Filme:'))

# Year sentinel 1 means "no year filter"; otherwise TMDB understands the
# "title y:year" search syntax.
if ano == 1:
    query = t
else:
    query = '{} y:{}'.format(t, ano)

# Passing `params` lets requests URL-encode the title (spaces, accents)
# instead of concatenating it raw into the URL.
req = requests.get('https://www.themoviedb.org/search',
                   params={'query': query, 'language': 'pt-BR'})
bs = BeautifulSoup(req.text, 'lxml')
print(bs.find_all('img'))
然后我做了另一个部分,获取图像链接并将其显示在控制台上
"""Stream one image URL into a spooled temp buffer with progress output,
save it as ./image.jpg, and display it with matplotlib."""
import io
import os
import requests
import tempfile
from PIL import Image
from matplotlib import pyplot as plt

img_url = 'https://image.tmdb.org/t/p/w500_and_h282_face/dKxkwAJfGuznW8Hu0mhaDJtna0n.jpg'

# SpooledTemporaryFile keeps small downloads in RAM and spills to disk past
# max_size; use an int (1e9 is a float).
buffer = tempfile.SpooledTemporaryFile(max_size=10**9)
r = requests.get(img_url, stream=True)
if r.status_code == 200:
    downloaded = 0
    # Content-Length can be absent on chunked responses; default to 0 so we
    # just skip the progress ratio instead of raising KeyError.
    filesize = int(r.headers.get('content-length', 0))
    # Default iter_content() yields one byte per chunk — painfully slow.
    for chunk in r.iter_content(chunk_size=8192):
        downloaded += len(chunk)
        buffer.write(chunk)
        if filesize:
            print(downloaded / filesize)
    buffer.seek(0)
    i = Image.open(io.BytesIO(buffer.read()))
    i.save(os.path.join('.', 'image.jpg'), quality=85)
    # Only display when the download actually succeeded; otherwise `i` would
    # be undefined and this crashed with NameError.
    plt.imshow(i)
    plt.show()
buffer.close()
因此,我想知道如何使img_url变量自动获取打印url(bs.find_all('img'))。或者如果有库的话。我在你的第一个代码中做了一个修改,现在你可以使用“poster fade lazyautosizes lazyloaded”类访问每个标签img的链接。我建议你把第二个代码放进去 函数,然后在此脚本中调用它
"""Search themoviedb.org and hand each poster image URL on the results page
to myfunctionmatplotlib (defined by the separate matplotlib display script)."""
import requests
from bs4 import BeautifulSoup

t = input('Digite o Nome do Filme:')
ano = int(input('Digite o Ano do Filme:'))

# Year sentinel 1 means "search without a year filter".
query = t if ano == 1 else '{} y:{}'.format(t, ano)
# `params` URL-encodes the title for us; both branches previously duplicated
# the same scrape-and-print body.
req = requests.get('https://www.themoviedb.org/search',
                   params={'query': query, 'language': 'pt-BR'})
bs = BeautifulSoup(req.text, 'lxml')

#elements=bs.find_all('img',class_="fade lazyautosizes lazyloaded")
# Poster <img> tags sit under div.image_content; the real URL is in data-src
# because the site lazy-loads its images.
for element in bs.select('div.image_content > a > img'):
    print("LINK")
    print(element['data-src'])
    # myfunctionmatplotlib must be defined before this runs — TODO confirm
    # it wraps the matplotlib display script.
    myfunctionmatplotlib(element['data-src'])
我在你的第一个代码中做了一个修改,现在你可以使用“poster fade lazyautosizes lazyloaded”类访问每个标签img的链接。我建议你把第二个代码放进去 函数,然后在此脚本中调用它
# Search TMDB for a movie title (optionally filtered by year) and pass each
# poster image URL found on the results page to myfunctionmatplotlib.
import requests
from bs4 import BeautifulSoup

t = input('Digite o Nome do Filme:')
ano = int(input('Digite o Ano do Filme:'))

base = 'https://www.themoviedb.org/search?query='
if ano == 1:
    url = base + t + '&language=pt-BR'
else:
    # %20y%3A is the URL-encoded " y:" year-filter syntax.
    url = base + t + '%20y%3A' + str(ano) + '&language=pt-BR'

soup = BeautifulSoup(requests.get(url).text, 'lxml')
#elements=soup.find_all('img',class_="fade lazyautosizes lazyloaded")
for img in soup.select('div.image_content > a > img'):
    print("LINK")
    print(img['data-src'])
    # Here should put a function for pass the url to script matplotlib
    myfunctionmatplotlib(img['data-src'])
我编写了这段新代码,您可以尝试下载URL的图像。我还编写了一个代码,可以从关键字中获取图像的URL,如果这能解决您的问题,我可以与您共享该代码:
""" Download image according to given urls and automatically rename them in order. """
# -*- coding: utf-8 -*-
from __future__ import print_function
import shutil
import imghdr
import os
import concurrent.futures
import requests
# Browser-like request headers so image hosts serve the scraper like a
# normal desktop browser instead of rejecting it as a bot.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Proxy-Connection": "keep-alive",
# The two adjacent string literals below are implicitly concatenated into
# one Chrome User-Agent value.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
"Accept-Encoding": "gzip, deflate, sdch",
# 'Connection': 'close',
}
def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
    """Download one image to dst_dir/file_name, retrying up to 3 times.

    On success the file is renamed to carry the extension reported by
    imghdr; anything not recognised as an image is deleted again.

    :param image_url: URL of the image to fetch
    :param dst_dir: destination directory (must already exist)
    :param file_name: file name (without extension) for the download
    :param timeout: per-request timeout in seconds
    :param proxy_type: proxy scheme ('http', 'socks5', ...) or None for direct
    :param proxy: 'host:port' address, used only when proxy_type is given
    """
    proxies = None
    if proxy_type is not None:
        proxies = {
            "http": proxy_type + "://" + proxy,
            "https": proxy_type + "://" + proxy,
        }
    response = None
    file_path = os.path.join(dst_dir, file_name)
    try_times = 0
    while True:
        try:
            try_times += 1
            response = requests.get(
                image_url, headers=headers, timeout=timeout, proxies=proxies)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            response.close()
            # imghdr reports 'jpeg' (never 'jpg'); 'jpg' is kept for safety.
            file_type = imghdr.what(file_path)
            if file_type in ["jpg", "jpeg", "png", "bmp"]:
                new_file_name = "{}.{}".format(file_name, file_type)
                new_file_path = os.path.join(dst_dir, new_file_name)
                shutil.move(file_path, new_file_path)
                print("## OK: {} {}".format(new_file_name, image_url))
            else:
                os.remove(file_path)
                print("## Err: {}".format(image_url))
            break
        except Exception as e:
            if try_times < 3:
                continue
            # `if response:` would skip close() on 4xx/5xx replies, because
            # requests.Response truthiness reflects the status code — test
            # explicitly against None instead.
            if response is not None:
                response.close()
            print("## Fail: {} {}".format(image_url, e.args))
            break
def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
    """
    Download image according to given urls and automatically rename them in order.
    :param image_urls: list of image urls
    :param dst_dir: output the downloaded images to dst_dir (created if missing)
    :param file_prefix: if set to "img", files will be in format "img_xxx.jpg"
    :param concurrency: number of requests processed simultaneously
    :param timeout: per-request timeout in seconds, forwarded to download_image
    :param proxy_type: proxy scheme or None, forwarded to download_image
    :param proxy: 'host:port' proxy address, forwarded to download_image
    :return: none
    """
    # exist_ok avoids the exists()/makedirs() race of the check-then-create idiom.
    os.makedirs(dst_dir, exist_ok=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        # enumerate replaces the hand-maintained counter; names come out as
        # img_0000, img_0001, ...
        future_list = [
            executor.submit(download_image, image_url, dst_dir,
                            "{}_{:04d}".format(file_prefix, count),
                            timeout, proxy_type, proxy)
            for count, image_url in enumerate(image_urls)
        ]
        # One generous global deadline for the whole batch.
        concurrent.futures.wait(future_list, timeout=180)
"""根据给定的 URL 下载图像并按顺序自动重命名。"""
#-*-编码:utf-8-*-
来自未来导入打印功能
进口舒蒂尔
导入imghdr
导入操作系统
进口期货
导入请求
标题={
“接受”:“text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8”,
“代理连接”:“保持活动状态”,
“用户代理”:“Mozilla/5.0(Windows NT 10.0;Win64;x64)”
“AppleWebKit/537.36(KHTML,像Gecko)Chrome/54.0.2840.99 Safari/537.36”,
“接受编码”:“gzip、deflate、sdch”,
#“连接”:“关闭”,
}
def下载图像(图像url、dst目录、文件名、超时=20、代理类型=None、代理=None):
代理=无
如果代理类型不是“无”:
代理={
“http”:proxy_type+”:/“+proxy,
“https”:代理类型+”:/“+代理
}
响应=无
file_path=os.path.join(dst_目录,文件名)
尝试次数=0
尽管如此:
尝试:
尝试次数+=1
response=requests.get(
图像(url,标题=标题,超时=超时,代理=代理)
打开(文件路径“wb”)作为f:
f、 写(response.content)
答复:close()
文件类型=imghdr.what(文件路径)
#如果文件类型不是“无”:
如果文件\输入[“jpg”、“jpeg”、“png”、“bmp”]:
新建文件名=“{}.{}”。格式(文件名,文件类型)
new_file_path=os.path.join(dst_dir,new_file_name)
移动(文件路径、新文件路径)
打印(“##确定:{}{}”。格式(新文件名、图像url))
其他:
删除(文件路径)
打印(“##错误:{}”。格式(图像url))
打破
例外情况除外,如e:
如果重试次数小于3次:
持续
如果回答:
答复:close()
打印(“##失败:{}{}”。格式(图像url,e.args))
打破
def下载图像(图像URL、dst目录、文件前缀=“img”、并发性=50、超时=20、代理类型=None、代理=None):
"""
根据给定的URL下载图像,并按顺序自动重命名它们。
:参数超时:
:参数代理:
:参数代理类型:
:param image_url:图像URL列表
:param dst_dir:将下载的图像输出到dst_dir
:param file_prefix:如果设置为“img”,则文件的格式为“img_xxx.jpg”
:param concurrency:同时处理的请求数
:返回:无
"""
以concurrent.futures.ThreadPoolExecutor(max_workers=concurrency)作为执行器:
未来列表=列表()
计数=0
如果操作系统路径不存在(dst_dir):
操作系统生成目录(dst_目录)
对于图像url中的图像url:
文件名=文件前缀+“\u”+%04d”%count
未来列表。追加(executor.submit(
下载(图像、图像url、dst目录、文件名、超时、代理类型、代理))
计数+=1
concurrent.futures.wait(未来列表,超时=180)
希望这对您有所帮助=)我编写了这段新代码,您可以尝试下载您拥有的URL图像。我还编写了一个代码,可以从关键字中获取图像的URL,如果这能解决您的问题,我可以与您共享该代码:
""" Download image according to given urls and automatically rename them in order. """
# -*- coding: utf-8 -*-
from __future__ import print_function
import shutil
import imghdr
import os
import concurrent.futures
import requests
# Default HTTP headers that mimic a desktop Chrome browser, so that image
# hosts do not block the downloader as an obvious script.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Proxy-Connection": "keep-alive",
# Adjacent string literals: Python joins these into a single UA string.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
"Accept-Encoding": "gzip, deflate, sdch",
# 'Connection': 'close',
}
def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
    """Fetch a single image into dst_dir, retrying twice on any error.

    The saved file is renamed with the extension imghdr detects; files that
    are not recognised image formats are removed again.
    """
    if proxy_type is None:
        proxies = None
    else:
        endpoint = proxy_type + "://" + proxy
        proxies = {"http": endpoint, "https": endpoint}
    response = None
    file_path = os.path.join(dst_dir, file_name)
    attempt = 0
    while True:
        attempt += 1
        try:
            response = requests.get(
                image_url, headers=headers, timeout=timeout, proxies=proxies)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            response.close()
            file_type = imghdr.what(file_path)
            # if file_type is not None:
            if file_type in ["jpg", "jpeg", "png", "bmp"]:
                renamed = "{}.{}".format(file_name, file_type)
                shutil.move(file_path, os.path.join(dst_dir, renamed))
                print("## OK: {} {}".format(renamed, image_url))
            else:
                os.remove(file_path)
                print("## Err: {}".format(image_url))
        except Exception as e:
            if attempt < 3:
                continue
            if response:
                response.close()
            print("## Fail: {} {}".format(image_url, e.args))
        break
def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
    """
    Download image according to given urls and automatically rename them in order.
    :param timeout:
    :param proxy:
    :param proxy_type:
    :param image_urls: list of image urls
    :param dst_dir: output the downloaded images to dst_dir
    :param file_prefix: if set to "img", files will be in format "img_xxx.jpg"
    :param concurrency: number of requests process simultaneously
    :return: none
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        pending = []
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        # Submit one download task per URL; files are numbered in order.
        for index, url in enumerate(image_urls):
            name = file_prefix + "_" + "%04d" % index
            pending.append(executor.submit(
                download_image, url, dst_dir, name, timeout, proxy_type, proxy))
        concurrent.futures.wait(pending, timeout=180)
"""根据给定的 URL 下载图像并按顺序自动重命名。"""
#-*-编码:utf-8-*-
来自未来导入打印功能
进口舒蒂尔
导入imghdr
导入操作系统
进口期货
导入请求
标题={
“接受”:“text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8”,
“代理连接”:“保持活动状态”,
“用户代理”:“Mozilla/5.0(Windows NT 10.0;Win64;x64)”
“AppleWebKit/537.36(KHTML,像Gecko)Chrome/54.0.2840.99 Safari/537.36”,
“接受编码”:“gzip、deflate、sdch”,
#“连接”:“关闭”,
}
def下载图像(图像url、dst目录、文件名、超时=20、代理类型=None、代理=None):
代理=无
如果代理类型不是“无”:
代理={
“http”:proxy_type+”:/“+proxy,
“https”:代理类型+”:/“+代理
}
响应=无
file_path=os.path.join(dst_目录,文件名)
尝试次数=0
尽管如此:
尝试:
尝试次数+=1
response=requests.get(
图像(url,标题=标题,超时=超时,代理=代理)
打开(文件路径“wb”)作为f:
f、 写(response.content)
答复:close()
文件类型=imghdr.what(文件路径)
#如果文件类型不是“无”:
如果文件\输入[“jpg”、“jpeg”、“png”、“bmp”]:
新建文件名=“{}.{}”。格式(文件名,文件类型)
new_file_path=os.path.join(dst_dir,new_file_name)