Python 网页抓取（Web Scraping）
大家好，我是 Python 初学者，想做一个小工具来练手。我写了下面这段小代码，用来抓取页面并筛选其中的 img 标签：
"""Search themoviedb.org for a movie and print every <img> tag on the results page."""
import requests
from bs4 import BeautifulSoup

t = input('Digite o Nome do Filme:')
ano = int(input('Digite o Ano do Filme:'))

# Year sentinel 1 means "no year filter"; otherwise TMDB understands the
# "title y:year" search syntax.
if ano == 1:
    query = t
else:
    query = '{} y:{}'.format(t, ano)

# Passing `params` lets requests URL-encode the title (spaces, accents)
# instead of concatenating it raw into the URL.
req = requests.get('https://www.themoviedb.org/search',
                   params={'query': query, 'language': 'pt-BR'})
bs = BeautifulSoup(req.text, 'lxml')
print(bs.find_all('img'))
然后我做了另一个部分,获取图像链接并将其显示在控制台上
"""Stream one image URL into a spooled temp buffer with progress output,
save it as ./image.jpg, and display it with matplotlib."""
import io
import os
import requests
import tempfile
from PIL import Image
from matplotlib import pyplot as plt

img_url = 'https://image.tmdb.org/t/p/w500_and_h282_face/dKxkwAJfGuznW8Hu0mhaDJtna0n.jpg'

# SpooledTemporaryFile keeps small downloads in RAM and spills to disk past
# max_size; use an int (1e9 is a float).
buffer = tempfile.SpooledTemporaryFile(max_size=10**9)
r = requests.get(img_url, stream=True)
if r.status_code == 200:
    downloaded = 0
    # Content-Length can be absent on chunked responses; default to 0 so we
    # just skip the progress ratio instead of raising KeyError.
    filesize = int(r.headers.get('content-length', 0))
    # Default iter_content() yields one byte per chunk — painfully slow.
    for chunk in r.iter_content(chunk_size=8192):
        downloaded += len(chunk)
        buffer.write(chunk)
        if filesize:
            print(downloaded / filesize)
    buffer.seek(0)
    i = Image.open(io.BytesIO(buffer.read()))
    i.save(os.path.join('.', 'image.jpg'), quality=85)
    # Only display when the download actually succeeded; otherwise `i` would
    # be undefined and this crashed with NameError.
    plt.imshow(i)
    plt.show()
buffer.close()
因此,我想知道如何使img_url变量自动获取打印url(bs.find_all('img'))。或者如果有库的话。我在你的第一个代码中做了一个修改,现在你可以使用“poster fade lazyautosizes lazyloaded”类访问每个标签img的链接。我建议你把第二个代码放进去 函数,然后在此脚本中调用它
"""Search themoviedb.org and hand each poster image URL on the results page
to myfunctionmatplotlib (defined by the separate matplotlib display script)."""
import requests
from bs4 import BeautifulSoup

t = input('Digite o Nome do Filme:')
ano = int(input('Digite o Ano do Filme:'))

# Year sentinel 1 means "search without a year filter".
query = t if ano == 1 else '{} y:{}'.format(t, ano)
# `params` URL-encodes the title for us; both branches previously duplicated
# the same scrape-and-print body.
req = requests.get('https://www.themoviedb.org/search',
                   params={'query': query, 'language': 'pt-BR'})
bs = BeautifulSoup(req.text, 'lxml')

#elements=bs.find_all('img',class_="fade lazyautosizes lazyloaded")
# Poster <img> tags sit under div.image_content; the real URL is in data-src
# because the site lazy-loads its images.
for element in bs.select('div.image_content > a > img'):
    print("LINK")
    print(element['data-src'])
    # myfunctionmatplotlib must be defined before this runs — TODO confirm
    # it wraps the matplotlib display script.
    myfunctionmatplotlib(element['data-src'])
我在你的第一个代码中做了一个修改,现在你可以使用“poster fade lazyautosizes lazyloaded”类访问每个标签img的链接。我建议你把第二个代码放进去 函数,然后在此脚本中调用它
# Search TMDB for a movie title (optionally filtered by year) and pass each
# poster image URL found on the results page to myfunctionmatplotlib.
import requests
from bs4 import BeautifulSoup

t = input('Digite o Nome do Filme:')
ano = int(input('Digite o Ano do Filme:'))

base = 'https://www.themoviedb.org/search?query='
if ano == 1:
    url = base + t + '&language=pt-BR'
else:
    # %20y%3A is the URL-encoded " y:" year-filter syntax.
    url = base + t + '%20y%3A' + str(ano) + '&language=pt-BR'

soup = BeautifulSoup(requests.get(url).text, 'lxml')
#elements=soup.find_all('img',class_="fade lazyautosizes lazyloaded")
for img in soup.select('div.image_content > a > img'):
    print("LINK")
    print(img['data-src'])
    # Here should put a function for pass the url to script matplotlib
    myfunctionmatplotlib(img['data-src'])
我编写了这段新代码,您可以尝试下载URL的图像。我还编写了一个代码,可以从关键字中获取图像的URL,如果这能解决您的问题,我可以与您共享该代码:
""" Download image according to given urls and automatically rename them in order. """
# -*- coding: utf-8 -*-
from __future__ import print_function
import shutil
import imghdr
import os
import concurrent.futures
import requests
# Browser-like request headers so image hosts serve the scraper like a
# normal desktop browser instead of rejecting it as a bot.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Proxy-Connection": "keep-alive",
# The two adjacent string literals below are implicitly concatenated into
# one Chrome User-Agent value.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
"Accept-Encoding": "gzip, deflate, sdch",
# 'Connection': 'close',
}
def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
    """Download one image to dst_dir/file_name, retrying up to 3 times.

    On success the file is renamed to carry the extension reported by
    imghdr; anything not recognised as an image is deleted again.

    :param image_url: URL of the image to fetch
    :param dst_dir: destination directory (must already exist)
    :param file_name: file name (without extension) for the download
    :param timeout: per-request timeout in seconds
    :param proxy_type: proxy scheme ('http', 'socks5', ...) or None for direct
    :param proxy: 'host:port' address, used only when proxy_type is given
    """
    proxies = None
    if proxy_type is not None:
        proxies = {
            "http": proxy_type + "://" + proxy,
            "https": proxy_type + "://" + proxy,
        }
    response = None
    file_path = os.path.join(dst_dir, file_name)
    try_times = 0
    while True:
        try:
            try_times += 1
            response = requests.get(
                image_url, headers=headers, timeout=timeout, proxies=proxies)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            response.close()
            # imghdr reports 'jpeg' (never 'jpg'); 'jpg' is kept for safety.
            file_type = imghdr.what(file_path)
            if file_type in ["jpg", "jpeg", "png", "bmp"]:
                new_file_name = "{}.{}".format(file_name, file_type)
                new_file_path = os.path.join(dst_dir, new_file_name)
                shutil.move(file_path, new_file_path)
                print("## OK: {} {}".format(new_file_name, image_url))
            else:
                os.remove(file_path)
                print("## Err: {}".format(image_url))
            break
        except Exception as e:
            if try_times < 3:
                continue
            # `if response:` would skip close() on 4xx/5xx replies, because
            # requests.Response truthiness reflects the status code — test
            # explicitly against None instead.
            if response is not None:
                response.close()
            print("## Fail: {} {}".format(image_url, e.args))
            break
def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
    """
    Download image according to given urls and automatically rename them in order.
    :param image_urls: list of image urls
    :param dst_dir: output the downloaded images to dst_dir (created if missing)
    :param file_prefix: if set to "img", files will be in format "img_xxx.jpg"
    :param concurrency: number of requests processed simultaneously
    :param timeout: per-request timeout in seconds, forwarded to download_image
    :param proxy_type: proxy scheme or None, forwarded to download_image
    :param proxy: 'host:port' proxy address, forwarded to download_image
    :return: none
    """
    # exist_ok avoids the exists()/makedirs() race of the check-then-create idiom.
    os.makedirs(dst_dir, exist_ok=True)
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        # enumerate replaces the hand-maintained counter; names come out as
        # img_0000, img_0001, ...
        future_list = [
            executor.submit(download_image, image_url, dst_dir,
                            "{}_{:04d}".format(file_prefix, count),
                            timeout, proxy_type, proxy)
            for count, image_url in enumerate(image_urls)
        ]
        # One generous global deadline for the whole batch.
        concurrent.futures.wait(future_list, timeout=180)
"""根据给定的 URL 下载图像并按顺序自动重命名。"""
#-*-编码:utf-8-*-
来自未来导入打印功能
进口舒蒂尔
导入imghdr
导入操作系统
进口期货
导入请求
标题={
“接受”:“text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8”,
“代理连接”:“保持活动状态”,
“用户代理”:“Mozilla/5.0(Windows NT 10.0;Win64;x64)”
“AppleWebKit/537.36(KHTML,像Gecko)Chrome/54.0.2840.99 Safari/537.36”,
“接受编码”:“gzip、deflate、sdch”,
#“连接”:“关闭”,
}
def下载图像(图像url、dst目录、文件名、超时=20、代理类型=None、代理=None):
代理=无
如果代理类型不是“无”:
代理={
“http”:proxy_type+”:/“+proxy,
“https”:代理类型+”:/“+代理
}
响应=无
file_path=os.path.join(dst_目录,文件名)
尝试次数=0
尽管如此:
尝试:
尝试次数+=1
response=requests.get(
图像(url,标题=标题,超时=超时,代理=代理)
打开(文件路径“wb”)作为f:
f、 写(response.content)
答复:close()
文件类型=imghdr.what(文件路径)
#如果文件类型不是“无”:
如果文件\输入[“jpg”、“jpeg”、“png”、“bmp”]:
新建文件名=“{}.{}”。格式(文件名,文件类型)
new_file_path=os.path.join(dst_dir,new_file_name)
移动(文件路径、新文件路径)
打印(“##确定:{}{}”。格式(新文件名、图像url))
其他:
删除(文件路径)
打印(“##错误:{}”。格式(图像url))
打破
例外情况除外,如e:
如果重试次数小于3次:
持续
如果回答:
答复:close()
打印(“##失败:{}{}”。格式(图像url,e.args))
打破
def下载图像(图像URL、dst目录、文件前缀=“img”、并发性=50、超时=20、代理类型=None、代理=None):
"""
根据给定的URL下载图像,并按顺序自动重命名它们。
:参数超时:
:参数代理:
:参数代理类型:
:param image_url:图像URL列表
:param dst_dir:将下载的图像输出到dst_dir
:param file_prefix:如果设置为“img”,则文件的格式为“img_xxx.jpg”
:param concurrency:同时处理的请求数
:返回:无
"""
以concurrent.futures.ThreadPoolExecutor(max_workers=concurrency)作为执行器:
未来列表=列表()
计数=0
如果操作系统路径不存在(dst_dir):
操作系统生成目录(dst_目录)
对于图像url中的图像url:
文件名=文件前缀+“\u”+%04d”%count
未来列表。追加(executor.submit(
下载(图像、图像url、dst目录、文件名、超时、代理类型、代理))
计数+=1
concurrent.futures.wait(未来列表,超时=180)
希望这对您有所帮助=)我编写了这段新代码,您可以尝试下载您拥有的URL图像。我还编写了一个代码,可以从关键字中获取图像的URL,如果这能解决您的问题,我可以与您共享该代码:
""" Download image according to given urls and automatically rename them in order. """
# -*- coding: utf-8 -*-
from __future__ import print_function
import shutil
import imghdr
import os
import concurrent.futures
import requests
# Default HTTP headers that mimic a desktop Chrome browser, so that image
# hosts do not block the downloader as an obvious script.
headers = {
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
"Proxy-Connection": "keep-alive",
# Adjacent string literals: Python joins these into a single UA string.
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
"AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36",
"Accept-Encoding": "gzip, deflate, sdch",
# 'Connection': 'close',
}
def download_image(image_url, dst_dir, file_name, timeout=20, proxy_type=None, proxy=None):
    """Fetch a single image into dst_dir, retrying twice on any error.

    The saved file is renamed with the extension imghdr detects; files that
    are not recognised image formats are removed again.
    """
    if proxy_type is None:
        proxies = None
    else:
        endpoint = proxy_type + "://" + proxy
        proxies = {"http": endpoint, "https": endpoint}
    response = None
    file_path = os.path.join(dst_dir, file_name)
    attempt = 0
    while True:
        attempt += 1
        try:
            response = requests.get(
                image_url, headers=headers, timeout=timeout, proxies=proxies)
            with open(file_path, 'wb') as f:
                f.write(response.content)
            response.close()
            file_type = imghdr.what(file_path)
            # if file_type is not None:
            if file_type in ["jpg", "jpeg", "png", "bmp"]:
                renamed = "{}.{}".format(file_name, file_type)
                shutil.move(file_path, os.path.join(dst_dir, renamed))
                print("## OK: {} {}".format(renamed, image_url))
            else:
                os.remove(file_path)
                print("## Err: {}".format(image_url))
        except Exception as e:
            if attempt < 3:
                continue
            if response:
                response.close()
            print("## Fail: {} {}".format(image_url, e.args))
        break
def download_images(image_urls, dst_dir, file_prefix="img", concurrency=50, timeout=20, proxy_type=None, proxy=None):
    """
    Download image according to given urls and automatically rename them in order.
    :param timeout:
    :param proxy:
    :param proxy_type:
    :param image_urls: list of image urls
    :param dst_dir: output the downloaded images to dst_dir
    :param file_prefix: if set to "img", files will be in format "img_xxx.jpg"
    :param concurrency: number of requests process simultaneously
    :return: none
    """
    with concurrent.futures.ThreadPoolExecutor(max_workers=concurrency) as executor:
        pending = []
        if not os.path.exists(dst_dir):
            os.makedirs(dst_dir)
        # Submit one download task per URL; files are numbered in order.
        for index, url in enumerate(image_urls):
            name = file_prefix + "_" + "%04d" % index
            pending.append(executor.submit(
                download_image, url, dst_dir, name, timeout, proxy_type, proxy))
        concurrent.futures.wait(pending, timeout=180)
"""根据给定的 URL 下载图像并按顺序自动重命名。"""
#-*-编码:utf-8-*-
来自未来导入打印功能
进口舒蒂尔
导入imghdr
导入操作系统
进口期货
导入请求
标题={
“接受”:“text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8”,
“代理连接”:“保持活动状态”,
“用户代理”:“Mozilla/5.0(Windows NT 10.0;Win64;x64)”
“AppleWebKit/537.36(KHTML,像Gecko)Chrome/54.0.2840.99 Safari/537.36”,
“接受编码”:“gzip、deflate、sdch”,
#“连接”:“关闭”,
}
def下载图像(图像url、dst目录、文件名、超时=20、代理类型=None、代理=None):
代理=无
如果代理类型不是“无”:
代理={
“http”:proxy_type+”:/“+proxy,
“https”:代理类型+”:/“+代理
}
响应=无
file_path=os.path.join(dst_目录,文件名)
尝试次数=0
尽管如此:
尝试:
尝试次数+=1
response=requests.get(
图像(url,标题=标题,超时=超时,代理=代理)
打开(文件路径“wb”)作为f:
f、 写(response.content)
答复:close()
文件类型=imghdr.what(文件路径)
#如果文件类型不是“无”:
如果文件\输入[“jpg”、“jpeg”、“png”、“bmp”]:
新建文件名=“{}.{}”。格式(文件名,文件类型)
new_file_path=os.path.join(dst_dir,new_file_name)