Python BeautifulSoup find_all("img") 不适用于所有站点

Python BeautifulSoup find_all("img") 不适用于所有站点,python,beautifulsoup,python-requests,Python,Beautifulsoup,Python Requests,我正在尝试编写一个Python脚本来从任何网站下载图像。它可以工作,但表现前后不一致。具体地说,find_all("img") 对第二个 URL 没有找到任何图片。脚本如下: # works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/ # but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/

我正在尝试编写一个Python脚本来从任何网站下载图像。它可以工作,但表现前后不一致。具体地说,find_all("img") 对第二个 URL 没有找到任何图片。脚本如下:

# works for http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/
# but not http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup

def url_to_image(url, filename):
    """Download the image at *url* and write it to *filename*.

    Raises requests.HTTPError on a 4xx/5xx response instead of handing
    an HTML error page to PIL as if it were image bytes.
    """
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    req.raise_for_status()  # fail fast on HTTP errors
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# open page, get HTML request and parse with BeautifulSoup
html = requests.get("http://proof.nationalgeographic.com/2016/02/02/photo-of-the-day-best-of-january-3/")
soup = BeautifulSoup(html.text, "html.parser")

# find all JPEGs in our soup and collect their "src" attribute.
# Use .get("src") rather than img["src"]: an <img> tag with no src
# attribute would raise KeyError with direct indexing.
urls = []
for img in soup.find_all("img"):
    src = img.get("src", "")
    if src.endswith("jpg"):
        print("endswith jpg")
        urls.append(src)
    print(str(img))

# enumerate replaces the hand-maintained counter (the old `jpeg_no = 00`
# was just 0 written confusingly)
for jpeg_no, url in enumerate(urls):
    url_to_image(url, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")

在失败的那个页面上,图像是用 JavaScript 渲染的。需要先用能执行 JavaScript 的工具(例如 dryscrape)渲染页面,再解析得到的 HTML。

如果您不想使用 dryscrape,请参阅其他无头浏览器方案。

例如:

但我还要检查您是否有绝对URL,而不是相对URL:

import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin


def url_to_image(url, filename):
    """Download the image at *url* and write it to *filename*.

    Raises requests.HTTPError on a 4xx/5xx response instead of handing
    an HTML error page to PIL as if it were image bytes.
    """
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    req.raise_for_status()  # fail fast on HTTP errors
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# Render the page with a JavaScript-capable session (dryscrape), then
# parse the rendered DOM with BeautifulSoup. A plain requests.get() only
# sees the pre-JavaScript HTML, which is why find_all("img") found no
# images on this page.
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# Collect the src of every JPEG. .get("src") avoids a KeyError for
# <img> tags that carry no src attribute.
urls = []
for img in soup.find_all("img"):
    src = img.get("src", "")
    if src.endswith("jpg"):
        print("endswith jpg")
        urls.append(src)
        print(str(img))

# urljoin returns `url` unchanged when it is already absolute, so the
# explicit startswith('http') branch (with its `absoute` typo) is not
# needed. enumerate replaces the hand-maintained jpeg counter.
for jpeg_no, url in enumerate(urls):
    absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")

或者使用 Selenium 搭配 PhantomJS,或者使用支持 headless 模式的 Google Chrome(未尝试过)。您如何判断图像是用 JS 呈现的?如果我用 web developer 工具栏关闭 Firefox 中的 JavaScript,图像就不会显示。另外,如果我查看页面源代码(而不是渲染后生成的源代码),在 HTML 中看不到图像元素,但能在 JavaScript 中看到大量相关引用。使用上面的方法,我成功抓取到了图像。奇怪的是……目录是存在的。
import requests
from PIL import Image
from io import BytesIO
from bs4 import BeautifulSoup
import dryscrape
from urllib.parse import urljoin


def url_to_image(url, filename):
    """Download the image at *url* and write it to *filename*.

    Raises requests.HTTPError on a 4xx/5xx response instead of handing
    an HTML error page to PIL as if it were image bytes.
    """
    # get HTTP response, open as bytes, save the image
    # http://docs.python-requests.org/en/master/user/quickstart/#binary-response-content
    req = requests.get(url)
    req.raise_for_status()  # fail fast on HTTP errors
    i = Image.open(BytesIO(req.content))
    i.save(filename)

# Render the page with a JavaScript-capable session (dryscrape), then
# parse the rendered DOM with BeautifulSoup. A plain requests.get() only
# sees the pre-JavaScript HTML, which is why find_all("img") found no
# images on this page.
base = "http://www.nationalgeographic.com/photography/proof/2017/05/lake-chad-desertification/"
session = dryscrape.Session()
session.visit(base)
response = session.body()
soup = BeautifulSoup(response, "html.parser")

# Collect the src of every JPEG. .get("src") avoids a KeyError for
# <img> tags that carry no src attribute.
urls = []
for img in soup.find_all("img"):
    src = img.get("src", "")
    if src.endswith("jpg"):
        print("endswith jpg")
        urls.append(src)
        print(str(img))

# urljoin returns `url` unchanged when it is already absolute, so the
# explicit startswith('http') branch (with its `absoute` typo) is not
# needed. enumerate replaces the hand-maintained jpeg counter.
for jpeg_no, url in enumerate(urls):
    absolute = urljoin(base, url)
    print(absolute)
    url_to_image(absolute, filename="NatGeoPix/" + str(jpeg_no) + ".jpg")