Python urlretrieve在几次下载后生成http错误404_Python_Web Scraping_Beautifulsoup_Python Requests

Python urlretrieve在几次下载后生成http错误404

python web-scraping

Python urlretrieve在几次下载后生成http错误404,python,web-scraping,beautifulsoup,python-requests,Python,Web Scraping,Beautifulsoup,Python Requests,我正试图以这种方式下载一些图像： from bs4 import BeautifulSoup import requests url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com' url_ds3 = { "daggers":"https://darksouls3.wiki.fextralife.com/Daggers", "straight_swords":"ht

我正试图以这种方式下载一些图像：

from bs4 import BeautifulSoup
import requests

# Root of the wiki; every image src is appended to it verbatim.
url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com'

# Weapon-category listing pages, keyed by a short category name.
url_ds3 = {
    "daggers": "https://darksouls3.wiki.fextralife.com/Daggers",
    "straight_swords": "https://darksouls3.wiki.fextralife.com/Straight+Swords",
    "great_swords": "https://darksouls3.wiki.fextralife.com/Greatswords",
    "ultra_great_swords": "https://darksouls3.wiki.fextralife.com/Ultra+Greatswords",
    "curved_swords": "https://darksouls3.wiki.fextralife.com/Curved+Swords",
    "katanas": "https://darksouls3.wiki.fextralife.com/Katanas",
    "curved_great_swords": "https://darksouls3.wiki.fextralife.com/Curved+Greatswords",
    "piercing_swords": "https://darksouls3.wiki.fextralife.com/Piercing+Swords",
}

# An <img> is skipped when the string form of its src contains any of these.
# 'None' matches tags without a src attribute, because str(None) == 'None'.
skip_markers = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

for url in url_ds3.values():
    page_html = requests.get(url).content
    soup = BeautifulSoup(page_html, 'lxml')

    for image_tag in soup.findAll('img'):
        src = str(image_tag.get('src'))
        if any(marker in src for marker in skip_markers):
            continue

        # Remote URL and local target are both built by plain concatenation.
        urllib.request.urlretrieve(url_ds3_part1 + src, 'images_swords' + src)

您只需复制/粘贴代码即可重现 HTTP 404 错误。我在这方面还是新手（这是我第一次做网页抓取），我怀疑问题与重复的图像或类似情况有关，但一直没能定位出具体原因。

提前感谢您的帮助：）

编辑：之前忘记添加 url_ds3_part1。编辑2：下面的代码会打印出我试图检索的图像的 URL：

# Print the URL that would be requested for every image that passes the filter.
for url in url_ds3.values():
    page_html = requests.get(url).content
    soup = BeautifulSoup(page_html, 'lxml')

    for image_tag in soup.findAll('img'):
        # str() turns a missing src into 'None', which the filter then rejects.
        src = str(image_tag.get('src'))
        rejected = any(
            marker in src
            for marker in (
                'forum', 'None', 'avatar', 'Damage', 'Resist',
                'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
            )
        )
        if rejected:
            continue

        print(url_ds3_part1 + src)

编辑3：用try暂时绕过了这个问题

# Workaround variant: download every image that passes the filter and keep
# going when a single download fails.
#
# The try block wraps one download only, so one failing image no longer
# discards the rest of the page. OSError covers urllib's HTTPError/URLError
# as well as local file errors; programming errors and Ctrl-C are no longer
# swallowed silently.
unwanted_markers = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

for url in url_ds3.values():
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')

    for image_tag in soup.findAll('img'):
        # str() turns a missing src into 'None', which the filter rejects.
        src = str(image_tag.get('src'))
        if any(marker in src for marker in unwanted_markers):
            continue

        # The URL is still built by plain concatenation, as in the snippets
        # above; failures are only reported and skipped here.
        image_url = url_ds3_part1 + src
        try:
            urllib.request.urlretrieve(image_url, 'images_swords' + src)
        except OSError as err:
            print('skipped', image_url, '-', err)

运行代码时，我发现有些 URL 是绝对的（以 https:// 开头），有些则是相对的。在拼接站点根地址之前，需要正确处理这两种情况：

import os
import urllib.request
from urllib.parse import urljoin, urlsplit

import requests
from bs4 import BeautifulSoup

# Weapon-category listing pages to scrape, keyed by a short category name.
url_ds3 = {
    "daggers": "https://darksouls3.wiki.fextralife.com/Daggers",
    "straight_swords": "https://darksouls3.wiki.fextralife.com/Straight+Swords",
    "great_swords": "https://darksouls3.wiki.fextralife.com/Greatswords",
    "ultra_great_swords": "https://darksouls3.wiki.fextralife.com/Ultra+Greatswords",
    "curved_swords": "https://darksouls3.wiki.fextralife.com/Curved+Swords",
    "katanas": "https://darksouls3.wiki.fextralife.com/Katanas",
    "curved_great_swords": "https://darksouls3.wiki.fextralife.com/Curved+Greatswords",
    "piercing_swords": "https://darksouls3.wiki.fextralife.com/Piercing+Swords",
}

# Site root that relative image paths are resolved against.
url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com'

# An image is skipped when its src contains any of these substrings.
# 'None' is kept so the filter rejects the same src values as before.
EXCLUDED_MARKERS = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

# Prefix of the local path every downloaded image is written under.
OUTPUT_PREFIX = 'images_swords'


def _is_wanted(src):
    """Return True when *src* is non-empty and contains no excluded marker."""
    return bool(src) and not any(marker in src for marker in EXCLUDED_MARKERS)


def _local_path(image_url):
    """Return the local target for *image_url*: OUTPUT_PREFIX + the URL's path.

    Only the path component is used, so the scheme, host and query string of
    an absolute URL never end up inside the file name.
    """
    return OUTPUT_PREFIX + urlsplit(image_url).path


for url in url_ds3.values():
    print(url)

    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')

    for image_tag in soup.find_all('img'):
        src = image_tag.get('src')
        if not _is_wanted(src):
            continue

        # urljoin returns absolute URLs unchanged and resolves relative ones
        # against the site root, so no manual startswith('http') check is
        # needed and a missing leading slash cannot corrupt the host name.
        image_url = urljoin(url_ds3_part1, src)
        destination = _local_path(image_url)

        # urlretrieve does not create missing directories.
        directory = os.path.dirname(destination)
        if directory:
            os.makedirs(directory, exist_ok=True)

        urllib.request.urlretrieve(image_url, destination)

脚本应该得到什么图像？只有列

“Name&Icon”

中的图标？很抱歉，我的代码有些混乱，因为我忘记了定义变量 url_ds3_part1。现在如果您运行全部代码，应该可以获取到图标，但到某个时刻它会停止检索并报出 404 错误消息。