Python urlretrieve 在下载几次后引发 HTTP 404 错误
我正试图以这种方式下载一些图像(标签:python、web-scraping、beautifulsoup、python-requests):
# Reproducer script: fetch each listed wiki page, collect its <img> tags and
# download every image whose src does not contain one of the skip markers.
import urllib.request

import requests
from bs4 import BeautifulSoup

url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com'
url_ds3 = {
    "daggers": "https://darksouls3.wiki.fextralife.com/Daggers",
    "straight_swords": "https://darksouls3.wiki.fextralife.com/Straight+Swords",
    "great_swords": "https://darksouls3.wiki.fextralife.com/Greatswords",
    "ultra_great_swords": "https://darksouls3.wiki.fextralife.com/Ultra+Greatswords",
    "curved_swords": "https://darksouls3.wiki.fextralife.com/Curved+Swords",
    "katanas": "https://darksouls3.wiki.fextralife.com/Katanas",
    "curved_great_swords": "https://darksouls3.wiki.fextralife.com/Curved+Greatswords",
    "piercing_swords": "https://darksouls3.wiki.fextralife.com/Piercing+Swords",
}

# An image is skipped when its src contains any of these substrings.
# 'None' matches str(None), i.e. <img> tags that carry no src attribute.
SKIP_MARKERS = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

for url in url_ds3.values():
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    for image_tag in soup.findAll('img'):
        src = str(image_tag.get('src'))
        if any(marker in src for marker in SKIP_MARKERS):
            continue
        # NOTE(review): src is appended to the site root (and to the
        # 'images_swords' prefix) unconditionally, so a src that is already
        # an absolute URL yields a malformed address. This is kept as-is
        # because it is the behaviour this reproducer demonstrates.
        urllib.request.urlretrieve(url_ds3_part1 + src, 'images_swords' + src)
您只需复制/粘贴代码即可重现 HTTP 404 错误。
我对此还不太熟悉(这是我第一次做网页抓取),但我怀疑它是在处理重复项之类的东西时出了问题;我在拆解这个问题上有些吃力。
提前感谢您的帮助:)
编辑:忘记添加 url_ds3_part1
EDIT2:这会让你了解我试图检索的图像:
# Print the URL that would be requested for every image that passes the
# filter: the site root followed by the tag's src value.
# A src containing any of these substrings is left out; 'None' matches
# str(None), i.e. tags without a src attribute.
rejected_words = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

for page_url in url_ds3.values():
    page_html = requests.get(page_url).content
    page = BeautifulSoup(page_html, 'lxml')
    for img in page.findAll('img'):
        src_text = str(img.get('src'))
        if any(word in src_text for word in rejected_words):
            continue
        print(url_ds3_part1 + src_text)
编辑3:
用try暂时绕过了这个问题
# Best-effort variant of the download loop: a failed download is reported
# and skipped, and the remaining images on the page are still attempted.
import urllib.request

# An image is skipped when its src contains any of these substrings.
# 'None' matches str(None), i.e. <img> tags that carry no src attribute.
SKIP_MARKERS = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

for url in url_ds3.values():
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    for image_tag in soup.findAll('img'):
        src = str(image_tag.get('src'))
        if any(marker in src for marker in SKIP_MARKERS):
            continue
        image_url = url_ds3_part1 + src
        try:
            # Only the download itself is guarded, so one bad image no
            # longer aborts the rest of the page.
            urllib.request.urlretrieve(image_url, 'images_swords' + src)
        except (OSError, ValueError) as err:
            # HTTPError and URLError are OSError subclasses, as are local
            # file errors; ValueError is raised for an unknown URL type.
            print('skipped {}: {}'.format(image_url, err))
运行代码时,似乎有些 URL 是绝对地址(以 https:// 开头),有些则不是。您需要对这种情况加以判断:
# Download the images referenced by each listed wiki page, handling both
# absolute and site-relative <img> src values.
import os
import urllib.request
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

url_ds3 = {
    "daggers": "https://darksouls3.wiki.fextralife.com/Daggers",
    "straight_swords": "https://darksouls3.wiki.fextralife.com/Straight+Swords",
    "great_swords": "https://darksouls3.wiki.fextralife.com/Greatswords",
    "ultra_great_swords": "https://darksouls3.wiki.fextralife.com/Ultra+Greatswords",
    "curved_swords": "https://darksouls3.wiki.fextralife.com/Curved+Swords",
    "katanas": "https://darksouls3.wiki.fextralife.com/Katanas",
    "curved_great_swords": "https://darksouls3.wiki.fextralife.com/Curved+Greatswords",
    "piercing_swords": "https://darksouls3.wiki.fextralife.com/Piercing+Swords",
}

url_ds3_part1 = 'https://darksouls3.wiki.fextralife.com'

# An image is skipped when its src contains any of these substrings.
# 'None' is kept so that a src containing that text is still skipped, as
# before; tags with no src at all are handled by the explicit None check.
SKIP_MARKERS = (
    'forum', 'None', 'avatar', 'Damage', 'Resist',
    'STR', 'DEX', 'INT', 'FTH', 'attack', 'normal',
)

for url in url_ds3.values():
    print(url)
    content = requests.get(url).content
    soup = BeautifulSoup(content, 'lxml')
    for image_tag in soup.findAll('img'):
        src = image_tag.get('src')
        if src is None or any(marker in src for marker in SKIP_MARKERS):
            continue
        # urljoin returns an absolute src unchanged and resolves a relative
        # one against the site root, so no scheme sniffing is needed.
        u = urljoin(url_ds3_part1, src)
        # Build the local path from the URL's path component only, so the
        # scheme and host never end up in the file name, whatever the host.
        target = 'images_swords' + urlparse(u).path
        directory = os.path.dirname(target)
        if directory:
            # urlretrieve does not create missing parent directories.
            os.makedirs(directory, exist_ok=True)
        urllib.request.urlretrieve(u, target)
脚本应该获取哪些图像?只是 “Name & Icon” 列中的图标吗?
很抱歉,我在代码中留下了一些混乱,因为我忘了定义变量 url_ds3_part1。现在如果您运行全部代码,应该可以获取到图标,但到某个时刻它会停止检索并报出 404 错误。