Python 错误:[违规] 避免使用 document.write()

我正在尝试下载 top_s3 和 top_s3l 类的所有徽标图像。脚本运行了几页后停止,控制台显示:HTTP 错误 404、net::ERR_UNKNOWN_URL_SCHEME,以及警告“[违规] 避免使用 document.write()”。下面是我的 Python 代码——
import requests
import urllib.request  # NOTE(review): unused here; kept from the original file
import random          # NOTE(review): unused here; kept from the original file
from bs4 import BeautifulSoup as bs

# Listing page to scrape; the site rejects the default requests User-Agent,
# so a browser-like UA is sent with every request.
url = 'https://goodlogo.com/top.250/n/250/interval/6'
path = 'C:/Users/roysu/Desktop/src_code/Python_projects/python/web_scrap/myPath/'

sourcecode = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
plain_text = sourcecode.text
soup = bs(plain_text, 'html.parser')

# The original repeated the identical download loop once per CSS class
# (.top_s3l and .top_s3); a single loop over both selectors is equivalent.
for selector in ('.top_s3l', '.top_s3'):
    for tag in soup.select(selector):
        my_images = tag.get('src')
        # Bug fix: some matched elements carry missing, relative, or data:
        # src values — fetching those produced the reported 404 and
        # net::ERR_UNKNOWN_URL_SCHEME failures.  Only fetch real thumbnails.
        if not my_images or not my_images.startswith('/images/logos/small/'):
            continue
        file_name = path + my_images.replace('/images/logos/small/', '')
        full_name = 'https://goodlogo.com' + my_images
        sourcecode1 = requests.get(
            full_name, headers={'User-Agent': 'Mozilla/5.0'})
        # Skip missing images instead of saving an HTML error page as a .gif.
        if sourcecode1.status_code != 200:
            continue
        # with-statement closes the file even if the write raises.
        with open(file_name, 'wb') as image_file:
            image_file.write(sourcecode1.content)
这可能不是你一开始想要的,但我写这篇文章是为了好玩,因为为什么不呢。现在是2020年,大流行到处都是,周五晚上我没有更好的事情要做 这基本上是一个略为改进的scraper版本,当然,它只用于教育目的。它以相当慢的方式获取每页所有
150
徽标(除了最后一个,大约有50个),在获取另一个徽标图像之前至少暂停一秒钟
您可以使用 get_next_page(first_page=2, last_page=3) 来控制爬虫抓取的页码范围。例如,上面的参数会下载第 2 页和第 3 页的所有徽标。如果想抓取全部页面,直接调用不带参数的 get_next_page() 并让它运行即可。
例如,我使用了一系列你可能不熟悉的技巧
您可以随意阅读这些内容,也可以直接进入代码并进行探索/享受
不管怎样,代码如下。欢迎反馈。
import functools
import os
import random
import time
import requests
from bs4 import BeautifulSoup
# Base listing URL; the page number is appended to it by get_next_page().
STARTING_URL = "https://goodlogo.com/top.250/n/250/interval/"
# CSS class of the <img> logo thumbnails to collect.
TAG_CLASS = 'top_s3'
# Local directory the downloaded logo files are written into.
DOWNLOAD_DIR = "logos"
# Browser-like request headers so the site serves us like a real browser.
# A "referer" key is added per page by update_headers() below.
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,pl;q=0.8",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
}
def timer(func):
    """Decorator that prints how long each call of *func* took."""
    @functools.wraps(func)
    def timed(*args, **kwargs):
        started = time.perf_counter()
        result = func(*args, **kwargs)
        elapsed = time.perf_counter() - started
        print(f"Finished {func.__name__!r} in {elapsed:.2f} secs.")
        return result
    return timed
def make_logo_dir():
    """Create the DOWNLOAD_DIR output directory; no error if it already exists."""
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)
def get_next_page(first_page: int = 1, last_page: int = 8) -> str:
    """Yield the listing URL of every page from first_page to last_page inclusive."""
    for page_number in range(first_page, last_page + 1):
        yield f"{STARTING_URL}{page_number}"
def update_headers(current_page: str) -> None:
    """Record *current_page* as the "referer" entry of the module-level HEADERS.

    NOTE(review): this mutates HEADERS in place and returns None — its return
    value must not be used as a `headers=` argument.
    """
    HEADERS["referer"] = current_page
def get_page_source_code(page_url: str) -> str:
    """Return the HTML of *page_url* as text.

    Bug fix: update_headers() mutates HEADERS in place and returns None, so
    the original `headers=update_headers(page_url)` actually sent
    `headers=None` — i.e. requests' defaults, with none of the browser-like
    headers defined above.  Call it for its side effect, then pass HEADERS.
    """
    update_headers(page_url)  # sets HEADERS["referer"] = page_url
    return requests.get(page_url, headers=HEADERS).text
def make_soup(page_string: str) -> BeautifulSoup:
    """Parse an HTML string using the stdlib html.parser backend."""
    parsed = BeautifulSoup(page_string, "html.parser")
    return parsed
def get_img_src(soup):
    """Yield the src attribute of every logo <img> on the page.

    Bug fix: the original passed the *set* {"class", TAG_CLASS} as the attrs
    argument, which BeautifulSoup treats as "class is any of these values" —
    accidentally also matching any tag whose class is literally "class".
    An explicit attrs dict expresses the intended single-class filter.
    """
    for img in soup.find_all("img", {"class": TAG_CLASS}):
        yield img['src']
def parse_source(img_src: str) -> tuple:
    """Map a thumbnail src path to (file name, full-size image URL)."""
    _, _, image_name = img_src.rpartition('/')
    full_size_url = "https://goodlogo.com/images/logos/" + image_name
    return image_name, full_size_url
def download_logo(image_source: iter):
    """Fetch and save every logo yielded by *image_source*, printing progress."""
    counter = 1
    for image in image_source:
        name, url = parse_source(image)
        print(f"Fetching logo #{counter}: {url}")
        save_logo(name, url)
        counter += 1
def save_logo(file_name: str, source_url: str):
    """Download *source_url* into DOWNLOAD_DIR/*file_name*, then pause."""
    target = os.path.join(DOWNLOAD_DIR, file_name)
    with open(target, "wb") as out:
        payload = requests.get(source_url, headers=HEADERS)
        out.write(payload.content)
    # Throttle so we don't hammer the server between logo downloads.
    pause_for_awhile()
def pause_for_awhile(max_seconds: int = 5):
    """Sleep a random whole number of seconds between 1 and *max_seconds*."""
    delay = random.randint(1, max_seconds)
    time.sleep(delay)
@timer
def download_logos():
    """Entry point: prepare the output dir and scrape every listing page."""
    make_logo_dir()
    for page in get_next_page():
        print(f"--- Current page: {page} ---")
        html = get_page_source_code(page)
        download_logo(get_img_src(make_soup(html)))
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    download_logos()
这可能不是你一开始想要的,但我写这篇文章是为了好玩,因为为什么不呢。现在是2020年,大流行到处都是,周五晚上我没有更好的事情要做 这基本上是一个略为改进的scraper版本,当然,它只用于教育目的。它以相当慢的方式获取每页所有
150
徽标(除了最后一个,大约有50个),在获取另一个徽标图像之前至少暂停一秒钟
您可以使用 get_next_page(first_page=2, last_page=3) 来控制爬虫抓取的页码范围。例如,上面的参数会下载第 2 页和第 3 页的所有徽标。如果想抓取全部页面,直接调用不带参数的 get_next_page() 并让它运行即可。
例如,我使用了一系列你可能不熟悉的技巧
您可以随意阅读这些内容,也可以直接进入代码并进行探索/享受
不管怎样,代码如下。欢迎反馈。
# NOTE(review): this is a verbatim scrape-duplicate of the script above.
# Reconstructed with proper indentation and the same two bug fixes applied
# (headers=None in get_page_source_code; set-instead-of-dict attrs in
# get_img_src).
import functools
import os
import random
import time

import requests
from bs4 import BeautifulSoup

# Base listing URL; the page number is appended by get_next_page().
STARTING_URL = "https://goodlogo.com/top.250/n/250/interval/"
# CSS class of the <img> logo thumbnails to collect.
TAG_CLASS = 'top_s3'
# Local directory the downloaded logo files are written into.
DOWNLOAD_DIR = "logos"
# Browser-like request headers; "referer" is added per page by update_headers().
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,pl;q=0.8",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
}


def timer(func):
    """Decorator that prints how long each call of *func* took."""
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        run_time = time.perf_counter() - start_time
        print(f"Finished {func.__name__!r} in {run_time:.2f} secs.")
        return value
    return wrapper_timer


def make_logo_dir():
    """Create the DOWNLOAD_DIR output directory; no error if it exists."""
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def get_next_page(first_page: int = 1, last_page: int = 8) -> str:
    """Yield the listing URL of every page from first_page to last_page."""
    for page_number in range(first_page, last_page + 1):
        yield f"{STARTING_URL}{page_number}"


def update_headers(current_page: str) -> None:
    """Record *current_page* as the "referer" header (mutates HEADERS)."""
    HEADERS["referer"] = current_page


def get_page_source_code(page_url: str) -> str:
    """Return the HTML of *page_url*.

    Bug fix: update_headers() returns None, so the original
    `headers=update_headers(page_url)` sent no headers at all.
    """
    update_headers(page_url)
    return requests.get(page_url, headers=HEADERS).text


def make_soup(page_string: str) -> BeautifulSoup:
    """Parse an HTML string using the stdlib html.parser backend."""
    return BeautifulSoup(page_string, "html.parser")


def get_img_src(soup):
    """Yield the src of every logo <img>.

    Bug fix: attrs must be the dict {"class": TAG_CLASS}, not the set
    {"class", TAG_CLASS}, which bs4 reads as "class is any of these".
    """
    for img in soup.find_all("img", {"class": TAG_CLASS}):
        yield img['src']


def parse_source(img_src: str) -> tuple:
    """Map a thumbnail src path to (file name, full-size image URL)."""
    image_name = img_src.split('/')[-1]
    source_url = f"https://goodlogo.com/images/logos/{image_name}"
    return image_name, source_url


def download_logo(image_source: iter):
    """Fetch and save every logo yielded by *image_source*, with progress."""
    for item, image in enumerate(image_source, start=1):
        file_name, source_url = parse_source(image)
        print(f"Fetching logo #{item}: {source_url}")
        save_logo(file_name, source_url)


def save_logo(file_name: str, source_url: str):
    """Download *source_url* into DOWNLOAD_DIR/*file_name*, then pause."""
    with open(os.path.join(DOWNLOAD_DIR, file_name), "wb") as f:
        f.write(requests.get(source_url, headers=HEADERS).content)
    pause_for_awhile()


def pause_for_awhile(max_seconds: int = 5):
    """Sleep a random whole number of seconds between 1 and *max_seconds*."""
    time.sleep(random.randint(1, max_seconds))


@timer
def download_logos():
    """Entry point: prepare the output dir and scrape every listing page."""
    make_logo_dir()
    for page in get_next_page():
        print(f"--- Current page: {page} ---")
        download_logo(get_img_src(make_soup(get_page_source_code(page))))


if __name__ == "__main__":
    download_logos()
评论:这会给你带来错误吗?它在我的系统上运行得非常好。—— 回复:它没有显示任何错误,但在抓取了几页之后就不再下载了;我检查了控制台,它显示了上面提到的那些警告。