Python 错误[违规]避免使用document.write（）文件_Python_Web Scraping_Beautifulsoup

Python 错误[违规]避免使用document.write（）文件

python web-scraping

Python 错误[违规]避免使用document.write（）文件,python,web-scraping,beautifulsoup,Python,Web Scraping,Beautifulsoup,我正在尝试下载top_s和top_sl类的所有图像，它运行了几页，但随后停止，HTTP错误：状态代码404，net:：ERR_UNKNOWN_URL_SCHEME，[违反]避免使用document.write（）这是我的python代码- import requests import urllib.request import random from bs4 import BeautifulSoup as bs url = 'https://goodlogo.com/top.250/n/2

我正在尝试下载top_s和top_sl类的所有图像，它运行了几页，但随后停止，HTTP错误：状态代码404，net:：ERR_UNKNOWN_URL_SCHEME，[违反]避免使用document.write（）

这是我的python代码-

import requests
import urllib.request
import random
from bs4 import BeautifulSoup as bs


url = 'https://goodlogo.com/top.250/n/250/interval/6'

sourcecode = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
plain_text = sourcecode.text
soup = bs(plain_text, 'html.parser')


path = 'C:/Users/roysu/Desktop/src_code/Python_projects/python/web_scrap/myPath/'


link = soup.select(".top_s3l")
for tag in link:
    my_images = tag.get('src')
    path_new = my_images.replace("/images/logos/small/", "")
    file_name = path+path_new

    full_name = 'https://goodlogo.com'+my_images
    sourcecode1 = requests.get(
        full_name, headers={'User-Agent': 'Mozilla/5.0'})
    file = open(file_name, "wb")
    file.write(sourcecode1.content)
    file.close()


link1 = soup.select(".top_s3")
for tag1 in link1:
    my_images1 = tag1.get('src')

    path_new1 = my_images1.replace("/images/logos/small/", "")
    file_name1 = path+path_new1

    full_name1 = 'https://goodlogo.com'+my_images1
    enter code here
    sourcecode1 = requests.get(
        full_name1, headers={'User-Agent': 'Mozilla/5.0'})
    file = open(file_name1, "wb")
    file.write(sourcecode1.content)
    file.close()

这可能不是你一开始想要的，但我写这篇文章是为了好玩，因为为什么不呢。现在是2020年，大流行到处都是，周五晚上我没有更好的事情要做

这基本上是一个略为改进的scraper版本，当然，它只用于教育目的。它以相当慢的方式获取每页所有

徽标（除了最后一个，大约有50个），在获取另一个徽标图像之前至少暂停一秒钟

您可以使用

获取下一页（第一页=2，最后一页=3）

控制刮板。例如，这将从页面

和

下载所有徽标。但是，如果您希望获取所有参数，只需删除参数，如下面的

get\u next\u page（）

并让它运行即可

例如，我使用了一系列你可能不熟悉的技巧

您可以随意阅读这些内容，也可以直接进入代码并进行探索/享受

不管怎样，这是密码。欢迎反馈

import functools
import os
import random
import time

import requests
from bs4 import BeautifulSoup

STARTING_URL = "https://goodlogo.com/top.250/n/250/interval/"
TAG_CLASS = 'top_s3'
DOWNLOAD_DIR = "logos"
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,pl;q=0.8",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
}


def timer(func):
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        end_time = time.perf_counter()
        run_time = end_time - start_time
        print(f"Finished {func.__name__!r} in {run_time:.2f} secs.")
        return value
    return wrapper_timer


def make_logo_dir():
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def get_next_page(first_page: int = 1, last_page: int = 8) -> str:
    page_offset = 1
    yield from (
        f"{STARTING_URL}{page_number}" for page_number
        in range(first_page, last_page + page_offset)
    )


def update_headers(current_page: str):
    HEADERS["referer"] = current_page


def get_page_source_code(page_url: str) -> str:
    return requests.get(page_url, headers=update_headers(page_url)).text


def make_soup(page_string: str) -> BeautifulSoup:
    return BeautifulSoup(page_string, "html.parser")


def get_img_src(soup):
    yield from (
        i['src'] for i in soup.find_all("img", {"class", TAG_CLASS})
    )


def parse_source(img_src: str) -> tuple:
    image_name = img_src.split('/')[-1]
    source_url = f"https://goodlogo.com/images/logos/{image_name}"
    return image_name, source_url


def download_logo(image_source: iter):
    for item, image in enumerate(image_source, start=1):
        file_name, source_url, = parse_source(image)
        print(f"Fetching logo #{item}: {source_url}")
        save_logo(file_name, source_url)


def save_logo(file_name: str, source_url: str):
    with open(os.path.join(DOWNLOAD_DIR, file_name), "wb") as f:
        f.write(requests.get(source_url, headers=HEADERS).content)
        pause_for_awhile()


def pause_for_awhile(max_seconds: int = 5):
    time.sleep(random.randint(1, max_seconds))


@timer
def download_logos():
    make_logo_dir()
    for page in get_next_page():
        print(f"--- Current page: {page} ---")
        download_logo(get_img_src(make_soup(get_page_source_code(page))))


if __name__ == "__main__":
    download_logos()