
Python error: [Violation] Avoid using document.write()

Tags: python, web-scraping, beautifulsoup

I am trying to download all the images with the classes top_s3 and top_s3l. It runs for a few pages but then stops, and the browser console shows: HTTP error: status code 404, net::ERR_UNKNOWN_URL_SCHEME, and [Violation] Avoid using document.write().

Here is my Python code:

import requests
import urllib.request
import random
from bs4 import BeautifulSoup as bs


url = 'https://goodlogo.com/top.250/n/250/interval/6'

sourcecode = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
plain_text = sourcecode.text
soup = bs(plain_text, 'html.parser')


path = 'C:/Users/roysu/Desktop/src_code/Python_projects/python/web_scrap/myPath/'


link = soup.select(".top_s3l")
for tag in link:
    my_images = tag.get('src')
    path_new = my_images.replace("/images/logos/small/", "")
    file_name = path+path_new

    full_name = 'https://goodlogo.com'+my_images
    sourcecode1 = requests.get(
        full_name, headers={'User-Agent': 'Mozilla/5.0'})
    file = open(file_name, "wb")
    file.write(sourcecode1.content)
    file.close()


link1 = soup.select(".top_s3")
for tag1 in link1:
    my_images1 = tag1.get('src')

    path_new1 = my_images1.replace("/images/logos/small/", "")
    file_name1 = path+path_new1

    full_name1 = 'https://goodlogo.com'+my_images1
    sourcecode1 = requests.get(
        full_name1, headers={'User-Agent': 'Mozilla/5.0'})
    file = open(file_name1, "wb")
    file.write(sourcecode1.content)
    file.close()
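
A minimal defensive sketch of the download loop, assuming the stray 404s come from src values that are not plain /images/logos/small/ paths; the net::ERR_UNKNOWN_URL_SCHEME and document.write() messages are browser-console output from the site's own JavaScript rather than from this script. The combined selector and the status check here are additions, not part of the original code:

for tag in soup.select(".top_s3, .top_s3l"):
    my_images = tag.get('src')
    # Skip missing or non-standard src values (e.g. data: URIs).
    if not my_images or not my_images.startswith('/images/logos/small/'):
        continue
    response = requests.get('https://goodlogo.com' + my_images,
                            headers={'User-Agent': 'Mozilla/5.0'})
    # Only write the file when the download actually succeeded.
    if response.status_code == 200:
        with open(path + my_images.replace('/images/logos/small/', ''), 'wb') as f:
            f.write(response.content)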

This may not be what you were after originally, but I wrote it for fun, because why not: it's 2020, there's a pandemic everywhere, and I had nothing better to do on a Friday night.

This is basically a slightly improved version of your scraper, for educational purposes only, of course. It fetches all 150 logos per page (except the last page, which has about 50) at a fairly slow pace, pausing at least one second before fetching the next logo image.

You can control the scraper with get_next_page(first_page=2, last_page=3); that, for example, would download all the logos from pages 2 to 3 (see the short sketch below). If you want to grab everything, however, just drop the arguments, as in get_next_page() in the code further down, and let it run.
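
For illustration, a minimal sketch of that page-range control, reusing the helpers defined in the code below; it assumes you swap it in for the loop inside download_logos, which otherwise calls get_next_page() with no arguments:

for page in get_next_page(first_page=2, last_page=3):  # pages 2 and 3 only
    print(f"--- Current page: {page} ---")
    download_logo(get_img_src(make_soup(get_page_source_code(page))))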

Along the way I used a handful of tricks you may not be familiar with.

Feel free to read up on those, or just dive straight into the code and explore/enjoy it.

Anyway, here's the code. Feedback is welcome!

import functools
import os
import random
import time
from typing import Iterator

import requests
from bs4 import BeautifulSoup

STARTING_URL = "https://goodlogo.com/top.250/n/250/interval/"
TAG_CLASS = 'top_s3'
DOWNLOAD_DIR = "logos"
HEADERS = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
    "accept-encoding": "gzip, deflate, br",
    "accept-language": "en-US,en;q=0.9,pl;q=0.8",
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "same-origin",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36",
}


def timer(func):
    @functools.wraps(func)
    def wrapper_timer(*args, **kwargs):
        start_time = time.perf_counter()
        value = func(*args, **kwargs)
        end_time = time.perf_counter()
        run_time = end_time - start_time
        print(f"Finished {func.__name__!r} in {run_time:.2f} secs.")
        return value
    return wrapper_timer


def make_logo_dir():
    os.makedirs(DOWNLOAD_DIR, exist_ok=True)


def get_next_page(first_page: int = 1, last_page: int = 8) -> Iterator[str]:
    # Yield the URL of every page in the (inclusive) range.
    page_offset = 1
    yield from (
        f"{STARTING_URL}{page_number}" for page_number
        in range(first_page, last_page + page_offset)
    )


def update_headers(current_page: str):
    HEADERS["referer"] = current_page


def get_page_source_code(page_url: str) -> str:
    # update_headers() returns None, so call it first instead of
    # passing its result as the headers argument.
    update_headers(page_url)
    return requests.get(page_url, headers=HEADERS).text


def make_soup(page_string: str) -> BeautifulSoup:
    return BeautifulSoup(page_string, "html.parser")


def get_img_src(soup: BeautifulSoup) -> Iterator[str]:
    # The attrs argument must be a dict ({"class": ...}), not a set.
    yield from (
        i['src'] for i in soup.find_all("img", {"class": TAG_CLASS})
    )


def parse_source(img_src: str) -> tuple:
    image_name = img_src.split('/')[-1]
    source_url = f"https://goodlogo.com/images/logos/{image_name}"
    return image_name, source_url


def download_logo(image_source: Iterator[str]):
    for item, image in enumerate(image_source, start=1):
        file_name, source_url = parse_source(image)
        print(f"Fetching logo #{item}: {source_url}")
        save_logo(file_name, source_url)


def save_logo(file_name: str, source_url: str):
    with open(os.path.join(DOWNLOAD_DIR, file_name), "wb") as f:
        f.write(requests.get(source_url, headers=HEADERS).content)
        pause_for_awhile()


def pause_for_awhile(max_seconds: int = 5):
    time.sleep(random.randint(1, max_seconds))


@timer
def download_logos():
    make_logo_dir()
    for page in get_next_page():
        print(f"--- Current page: {page} ---")
        download_logo(get_img_src(make_soup(get_page_source_code(page))))


if __name__ == "__main__":
    download_logos()
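
As a small illustration of one of those tricks: the timer decorator is reusable on any function, and functools.wraps keeps the wrapped function's real name for the closing message. A toy example that reuses the timer defined above (slow_add is a made-up name):

@timer
def slow_add(a, b):
    time.sleep(0.5)  # simulate some work
    return a + b

slow_add(2, 3)  # prints something like: Finished 'slow_add' in 0.50 secs.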



Does this give you errors? It runs perfectly fine on my system.

It doesn't show an error, but after scraping a few pages it stops downloading; I checked the console, and it shows the warnings.