Python: can't modify the script to limit the number of requests while scraping

Tags: python, python-3.x, multithreading, web-scraping

I've written a script in Python using Thread to handle multiple requests concurrently and make the scraping process faster. The script is doing its job accordingly.

In short, here is what the scraper does: it parses all of the links from the landing page that lead to its main page (where the information is stored) and scrapes Happy hours and Featured specials from there. The scraper keeps running until all 29 pages have been crawled.

As there may be lots of links to play with, I would like to limit the number of requests. However, as I don't have much idea about this, I can't find any ideal way to modify my existing script to achieve that.

Any help will be greatly appreciated.

This is my attempt so far:

import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import threading

url = "https://www.totalhappyhour.com/washington-dc-happy-hour/?page={}"

def get_info(link):
    for mlink in [link.format(page) for page in range(1,30)]:
        response = requests.get(mlink)
        soup = BeautifulSoup(response.text,"lxml")
        itemlinks = [urljoin(link,container.select_one("h2.name a").get("href")) for container in soup.select(".profile")]
        threads = []
        for ilink in itemlinks:
            # one thread per link: nothing here caps how many requests run at once
            thread = threading.Thread(target=fetch_info,args=(ilink,))
            thread.start()
            threads+=[thread]

        for thread in threads:
            thread.join()

def fetch_info(nlink):
    response = requests.get(nlink)
    soup = BeautifulSoup(response.text,"lxml")
    for container in soup.select(".specials"):
        try:
            hours = container.select_one("h3").text
        except Exception: hours = ""
        try:
            fspecial = ' '.join([item.text for item in container.select(".special")])
        except Exception: fspecial = ""
        print(f'{hours}---{fspecial}')

if __name__ == '__main__':
    get_info(url)

You should take a look at asyncio, it's pretty simple and it can help you do things faster.

multiprocessing.Pool can also simplify your code (in case you don't want to use asyncio). multiprocessing.pool also provides a ThreadPool, which is the equivalent if you prefer to work with threads.

Regarding the request limit, I recommend using threading.Semaphore (or any other semaphore, in case you switch away from threading).

Threading approach:

from multiprocessing.pool import ThreadPool as Pool
from threading import Semaphore
from time import sleep


MAX_RUN_AT_ONCE = 5
NUMBER_OF_THREADS = 10

sm = Semaphore(MAX_RUN_AT_ONCE)


def do_task(number):
    with sm:  # only MAX_RUN_AT_ONCE threads can run this block at the same time
        print(f"run with {number}")
        sleep(3)
        return number * 2


def main():

    p = Pool(NUMBER_OF_THREADS)
    results = p.map(do_task, range(10))
    print(results)


if __name__ == '__main__':
    main()

Multiprocessing approach:

from multiprocessing import Pool
from multiprocessing import Semaphore
from time import sleep


MAX_RUN_AT_ONCE = 5
NUMBER_OF_PROCESS = 10

semaphore = None

def initializer(sm):
    """init the semaphore for the child process"""
    global semaphore
    semaphore = sm


def do_task(number):
    with semaphore:
        print(f"run with {number}\n")
        sleep(3)
        return number * 2


def main():
    sm = Semaphore(MAX_RUN_AT_ONCE)
    # a multiprocessing.Semaphore can't be passed through map(); share it with the
    # worker processes via the pool initializer instead
    p = Pool(NUMBER_OF_PROCESS, initializer=initializer,
             initargs=[sm])

    results = p.map(do_task, range(10))
    print(results)


if __name__ == '__main__':
    main()

asyncio approach:

import asyncio


MAX_RUN_AT_ONCE = 5
sm = asyncio.Semaphore(MAX_RUN_AT_ONCE)

async def do_task(number):
    async with sm:
        print(f"run with {number}\n")
        await asyncio.sleep(3)
        return number * 2

async def main():
    # wrap the coroutines in tasks; asyncio.wait() no longer accepts bare coroutines
    tasks = [asyncio.ensure_future(do_task(number)) for number in range(10)]
    finished, _ = await asyncio.wait(tasks)
    print([fut.result() for fut in finished])

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    loop.run_until_complete(main())

For executing the HTTP requests with asyncio you should use an asyncio-aware client such as aiohttp; you can also keep using requests through loop.run_in_executor, but in that case don't bother with asyncio at all, since practically all of your code is requests anyway.

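As an illustration only (this sketch is not part of the original answer), the same semaphore idea could look roughly like this with aiohttp; it assumes aiohttp is installed, and the URL list is just a placeholder built from the question's page pattern:

import asyncio
import aiohttp

MAX_RUN_AT_ONCE = 5

# placeholder pages built from the question's URL pattern
URLS = ["https://www.totalhappyhour.com/washington-dc-happy-hour/?page={}".format(page) for page in range(1, 4)]

async def fetch(session, link, sm):
    async with sm:  # no more than MAX_RUN_AT_ONCE requests are in flight at once
        async with session.get(link) as response:
            return await response.text()

async def main():
    sm = asyncio.Semaphore(MAX_RUN_AT_ONCE)
    async with aiohttp.ClientSession() as session:
        pages = await asyncio.gather(*(fetch(session, link, sm) for link in URLS))
    print([len(html) for html in pages])

if __name__ == '__main__':
    asyncio.run(main())

The semaphore caps concurrency exactly as in the threading and multiprocessing examples above; the real parsing logic would replace the line that prints the page lengths.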

Output:

run with 0
run with 1
run with 2
run with 3
run with 4

(here comes the pause and sleep caused by the semaphore)

run with 5
run with 6
run with 7
run with 8
run with 9

[0, 2, 4, 6, 8, 10, 12, 14, 16, 18]



As I'm very new to creating any scraper using multiprocessing, I was hoping for a real-life script in order to understand the logic very clearly. The site used in my script has some bot-protection mechanism. However, I've found a very similar webpage where multiprocessing can be applied:

import requests
from multiprocessing import Pool
from urllib.parse import urljoin
from bs4 import BeautifulSoup

url = "http://srar.com/roster/index.php?agent_search={}"

def get_links(link):
    completelinks = []
    for ilink in [chr(i) for i in range(ord('a'),ord('d')+1)]:
        res = requests.get(link.format(ilink))  
        soup = BeautifulSoup(res.text,'lxml')
        for items in soup.select("table.border tr"):
            if not items.select("td a[href^='index.php?agent']"):continue
            data = [urljoin(link,item.get("href")) for item in items.select("td a[href^='index.php?agent']")]
            completelinks.extend(data)
    return completelinks

def get_info(nlink):
    req = requests.get(nlink)
    sauce = BeautifulSoup(req.text,"lxml")
    for tr in sauce.select("table[style$='1px;'] tr"):
        table = [td.get_text(strip=True) for td in tr.select("td")]
        print(table)

if __name__ == '__main__':
    allurls = get_links(url)
    with Pool(10) as p:  # the pool size is what limits the number of concurrent requests
        p.map(get_info,allurls)
        # no p.join() here: map() already blocks, and join() without close() raises ValueError

I'm not sure whether I could implement the ThreadPool logic, already described in SocketPlayer's answer, in the script below, but it seems to work flawlessly. Feel free to correct me if I've gone anywhere wrong:

import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup
from multiprocessing.pool import ThreadPool as Pool
from threading import Semaphore

MAX_RUN_AT_ONCE = 5
NUMBER_OF_THREADS = 10

sm = Semaphore(MAX_RUN_AT_ONCE)

url = "http://srar.com/roster/index.php?agent_search={}"

def get_links(link):
    completelinks = []
    for ilink in [chr(i) for i in range(ord('a'),ord('d')+1)]:
        res = requests.get(link.format(ilink))
        soup = BeautifulSoup(res.text,'lxml')
        for items in soup.select("table.border tr"):
            if not items.select("td a[href^='index.php?agent']"):continue
            data = [urljoin(link,item.get("href")) for item in items.select("td a[href^='index.php?agent']")]
            completelinks.extend(data)
    return completelinks

def get_info(nlink):
    with sm:  # get_info runs in the pool's worker threads, so the semaphore here caps concurrent requests at MAX_RUN_AT_ONCE
        req = requests.get(nlink)
        sauce = BeautifulSoup(req.text,"lxml")
        for tr in sauce.select("table[style$='1px;'] tr")[1:]:
            table = [td.get_text(strip=True) for td in tr.select("td")]
            print(table)

if __name__ == '__main__':
    p = Pool(NUMBER_OF_THREADS)
    p.map(get_info, get_links(url))

If you want to set a maximum number of parallel requests, you may want to implement a ThreadPool.

This is exactly what I was looking for, Sir @Andersson. Could you help me implement it whenever you have some spare time? I very much doubt I could do it on my own. Thanks in advance.

It seems that the target website either has some performance issues or some kind of bot protection: when I try to run the script, the server quickly stops responding, so it's hard to apply multithreaded web scraping to this particular site...

Sorry I can't be of more help, they blocked me too. Andersson is right, they have a bot-protection mechanism: if you check their server headers you'll see they are using Sucuri/Cloudproxy. Sure, you could use a proxy until it gets banned, then a second one, and so on, but that would be abuse. I'm afraid we'll just have to leave them alone.

That site was only a placeholder. However, I've already created one using multiprocessing. Thanks a lot to both of you. Should I paste what I've created below?