Python HTTPError:HTTP错误429:WIKI页面请求过多_Python_Python 3.x_Python Requests_Wikipedia_Http Error

Python HTTPError:HTTP错误429:WIKI页面请求过多

python python-3.x

Python HTTPError:HTTP错误429:WIKI页面请求过多,python,python-3.x,python-requests,wikipedia,http-error,Python,Python 3.x,Python Requests,Wikipedia,Http Error,我得到429错误，即使脚本访问的页面数量甚至不到50。如何调整脚本以避免错误。用户代理是我的Chrome浏览器的正确代理。我试着在请求之后放置时间。睡眠（10），但这没有帮助。我正在Windows 10上使用jupyter笔记本 def get_wiki_list_italian_movies(year): import sys, bs4, requests, textwrap, , time, re from textblob import TextBlob

我得到429错误，即使脚本访问的页面数量甚至不到50。如何调整脚本以避免错误。用户代理是我的Chrome浏览器的正确代理。我试着在请求之后放置

时间。睡眠（10）

，但这没有帮助。我正在Windows 10上使用jupyter笔记本

def get_wiki_list_italian_movies(year):
    import sys, bs4, requests, textwrap, , time, re
    from textblob import TextBlob
    
    p = r'https://en.wikipedia.org/wiki/List_of_Italian_films_of_' + str(year)
    list_wiki, list_wiki_links = [], []

    header = {'User-agent':
               'mozilla/5.0 (windows nt 10.0; win64; x64) applewebkit/537.36 (khtml, like gecko) chrome/86.0.4240.111 safari/537.36'}
   
    request = requests.get(p, headers = header)              
    time.sleep(10)

    soup = bs4.BeautifulSoup(request.text, 'lxml')
    table = soup.find_all('table', class_='wikitable')
    
    for el in table:
        td = el.find_all('td')
        for t in td:
            i = t.find('i')
            if i:
                for a in i.find_all('a', href=True):
                    result = re.sub(r'[()]','',a['href'])
                    if '/w/index' in result:
                        list_wiki.append(re.sub(r'\(.*','',a['title']).strip() + ' ' + str(year))
                    else:
                        list_wiki_links.append('https://en.wikipedia.org'+result)

    for link in list_wiki_links:
        request = requests.get(link, headers = header)
        time.sleep(10)
        soup = bs4.BeautifulSoup(request.text, 'lxml')
        i_list = soup.find_all('i')
        
        for i in i_list:
            b = i.find('b')
            if b:
                t= b.text
                if len(t) > 4 and TextBlob(t).detect_language() == 'it':
                    list_wiki.append(t.strip() + ' ' + str(year))
             
    return sorted(list(set(list_wiki)))

def movies_wiki_list(years_span):
    ll = []
    for year in years_span:
        ll += get_wiki_list_italian_movies(year)
        time.sleep(10)
    return ll

italian_movies_1932_1933 = movies_wiki_list(range(1932, 1934))
italian_movies_1932_1933

这就是错误：

HTTPError                                 Traceback (most recent call last)
<ipython-input-12-6a4bc670faa6> in <module>
     53     return ll
     54 
---> 55 italian_movies_1932_1933 = movies_wiki_list(range(1932, 1934))
     56 italian_movies_1932_1933

<ipython-input-12-6a4bc670faa6> in movies_wiki_list(years_span)
     49     ll = []
     50     for year in years_span:
---> 51         ll += get_wiki_list_italian_movies(year)
     52         time.sleep(10)
     53     return ll

<ipython-input-12-6a4bc670faa6> in get_wiki_list_italian_movies(year)
     41             if b:
     42                 t= b.text
---> 43                 if len(t) > 4 and TextBlob(t).detect_language() == 'it':
     44                     list_wiki.append(t.strip() + ' ' + str(year))
     45 

~\anaconda3\lib\site-packages\textblob\blob.py in detect_language(self)
    566         :rtype: str
    567         """
--> 568         return self.translator.detect(self.raw)
    569 
    570     def correct(self):

~\anaconda3\lib\site-packages\textblob\translate.py in detect(self, source, host, type_)
     70         data = {"q": source}
     71         url = u'{url}&sl=auto&tk={tk}'.format(url=self.url, tk=_calculate_tk(source))
---> 72         response = self._request(url, host=host, type_=type_, data=data)
     73         result, language = json.loads(response)
     74         return language

~\anaconda3\lib\site-packages\textblob\translate.py in _request(self, url, host, type_, data)
     90         if host or type_:
     91             req.set_proxy(host=host, type=type_)
---> 92         resp = request.urlopen(req)
     93         content = resp.read()
     94         return content.decode('utf-8')

~\anaconda3\lib\urllib\request.py in urlopen(url, data, timeout, cafile, capath, cadefault, context)
    220     else:
    221         opener = _opener
--> 222     return opener.open(url, data, timeout)
    223 
    224 def install_opener(opener):

~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
    561             http_err = 0
    562         args = (dict, proto, meth_name) + args
--> 563         result = self._call_chain(*args)
    564         if result:
    565             return result

~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\anaconda3\lib\urllib\request.py in http_error_302(self, req, fp, code, msg, headers)
    753         fp.close()
    754 
--> 755         return self.parent.open(new, timeout=req.timeout)
    756 
    757     http_error_301 = http_error_303 = http_error_307 = http_error_302

~\anaconda3\lib\urllib\request.py in open(self, fullurl, data, timeout)
    529         for processor in self.process_response.get(protocol, []):
    530             meth = getattr(processor, meth_name)
--> 531             response = meth(req, response)
    532 
    533         return response

~\anaconda3\lib\urllib\request.py in http_response(self, request, response)
    638         # request was successfully received, understood, and accepted.
    639         if not (200 <= code < 300):
--> 640             response = self.parent.error(
    641                 'http', request, response, code, msg, hdrs)
    642 

~\anaconda3\lib\urllib\request.py in error(self, proto, *args)
    567         if http_err:
    568             args = (dict, 'default', 'http_error_default') + orig_args
--> 569             return self._call_chain(*args)
    570 
    571 # XXX probably also want an abstract factory that knows when it makes

~\anaconda3\lib\urllib\request.py in _call_chain(self, chain, kind, meth_name, *args)
    500         for handler in handlers:
    501             func = getattr(handler, meth_name)
--> 502             result = func(*args)
    503             if result is not None:
    504                 return result

~\anaconda3\lib\urllib\request.py in http_error_default(self, req, fp, code, msg, hdrs)
    647 class HTTPDefaultErrorHandler(BaseHandler):
    648     def http_error_default(self, req, fp, code, msg, hdrs):
--> 649         raise HTTPError(req.full_url, code, msg, hdrs, fp)
    650 
    651 class HTTPRedirectHandler(BaseHandler):

HTTPError: HTTP Error 429: Too Many Requests

HTTPError回溯（最近一次调用）
在里面
53返回ll
54
--->55意大利电影1932年=电影维基列表（范围（19321934））
56部意大利电影(1932)(1933)
电影中的维基列表（年跨度）
49 ll=[]
50年（以年为单位）：
--->51 ll+=获取维基列表意大利电影（年）
52次。睡眠（10）
53返回ll
在get_wiki_list_意大利电影（年）
41如果b：
42 t=b.text
--->43如果len（t）>4且TextBlob（t）.detect_language（）=“it”：
44 list_wiki.append（t.strip（）+“”+str（年））
45
检测语言中的~\anaconda3\lib\site packages\textblob\blob.py（self）
566:rtype:str
567         """
-->568返回self.translator.detect（self.raw）
569
570 def正确（自）：
检测中的~\anaconda3\lib\site packages\textblob\translate.py（自身、源、主机、类型）
70数据={“q”：源}
71 url=u'{url}&sl=auto&tk={tk}'。格式（url=self.url，tk=_-calculate_-tk（源））
--->72响应=self.\u请求（url，主机=host，类型=type，数据=data）
73结果，语言=json.loads（响应）
74返回语言
请求中的~\anaconda3\lib\site packages\textblob\translate.py（self、url、host、type、data）
90如果主机或类型为：
91请求设置代理（主机=主机，类型=类型）
--->92 resp=request.urlopen（请求）
93内容=分别读取（）
94返回内容。解码（'utf-8'）
urlopen中的~\anaconda3\lib\urllib\request.py（url、数据、超时、cafile、capath、cadefault、上下文）
220其他：
221开瓶器=_开瓶器
-->222返回opener.open（url、数据、超时）
223
224 def安装_开启器（开启器）：
~\anaconda3\lib\urllib\request.py处于打开状态（self、fullurl、数据、超时）
529用于self.process\u response.get（协议，[]）中的处理器：
530 meth=getattr（处理器，meth\u名称）
-->531响应=方法（请求，响应）
532
533返回响应
http\u响应中的~\anaconda3\lib\urllib\request.py（self、request、response）
638#请求已成功接收、理解并接受。
639如果不是（200 640响应=self.parent.error(
641“http”、请求、响应、代码、消息、hdrs）
642
~\anaconda3\lib\urllib\request.py出错（self、proto、*args）
561 http_err=0
562 args=（dict，proto，meth_name）+args
-->563结果=自调用链（*args）
564如果结果：
565返回结果
调用链中的~\anaconda3\lib\urllib\request.py（self、chain、kind、meth\u name、*args）
500对于处理程序中的处理程序：
501 func=getattr（处理程序，方法名称）
-->502结果=函数（*args）
503如果结果不是无：
504返回结果
http\u error\u 302中的~\anaconda3\lib\urllib\request.py（self、req、fp、code、msg、headers）
753 fp.close（）
754
-->755返回self.parent.open（新建，超时=请求超时）
756
757 http_error\u 301=http_error\u 303=http_error\u 307=http_error\u 302
~\anaconda3\lib\urllib\request.py处于打开状态（self、fullurl、数据、超时）
529用于self.process\u response.get（协议，[]）中的处理器：
530 meth=getattr（处理器，meth\u名称）
-->531响应=方法（请求，响应）
532
533返回响应
http\u响应中的~\anaconda3\lib\urllib\request.py（self、request、response）
638#请求已成功接收、理解并接受。
639如果不是（200 640响应=self.parent.error(
641“http”、请求、响应、代码、消息、hdrs）
642
~\anaconda3\lib\urllib\request.py出错（self、proto、*args）
567如果http_错误：
568参数=（dict，“default”，“http\u error\u default”）+原始参数
-->569返回自调用链（*args）
570
571#XXX可能还想要一个知道何时生产的抽象工厂
调用链中的~\anaconda3\lib\urllib\request.py（self、chain、kind、meth\u name、*args）
500对于处理程序中的处理程序：
501 func=getattr（处理程序，方法名称）
-->502结果=函数（*args）
503如果结果不是无：
504返回结果
http\u error\u默认值中的~\anaconda3\lib\urllib\request.py（self、req、fp、code、msg、hdrs）
647类HTTPDefaultErrorHandler（BaseHandler）：
648 def http_错误_默认值（self、req、fp、code、msg、hdrs）：
-->649 raise HTTPError（请求完整的url、代码、消息、hdrs、fp）
650
651类HTTPRedirectHandler（BaseHandler）：
HTTPError:HTTP错误429:请求太多

根据维基百科的API指南：和

对读取请求没有严格的限制，但我们要求您要考虑周全，尽量不要让站点宕机。大多数系统管理员都会保留如果你确实危及安全，有权不客气地阻止你他们场地的稳定性

如果您以串联方式而不是并行方式提出请求（即等待在发送新请求之前完成一个请求，例如你永远不会在同一时间提出多个请求），那么你当然可以。也可以尝试将内容合并到一个请求中（例如，在titles参数中使用多个title，而不是为每个tit发出新的请求）

!pip install googletrans
from googletrans import Translator
translator = Translator()

TextBlob(t).detect_language() == 'it'

 translator.detect(t).lang == 'it'

try:
   urlpage=urllib.request.urlopen('url')

except:
   time.sleep(1)
   urlpage=urllib.request.urlopen('url')