503使用python3进行谷歌搜索爬网时出错——请求，4_Python_Web Scraping_Beautifulsoup_Http Status Code 503

使用python3爬取谷歌搜索时出现503错误——requests、bs4

python web-scraping

503使用python3进行谷歌搜索爬网时出错——请求，4,python,web-scraping,beautifulsoup,http-status-code-503,Python,Web Scraping,Beautifulsoup,Http Status Code 503,我想把谷歌搜索的链接标题改成20页左右。我在前一天刚刚尝试过这段代码，它是有效的！但是今天，它给我发了503个错误我寻找解决这个问题的方法。以下是我尝试过的延迟时间（通过在25之后的行中插入'time.sleep（60）'代码） “假用户代理”库但是，看着503出错。。这是文件 import requests from bs4 import BeautifulSoup from collections import Counter #google, '소프트웨어 교육' base

我想抓取谷歌搜索结果中大约20页的链接标题。前一天我刚试过这段代码，当时它是可以正常运行的！但是今天，它却返回了503错误。

我寻找解决这个问题的方法。以下是我尝试过的

延迟时间（通过在25之后的行中插入'time.sleep（60）'代码）
“假用户代理”库

但是，仍然出现503错误。以下是代码文件：

import requests
from bs4 import BeautifulSoup
from collections import Counter

# Google search for the Korean query '소프트웨어 교육' ("software education").
# The base URL ends with "start="; get_html() appends the result offset and
# then the extra_* suffix to build the full page URL.
base_google1_url = "https://www.google.co.kr/search?q=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&safe=active&ei=rv_RWYyaKcmW0gTqsa_IDg&start="
extra_google1_url="&sa=N&biw=958&bih=954"
# Google search for the Korean query 'sw교육' ("sw education"); same
# base + offset + suffix layout as above.
base_google2_url="https://www.google.co.kr/search?q=sw%EA%B5%90%EC%9C%A1&safe=active&ei=kLzUWYONLYa30QS4r5KACA&start="
extra_google2_url="&sa=N&biw=887&bih=950"

# book.naver.com search for '소프트웨어 교육' ("software education").
# The URL ends with "page="; get_html() appends the page number.
base_naver_url = "http://book.naver.com/search/search_in.nhn?query=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&&pattern=0&orderType=rel.desc&viewType=list&searchType=bookSearch&serviceSm=service.basic&title=&author=&publisher=&isbn=&toc=&subject=&publishStartDay=&publishEndDay=&categoryId=&qdt=1&filterType=0&filterValue=&serviceIc=service.author&buyAllow=0&ebook=0&page="

# Module-wide word-frequency table, filled by word_count() and reset by main().
# Counter reference: https://docs.python.org/2/library/collections.html
cnt = Counter()


#bring search info
def get_html(site_name, content_num, headers=None, timeout=10):
    """Fetch one search-result page and return its HTML text.

    Args:
        site_name: Which search to query: 'google1', 'google2' or 'naver'.
        content_num: Value appended to the site's base URL (the ``start``
            offset for the Google URLs, the ``page`` number for Naver).
        headers: Optional dict of HTTP request headers. When omitted, a
            browser-like User-Agent is sent instead of the default
            python-requests one.
        timeout: Seconds to wait for the server before requests gives up.

    Returns:
        The response body when the server answers 200, otherwise "".

    Raises:
        ValueError: If site_name is not one of the supported sites.
    """
    if site_name == 'google1':
        url = base_google1_url + str(content_num) + extra_google1_url
    elif site_name == 'google2':
        url = base_google2_url + str(content_num) + extra_google2_url
    elif site_name == 'naver':
        url = base_naver_url + str(content_num)
    else:
        # Previously an unknown name fell through and crashed later with an
        # UnboundLocalError on `resp`; fail early with a clear message.
        raise ValueError("unknown site_name: %r" % (site_name,))

    if headers is None:
        # NOTE(review): Google reportedly answers script-like clients with
        # 503; a browser User-Agent is the commonly suggested remedy.
        headers = {
            "User-Agent": (
                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
                "AppleWebKit/537.36 (KHTML, like Gecko) "
                "Chrome/72.0.3538.102 Safari/537.36"
            ),
        }

    # A timeout keeps a stalled connection from blocking the crawl forever.
    resp = requests.get(url, headers=headers, timeout=timeout)
    if resp.status_code == 200:
        return resp.text
    return ""

def word_count(name):
    """Add every whitespace-separated word found under *name* to ``cnt``.

    Args:
        name: A node exposing a ``contents`` list (e.g. a BeautifulSoup tag)
            whose items are strings and/or further nodes.

    Returns:
        The module-level ``cnt`` Counter, updated in place.
    """
    for content in name.contents:
        if isinstance(content, str):
            # Text child: count each whitespace-separated token.
            cnt.update(content.split())
        elif hasattr(content, "contents"):
            # Nested node: it has no usable .split(), so descend into it
            # instead of failing on the call.
            word_count(content)
    return cnt



def _iter_titles(site_name, content_nums, title_tag, invalid_tags):
    """Yield each *title_tag* element of each fetched page, with *invalid_tags* unwrapped."""
    for content_num in content_nums:
        html = get_html(site_name, content_num)
        soup = BeautifulSoup(html, 'html.parser')
        for title in soup.find_all(title_tag):
            for match in title.find_all(invalid_tags):
                # unwrap() is the current name of the deprecated
                # replaceWithChildren(): drop the tag, keep its children.
                match.unwrap()
            yield title


def main():
    """Crawl the three searches, count title words and print the 20 most common.

    Resets the module-level ``cnt`` counter, fills it from 20 pages of each
    Google search and Naver book-search pages 1 to 39, removes the search
    terms themselves and single-character keys, then prints the result of
    ``most_common(20)``.
    """
    cnt.clear()

    # Google '소프트웨어 교육' search: offsets 0, 10, ..., 190 (20 pages).
    # Only the text of the <a> links inside each <h3> is counted.
    for title in _iter_titles("google1", range(0, 200, 10), 'h3', ['b']):
        for link in title.find_all('a'):
            word_count(link)

    # Google 'sw교육' search: the <a> is unwrapped too, so the <h3> itself
    # is counted; each processed title is also printed.
    for title in _iter_titles("google2", range(0, 200, 10), 'h3', ['b', 'a']):
        word_count(title)
        print(title)

    # Naver book search: pages 1..39, titles live in <dt> elements.
    for title in _iter_titles("naver", range(1, 40), 'dt',
                              ['a', 'strong', 'span', 'img']):
        word_count(title)

    #deleting useless keywords: if need to include len(k) == 1, instead of 'len(k) == 1 and ~ ' use following code --'or (len(k) == 1 and ord(k) >=33 and ord(k)<65)'
    #https://stackoverflow.com/questions/8448202/remove-more-than-one-key-from-python-dict
    # Counter's `del` does not raise when the key is absent.
    del cnt['소프트웨어'], cnt['교육']
    # Check the type first so len() is never called on a non-string key.
    for key in [k for k in cnt if isinstance(k, int) or len(k) == 1]:
        del cnt[key]

    count_20 = cnt.most_common(20)
    print(count_20)




# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

导入请求
从bs4导入BeautifulSoup
从收款进口柜台
#谷歌，'소프트웨어 교육'
基本搜索谷歌1搜索url=”https://www.google.co.kr/search?q=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&safe=active&ei=rv_RWYyaKcmW0gTqsa_IDg&start=”
额外的谷歌url=“&sa=N&biw=958&bih=954”
#谷歌软件교육'
基本搜索谷歌2搜索url=”https://www.google.co.kr/search?q=sw%EA%B5%90%EC%9C%A1&safe=active&ei=kLzUWYONLYa30QS4r5KACA&start="
额外的谷歌2\u url=“&sa=N&biw=887&bih=950”
#书，纳弗，'소프트웨어 교육'
基本导航url="http://book.naver.com/search/search_in.nhn?query=%EC%86%8C%ED%94%84%ED%8A%B8%EC%9B%A8%EC%96%B4+%EA%B5%90%EC%9C%A1&&pattern=0&orderType=rel.desc&viewType=list&searchType=bookSearch&serviceSm=service.basic&title=&author=&publisher=&isbn=&toc=&subject=&publishStartDay=&PublishDay=&categoryId=&qdt=1&filterType=0&filterValue=&serviceIc=service.author&buyAllow=0&ebook=0&page=”
#发件人：https://docs.python.org/2/library/collections.html
cnt=计数器（）
#带来搜索信息
def get_html（站点名称、内容编号）：
_html=“”
如果站点名称='google1'：
谷歌1\uURL=基本谷歌1\uURL+str（内容数量）+额外谷歌1\uURL
resp=requests.get（谷歌1_url）
elif site_name==“谷歌2”：
谷歌2\uURL=基本谷歌2\uURL+str（内容数量）+额外谷歌2\uURL
resp=requests.get（谷歌2_url）
elif站点名称=='naver'：
导航url=base\u导航url+str（content\u num）
resp=requests.get（导航url）
如果响应状态\ U代码==200：
_html=resp.text
返回html
def字数（名称）：
对于name.contents中的内容：
words=content.split（）
用文字表示：
cnt[word]+=1
计数=cnt
返回计数
def main（）：
cnt.clear（）
计数=cnt
页码=0
#带来谷歌的소프트웨어 교육' 搜索信息~~
当页码小于20时：
内容数量=页码数量*10
html=获取html（“谷歌1”，内容编号）
soup=BeautifulSoup（html，'html.parser'）
text=soup.find_all（'h3'））
无效的_标记=['b']
对于文本中的文本：
用于文本中的匹配。全部查找（无效标记）：
match.replaceWithChildren（）
名称=文本。查找所有（'a'）
对于名称中的名称：
计数=单词计数（名称）
页数+=1
页码=0
#带来谷歌的软件교육' 搜索信息~~
当页码小于20时：
内容数量=页码数量*10
html=获取html（“谷歌2”，内容编号）
soup=BeautifulSoup（html，'html.parser'）
text=soup.find_all（'h3'））
无效的_标记=['b'，'a']
对于文本中的文本：
用于文本中的匹配。全部查找（无效标记）：
match.replaceWithChildren（）
计数=单词计数（文本）
打印（文本）
页数+=1
#带上naver图书搜索信息~~
页码=1
当页码小于40时：
html=获取html（“导航”，页码）
soup=BeautifulSoup（html，'html.parser'）
文本=汤。查找所有（“dt”）
无效的_标记=['a'，'strong'，'span'，'img']
对于文本中的文本：
用于文本中的匹配。全部查找（无效标记）：
match.replaceWithChildren（）
计数=单词计数（文本）
页数+=1
#删除无用的关键字：如果需要包含len（k）==1，而不是“len（k）==1和~”使用以下代码--”或（len（k）==1和ord（k）>=33和ord（k）<65）

你可能有点想得太复杂了。对于这样的任务，我认为这段代码太多了。
在请求头（headers）中手动添加用户代理（User-Agent）。
（什么是用户代理）

把链接放进一个列表里。我认为不需要那些变量。
以下是一种实现方法：
import requests, lxml
from bs4 import BeautifulSoup

headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3538.102 Safari/537.36 Edge/18.19582"
}

links = [
    'https://www.google.com/search?q=chuck norris',
    'https://www.google.com/search?q=minecraft fandom',
    'https://www.google.com/search?q=fus ro dah'
]

for url in links:
    html = requests.get(url, headers=headers).text
    soup = BeautifulSoup(html, 'lxml')

    for titles in soup.select('.DKV0Md'):
        title = titles.text
        print(title)

    # 仅用于分隔打印结果
    print()


输出：
Chuck Norris-维基百科
查克·诺里斯：家
Chuck Norris-IMDb
Chuck Norris | Facebook
Chuck Norris（@Chuck Norris）|推特
查克·诺里斯-年龄、事实与电影-传记
101最佳Chuck Norris笑话-Chuck Norris事实-游行
Chuck Norris，著名退伍军人| millity.com
这些Chuck Norris的事实会让你更爱他。。。
官方Minecraft Wiki–Minecraft的终极资源
官方Minecraft维基-Minecraft维基-粉丝
minecraft爱好者关闭：minecraft-Reddit
2021年的900多个Minecraft粉丝创意|梦之队，我的。。。
14 Minecraft粉丝创意| Minecraft粉丝艺术、梦之队、我的。。。
Minecraft爱好者-Minecraft维基指南-IGN
不屈不挠的力量（天空边缘）|长者卷轴|粉丝
知道你的模因吗
Fus ro dah-城市词典
天空边缘：不屈不挠的力量-非官方的长者卷轴页面。。。
Fus | Thuum.org-龙语词典
60“Fus ro dah！”（老卷轴V:Skyrim）想法| Skyrim。。。

或者，你也可以使用SerpApi。这是一个付费API，提供5,000次免费试用搜索。