Making my own web crawler in Python that shows the main idea of page rank
I am trying to make a web crawler that shows the basic idea of page rank. The code seems fine to me, but it returns an error, e.g.:
`Traceback (most recent call last):
  File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 89, in <module>
    webpages()
  File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 17, in webpages
    get_single_item_data(href)
  File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 23, in get_single_item_data
    source_code = requests.get(item_url)
  File "C:\Python34\lib\site-packages\requests\api.py", line 65, in get
    return request('get', url, **kwargs)
  File "C:\Python34\lib\site-packages\requests\api.py", line 49, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Python34\lib\site-packages\requests\sessions.py", line 447, in request
    prep = self.prepare_request(req)
  File "C:\Python34\lib\site-packages\requests\sessions.py", line 378, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Python34\lib\site-packages\requests\models.py", line 303, in prepare
    self.prepare_url(url, params)
  File "C:\Python34\lib\site-packages\requests\models.py", line 360, in prepare_url
    "Perhaps you meant http://{0}?".format(url))
requests.exceptions.MissingSchema: Invalid URL '//www.hm.com/lv/logout': No schema supplied. Perhaps you meant http:////www.hm.com/lv/logout?`
Maybe the problem is the two slashes (//), but in any case something is wrong: when I try to crawl other web pages, e.g. http://en.wikipedia.org/wiki/Wiki, it returns None and the same error.
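As a minimal sketch of what seems to be happening (assuming only that requests is installed): requests refuses protocol-relative URLs, i.e. hrefs that start with //, which is exactly what the crawler scrapes from the page's anchor tags.

import requests

# hrefs scraped from a page can be protocol-relative (start with //);
# requests cannot guess the scheme and raises MissingSchema.
try:
    requests.get('//www.hm.com/lv/logout')
except requests.exceptions.MissingSchema as exc:
    print(exc)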
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from operator import itemgetter

all_links = defaultdict(int)

def webpages():
    url = 'http://www.hm.com/lv/'
    source_code = requests.get(url)
    text = source_code.text
    soup = BeautifulSoup(text)
    for link in soup.findAll('a'):
        href = link.get('href')
        print(href)
        get_single_item_data(href)
    return all_links

def get_single_item_data(item_url):
    #if not item_url.startswith('http'):
    #    item_url = 'http' + item_url
    source_code = requests.get(item_url)
    text = source_code.text
    soup = BeautifulSoup(text)
    for link in soup.findAll('a'):
        href = link.get('href')
        if href and href.startswith('http://www.'):
            if href:
                all_links[href] += 1
                print(href)

def sort_algorithm(list):
    for index in range(1, len(list)):
        value = list[index]
        i = index - 1
        while i >= 0:
            if value < list[i]:
                list[i+1] = list[i]
                list[i] = value
                i = i - 1
            else:
                break

vieni = ["", "viens", "divi", "tris", "cetri", "pieci",
         "sesi", "septini", "astoni", "devini"]
padsmiti = ["", "vienpadsmit", "divpadsmit", "trispadsmit", "cetrpadsmit",
            "piecpadsmit", "sespadsmit", "septinpadsmit", "astonpadsmit", "devinpadsmit"]
desmiti = ["", "desmit", "divdesmit", "trisdesmit", "cetrdesmit",
           "piecdesmit", "sesdesmit", "septindesmit", "astondesmit", "devindesmit"]

def num_to_words(n):
    words = []
    if n == 0:
        words.append("zero")
    else:
        num_str = "{}".format(n)
        groups = (len(num_str) + 2) // 3
        num_str = num_str.zfill(groups * 3)
        for i in range(0, groups * 3, 3):
            h = int(num_str[i])
            t = int(num_str[i + 1])
            u = int(num_str[i + 2])
            print()
            print(vieni[i])
            g = groups - (i // 3 + 1)
            if h >= 1:
                words.append(vieni[h])
                words.append("hundred")
                if int(num_str) % 100:
                    words.append("and")
            if t > 1:
                words.append(desmiti[t])
                if u >= 1:
                    words.append(vieni[u])
            elif t == 1:
                if u >= 1:
                    words.append(padsmiti[u])
            else:
                if u >= 1:
                    words.append(vieni[u])
    return " ".join(words)

webpages()
for k, v in sorted(webpages().items(), key=itemgetter(1), reverse=True):
    print(k, num_to_words(v))
The links from the loop in your webpages function may start with two slashes. That means the link reuses the current page's scheme: for example, opening //en.wikipedia.org/login from an https:// page opens https://en.wikipedia.org/login, and from an http:// page it opens http://en.wikipedia.org/login.

A better way to open a URL taken from an HTML <a> tag is the urlparse.urljoin function. It joins the target URL with the current one, and it does not matter whether the path is absolute or relative.
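A short sketch of what urljoin does. Note that the traceback shows Python 3.4, where the function lives in urllib.parse rather than the Python 2 urlparse module:

from urllib.parse import urljoin  # Python 2: from urlparse import urljoin

base = 'http://www.hm.com/lv/'
print(urljoin(base, '//www.hm.com/lv/logout'))  # http://www.hm.com/lv/logout
print(urljoin(base, '/en/help'))                # http://www.hm.com/en/help
print(urljoin(base, 'logout'))                  # http://www.hm.com/lv/logout
print(urljoin(base, 'http://en.wikipedia.org/wiki/Wiki'))  # absolute URLs pass through unchanged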
Hope this helps.

Don't be confused by the word lists being in my language; vieni simply means units = ["", "one", ...] and so on. I am not familiar with the urljoin function, though. Could you tell me its syntax? Do you mean something like calling urlparse.urljoin('http://www.') before print(href) and get_single_item_data(href) inside the loop of webpages?

In the webpages function, change every get_single_item_data(href) to get_single_item_data(urljoin(url, href)), and add from urlparse import urljoin to the imports.
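Applied to the code in the question, the suggested change would look roughly like this. This is a sketch, not the answerer's exact code; since the traceback shows Python 3.4, it imports urljoin from urllib.parse instead of the Python 2 urlparse module:

import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.parse import urljoin  # Python 2: from urlparse import urljoin

all_links = defaultdict(int)

def webpages():
    url = 'http://www.hm.com/lv/'
    source_code = requests.get(url)
    soup = BeautifulSoup(source_code.text)
    for link in soup.findAll('a'):
        href = link.get('href')
        if href:
            # urljoin resolves relative and protocol-relative hrefs
            # (e.g. //www.hm.com/lv/logout) against the page URL, so
            # requests always receives a URL with a scheme.
            get_single_item_data(urljoin(url, href))
    return all_links

def get_single_item_data(item_url):
    # Unchanged from the question, minus the commented-out lines.
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.text)
    for link in soup.findAll('a'):
        href = link.get('href')
        if href and href.startswith('http://www.'):
            all_links[href] += 1
            print(href)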