Making my own web crawler in Python that shows the main idea of page rank
I am trying to make a web crawler that shows the basic idea of page rank. The code seems fine to me, but it returns an error, e.g.:
`Traceback (most recent call last):
  File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 89, in <module>
    webpages()
  File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 17, in webpages
    get_single_item_data(href)
  File "C:/Users/Janis/Desktop/WebCrawler/Web_crawler.py", line 23, in get_single_item_data
    source_code = requests.get(item_url)
  File "C:\Python34\lib\site-packages\requests\api.py", line 65, in get
    return request('get', url, **kwargs)
  File "C:\Python34\lib\site-packages\requests\api.py", line 49, in request
    response = session.request(method=method, url=url, **kwargs)
  File "C:\Python34\lib\site-packages\requests\sessions.py", line 447, in request
    prep = self.prepare_request(req)
  File "C:\Python34\lib\site-packages\requests\sessions.py", line 378, in prepare_request
    hooks=merge_hooks(request.hooks, self.hooks),
  File "C:\Python34\lib\site-packages\requests\models.py", line 303, in prepare
    self.prepare_url(url, params)
  File "C:\Python34\lib\site-packages\requests\models.py", line 360, in prepare_url
    "Perhaps you meant http://{0}?".format(url))
requests.exceptions.MissingSchema: Invalid URL '//www.hm.com/lv/logout': No schema supplied. Perhaps you meant http:////www.hm.com/lv/logout?`
Maybe the problem is the two slashes (//), but in any case something is wrong: when I try to crawl other web pages, e.g. http://en.wikipedia.org/wiki/Wiki, it returns None and the same error.
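As a minimal sketch of what seems to be happening (assuming only that requests is installed): requests refuses protocol-relative URLs, i.e. hrefs that start with //, which is exactly what the crawler scrapes from the page's anchor tags.

import requests

# hrefs scraped from a page can be protocol-relative (start with //);
# requests cannot guess the scheme and raises MissingSchema.
try:
    requests.get('//www.hm.com/lv/logout')
except requests.exceptions.MissingSchema as exc:
    print(exc)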
import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from operator import itemgetter

all_links = defaultdict(int)

def webpages():
    url = 'http://www.hm.com/lv/'
    source_code = requests.get(url)
    text = source_code.text
    soup = BeautifulSoup(text)
    for link in soup.findAll('a'):
        href = link.get('href')
        print(href)
        get_single_item_data(href)
    return all_links

def get_single_item_data(item_url):
    #if not item_url.startswith('http'):
    #    item_url = 'http' + item_url
    source_code = requests.get(item_url)
    text = source_code.text
    soup = BeautifulSoup(text)
    for link in soup.findAll('a'):
        href = link.get('href')
        if href and href.startswith('http://www.'):
            if href:
                all_links[href] += 1
                print(href)

def sort_algorithm(list):
    for index in range(1, len(list)):
        value = list[index]
        i = index - 1
        while i >= 0:
            if value < list[i]:
                list[i+1] = list[i]
                list[i] = value
                i = i - 1
            else:
                break

vieni = ["", "viens", "divi", "tris", "cetri", "pieci",
         "sesi", "septini", "astoni", "devini"]
padsmiti = ["", "vienpadsmit", "divpadsmit", "trispadsmit", "cetrpadsmit",
            "piecpadsmit", "sespadsmit", "septinpadsmit", "astonpadsmit", "devinpadsmit"]
desmiti = ["", "desmit", "divdesmit", "trisdesmit", "cetrdesmit",
           "piecdesmit", "sesdesmit", "septindesmit", "astondesmit", "devindesmit"]

def num_to_words(n):
    words = []
    if n == 0:
        words.append("zero")
    else:
        num_str = "{}".format(n)
        groups = (len(num_str) + 2) // 3
        num_str = num_str.zfill(groups * 3)
        for i in range(0, groups * 3, 3):
            h = int(num_str[i])
            t = int(num_str[i + 1])
            u = int(num_str[i + 2])
            print()
            print(vieni[i])
            g = groups - (i // 3 + 1)
            if h >= 1:
                words.append(vieni[h])
                words.append("hundred")
                if int(num_str) % 100:
                    words.append("and")
            if t > 1:
                words.append(desmiti[t])
                if u >= 1:
                    words.append(vieni[u])
            elif t == 1:
                if u >= 1:
                    words.append(padsmiti[u])
            else:
                if u >= 1:
                    words.append(vieni[u])
    return " ".join(words)

webpages()
for k, v in sorted(webpages().items(), key=itemgetter(1), reverse=True):
    print(k, num_to_words(v))
The links from the loop in your webpages function may start with two slashes. That means the link reuses the current page's scheme: for example, opening //en.wikipedia.org/login from an https:// page opens https://en.wikipedia.org/login, and from an http:// page it opens http://en.wikipedia.org/login.

A better way to open a URL taken from an HTML <a> tag is the urlparse.urljoin function. It joins the target URL with the current one, and it does not matter whether the path is absolute or relative.
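A short sketch of what urljoin does. Note that the traceback shows Python 3.4, where the function lives in urllib.parse rather than the Python 2 urlparse module:

from urllib.parse import urljoin  # Python 2: from urlparse import urljoin

base = 'http://www.hm.com/lv/'
print(urljoin(base, '//www.hm.com/lv/logout'))  # http://www.hm.com/lv/logout
print(urljoin(base, '/en/help'))                # http://www.hm.com/en/help
print(urljoin(base, 'logout'))                  # http://www.hm.com/lv/logout
print(urljoin(base, 'http://en.wikipedia.org/wiki/Wiki'))  # absolute URLs pass through unchanged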
Hope this helps.

Don't be confused by the word lists being in my language; vieni simply means units = ["", "one", ...] and so on. I am not familiar with the urljoin function, though. Could you tell me its syntax? Do you mean something like calling urlparse.urljoin('http://www.') before print(href) and get_single_item_data(href) inside the loop of webpages?

In the webpages function, change every get_single_item_data(href) to get_single_item_data(urljoin(url, href)), and add from urlparse import urljoin to the imports.
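Applied to the code in the question, the suggested change would look roughly like this. This is a sketch, not the answerer's exact code; since the traceback shows Python 3.4, it imports urljoin from urllib.parse instead of the Python 2 urlparse module:

import requests
from bs4 import BeautifulSoup
from collections import defaultdict
from urllib.parse import urljoin  # Python 2: from urlparse import urljoin

all_links = defaultdict(int)

def webpages():
    url = 'http://www.hm.com/lv/'
    source_code = requests.get(url)
    soup = BeautifulSoup(source_code.text)
    for link in soup.findAll('a'):
        href = link.get('href')
        if href:
            # urljoin resolves relative and protocol-relative hrefs
            # (e.g. //www.hm.com/lv/logout) against the page URL, so
            # requests always receives a URL with a scheme.
            get_single_item_data(urljoin(url, href))
    return all_links

def get_single_item_data(item_url):
    # Unchanged from the question, minus the commented-out lines.
    source_code = requests.get(item_url)
    soup = BeautifulSoup(source_code.text)
    for link in soup.findAll('a'):
        href = link.get('href')
        if href and href.startswith('http://www.'):
            all_links[href] += 1
            print(href)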