Python "See also" web crawler
I am having trouble writing a web crawler for Wikipedia. The crawler needs to display the "See also" section of a given link. On top of that, it must also display the "See also" section of every link found in that first "See also" section. For example: this Wiki page: its "See also" section contains this page, and that page in turn contains the following: This example is based on a single link, but a "See also" section holds 10+ links, and that is what I need to build. I also have to do it recursively; a rough sketch of the recursive idea is included after the draft. This is what my draft looks like, but it gives me errors and it does not work the way it should (it is not even recursive) :D
#Import Libraries
import time #For Delay
import urllib.request #Extracting web pages
import re

#Defining pages
starting_page = "https://en.wikipedia.org/wiki/Spacetime"
seed_page = "https://en.wikipedia.org" #Crawling the English Wikipedia

#Downloading entire Web Document (Raw Page Content)
def download_page(url):
    try:
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
        req = urllib.request.Request(url, headers = headers)
        resp = urllib.request.urlopen(req)
        respData = str(resp.read())
        return respData
    except Exception as e:
        print(str(e))

#Extract the "See also" section elements
def extract_see_also(page):
    if 'id="See_also">' in page:
        start_see_also = page.find('id="See_also">')
        start_list_items = page.find('<li>', start_see_also + 1)
        end_see_also = page.find('<h2>', start_list_items + 1)
        see_also_section = page[start_list_items: end_see_also]
        pure_item_raw = (re.sub(r'<.+?>', '', see_also_section)).replace('\n', ',')
        pure_item_raw2 = pure_item_raw.replace(',,', ',')
        pure_item = pure_item_raw2.replace(',,', ',')
        flag = 0
    else:
        pure_item = "No Related Links"
        flag = 1
    return pure_item, flag

#Getting all links with the help of 'get_next_link'
def get_all_links(page):
    links = []
    while True:
        link, end_link = get_next_link(page)
        if link == "no_links":
            break
        else:
            links.append(link) #Append each extracted link to the list named 'links'
            #time.sleep(0.1)
            page = page[end_link:]
    return links

#Crawl Initiation
#Check for file type in URL so crawler does not crawl images and text files
def extension_scan(url):
    a = ['.png','.jpg','.jpeg','.gif','.tif','.txt']
    j = 0
    while j < (len(a)):
        if a[j] in url:
            #print("There!")
            flag2 = 1
            break
        else:
            #print("Not There!")
            flag2 = 0
            j = j+1
    #print(flag2)
    return flag2

#URL parsing for incomplete or duplicate URLs
def url_parse(url):
    try:
        from urllib.parse import urlparse
    except ImportError:
        from urlparse import urlparse
    url = url #.lower() #Make it lower case
    s = urlparse(url) #parse the given url
    seed_page_n = seed_page #.lower() #Make it lower case
    #t = urlparse(seed_page_n) #parse the seed page (reference page)
    i = 0
    flag = 0
    while i<=9:
        if url == "/":
            url = seed_page_n
            flag = 0
        elif not s.scheme:
            url = "http://" + url
            flag = 0
        elif "#" in url:
            url = url[:url.find("#")]
            flag = 0
        elif "?" in url:
            url = url[:url.find("?")]
            flag = 0
        elif s.netloc == "":
            url = seed_page + s.path
            flag = 0
        #elif "www" not in url:
        #    url = "www."[:7] + url[7:]
        #    flag = 0
        elif url[len(url)-1] == "/":
            url = url[:-1]
            flag = 0
        #elif s.netloc != t.netloc:
        #    url = url
        #    flag = 1
        #    break
        else:
            url = url
            flag = 0
            break
        i = i+1
        s = urlparse(url) #Parse after every loop to update the values of url parameters
    return(url, flag)

t0 = time.time()
database = {} #Create a dictionary

#Main crawl function that calls all the above functions and crawls the site sequentially
def web_crawl():
    to_crawl = [starting_page] #List of URLs still to be crawled, seeded with the starting page
    #print(to_crawl)
    crawled=[] #List of URLs already crawled
    #database = {} #Create a dictionary
    #k = 0;
    for k in range(0, 3):
        i=0 #Initiate Variable to count No. of Iterations
        while i<3: #Loop over at most 3 URLs in this pass
            urll = to_crawl.pop(0) #If there are elements in to_crawl then pop out the first element
            urll,flag = url_parse(urll)
            #print(urll)
            flag2 = extension_scan(urll)
            time.sleep(3)
            #If flag = 1, then the URL is outside the seed domain URL
            if flag == 1 or flag2 == 1:
                pass #Do Nothing
            else:
                if urll in crawled: #Else check if the URL is already crawled
                    pass #Do Nothing
                else: #If the URL is not already crawled, then crawl it and extract all the links from it
                    print("Link = " + urll)
                    raw_html = download_page(urll)
                    #print(raw_html)
                    see_also,flag2 = extract_see_also(raw_html)
                    print("Related Links = " + see_also)
                    crawled.append(urll)
            #Remove duplicates from to_crawl
            n = 1
            j = 0
            #k = 0
            while j < (len(to_crawl)-n):
                if to_crawl[j] in to_crawl[j+1:(len(to_crawl)-1)]:
                    to_crawl.pop(j)
                    n = n+1
                else:
                    pass #Do Nothing
                j = j+1
            i=i+1
            #print(to_crawl)
            #print("Iteration No. = " + str(i))
            #print("To Crawl = " + str(len(to_crawl)))
            #print("Crawled = " + str(len(crawled)))
    return ""

print (web_crawl())
t1 = time.time()
total_time = t1-t0
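
For reference, below is a minimal sketch of the recursive "See also" crawl described in the question. It is only an illustration under a few assumptions, not code taken from the post: it assumes the rendered Wikipedia HTML marks the heading with id="See_also" and that the section ends at the next <h2> tag, and the helper names (fetch, see_also_links, crawl_see_also), the depth limit, and the one-second delay are made up for the example.

#Sketch only: fetch a page, pull the "See also" section, and recurse on its links
import re
import time
import urllib.request

SEED = "https://en.wikipedia.org"
HEADERS = {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"}

def fetch(url):
    #Download the raw HTML of a page as text
    req = urllib.request.Request(url, headers=HEADERS)
    with urllib.request.urlopen(req) as resp:
        return resp.read().decode("utf-8", errors="replace")

def see_also_links(page):
    #Return full URLs for the /wiki/... links inside the "See also" section
    start = page.find('id="See_also"')          #assumed anchor of the section heading
    if start == -1:
        return []
    end = page.find("<h2", start + 1)           #assume the section ends at the next <h2>
    section = page[start:end if end != -1 else len(page)]
    #Keep plain article links; skip anchors and namespaced pages such as File: or Category:
    hrefs = re.findall(r'href="(/wiki/[^"#:]+)"', section)
    return [SEED + h for h in hrefs]

def crawl_see_also(url, depth, crawled=None):
    #Print the "See also" links of url, then recurse into each of them
    if crawled is None:
        crawled = set()
    if depth == 0 or url in crawled:
        return
    crawled.add(url)
    links = see_also_links(fetch(url))
    print("Link = " + url)
    for link in links:
        print("    Related Link = " + link)
    time.sleep(1)                               #Delay between requests, as in the draft
    for link in links:
        crawl_see_also(link, depth - 1, crawled)

crawl_see_also("https://en.wikipedia.org/wiki/Spacetime", depth=2)

The crawled set keeps a page from being visited twice and the depth argument bounds the recursion, which is one way to stop the chain of "See also" sections from running forever.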