Python "See also" web crawler

I have a problem writing a web crawler for Wikipedia. The crawler has to display the "See also" section of a given link. On top of that, it also has to display the "See also" section of every link found in that first "See also" section. For example, this Wiki page: its "See also" section contains this page, and that page in turn contains the following:
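
Roughly, what I am aiming for is something like the sketch below (just a rough outline, not my actual code: see_also_links and crawl_see_also are names I made up for illustration, and it assumes the raw page HTML marks the section heading with id="See_also"):

    import re
    import urllib.request

    def see_also_links(url):
        #Fetch the page and return the /wiki/... links inside its "See also" section
        req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
        html = urllib.request.urlopen(req).read().decode('utf-8', errors='ignore')
        start = html.find('id="See_also"')
        if start == -1:
            return []
        end = html.find('<h2', start + 1)       #start of the next section heading
        section = html[start:end] if end != -1 else html[start:]
        paths = re.findall(r'href="(/wiki/[^"#:]+)"', section)
        return ["https://en.wikipedia.org" + p for p in paths]

    def crawl_see_also(url, depth, seen=None):
        #Print the "See also" links of url, then do the same once for each of them
        if seen is None:
            seen = set()
        if depth == 0 or url in seen:
            return
        seen.add(url)
        links = see_also_links(url)
        print(url + " -> " + str(links))
        for link in links:
            crawl_see_also(link, depth - 1, seen)

    crawl_see_also("https://en.wikipedia.org/wiki/Spacetime", depth=2)

With depth=2 this would print the "See also" links of the start page and then, one level deeper, the "See also" links of each of those pages.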

This example is based on a single link, but a "See also" section has more than 10 links, and that is what I need to handle. I also have to do it recursively. This is what my draft looks like, but it gives me errors and it doesn't work the way it should (it's not even recursive) :D

        #Import Libraries
        import time     #For Delay
        import urllib.request    #Extracting web pages
        import re
    
        #Defining pages
        starting_page = "https://en.wikipedia.org/wiki/Spacetime"
        seed_page = "https://en.wikipedia.org"  #Crawling the English Wikipedia
    
        #Downloading entire Web Document (Raw Page Content)
        def download_page(url):
            try:
                headers = {}
                headers['User-Agent'] = "Mozilla/5.0 (X11; Linux i686) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.27 Safari/537.17"
                req = urllib.request.Request(url, headers = headers)
                resp = urllib.request.urlopen(req)
                respData = str(resp.read())
                return respData
            except Exception as e:
                print(str(e))
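
        #NOTE (not part of the original draft): resp.read() returns bytes, so
        #str(resp.read()) produces a "b'...'" literal in which every newline is
        #the two characters backslash and n. That is likely why replace('\n', ',')
        #further down never matches anything. Decoding would avoid this, e.g.:
        #    respData = resp.read().decode('utf-8', errors='ignore')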
    
        #Extract the "See also" section elements
        def extract_see_also(page):
            if 'id="See_also">' in page:
                start_see_also = page.find('id="See_also">')
                start_list_items = page.find('<li>', start_see_also + 1)
                end_see_also = page.find('<h2>', start_list_items + 1)
                see_also_section = page[start_list_items: end_see_also]
                pure_item_raw = (re.sub(r'<.+?>', '', see_also_section)).replace('\n', ',')
                pure_item_raw2 = pure_item_raw.replace(',,', ',')
                pure_item = pure_item_raw2.replace(',,', ',')
                flag = 0
            else:
                pure_item = "No Related Links"
                flag = 1
            return pure_item, flag
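
        #NOTE (not part of the original draft, just a sketch): extract_see_also
        #above keeps only the link text after stripping the tags, so there are no
        #URLs left to feed back into the crawl, which is one reason nothing ever
        #recurses. A variant that returns the hrefs instead could look like this:
        def extract_see_also_links(page):
            if 'id="See_also"' in page:
                start_see_also = page.find('id="See_also"')
                end_see_also = page.find('<h2', start_see_also + 1)
                see_also_section = page[start_see_also: end_see_also]
                return re.findall(r'href="(/wiki/[^"#:]+)"', see_also_section)
            return []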
    
        #Getting all links with the help of 'get_next_links'
        def get_all_links(page):
            links = []
            while True:
                link, end_link = get_next_link(page)
                if link == "no_links":
                    break
                else:
                    links.append(link)      #Append all the links in the list named 'Links'
                    #time.sleep(0.1)
                    page = page[end_link:]
            return links 
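
        #NOTE (not part of the original draft, just a guess at the missing piece):
        #get_next_link is called by get_all_links above but is never defined,
        #which is one of the errors. A minimal version that scans for the next
        #href attribute might look like this:
        def get_next_link(page):
            start_link = page.find('<a href=')
            if start_link == -1:
                return "no_links", 0        #Tells get_all_links to stop
            start_quote = page.find('"', start_link)
            end_quote = page.find('"', start_quote + 1)
            link = page[start_quote + 1: end_quote]
            return link, end_quote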
    
        #Crawl Initiation
        #Check for file type in URL so crawler does not crawl images and text files
        def extension_scan(url):
            a = ['.png','.jpg','.jpeg','.gif','.tif','.txt']
            j = 0
            while j < (len(a)):
                if a[j] in url:
                    #print("There!")
                    flag2 = 1
                    break
                else:
                    #print("Not There!")
                    flag2 = 0
                    j = j+1
            #print(flag2)
            return flag2
    
        #URL parsing for incomplete or duplicate URLs
        def url_parse(url):
            try:
                from urllib.parse import urlparse
            except ImportError:
                from urlparse import urlparse
            url = url  #.lower()    #Make it lower case
            s = urlparse(url)       #parse the given url
            seed_page_n = seed_page #.lower()       #Make it lower case
            #t = urlparse(seed_page_n)     #parse the seed page (reference page)
            i = 0
            flag = 0
            while i<=9:
                if url == "/":
                    url = seed_page_n
                    flag = 0  
                elif not s.scheme:
                    url = "http://" + url
                    flag = 0
                elif "#" in url:
                    url = url[:url.find("#")]
                    flag = 0
                elif "?" in url:
                    url = url[:url.find("?")]
                    flag = 0
                elif s.netloc == "":
                    url = seed_page + s.path
                    flag = 0
                #elif "www" not in url:
                #    url = "www."[:7] + url[7:]
                #    flag = 0
    
                elif url[len(url)-1] == "/":
                    url = url[:-1]
                    flag = 0
                #elif s.netloc != t.netloc:
                #    url = url
                #    flag = 1
                #    break        
                else:
                    url = url
                    flag = 0
                    break
    
                i = i+1
                s = urlparse(url)   #Parse after every loop to update the values of url parameters
            return(url, flag)
    
    
    
        t0 = time.time()
        database = {}   #Create a dictionary
    
        #Main crawl function that calls all the above functions and crawls the entire site sequentially
        def web_crawl():  
            to_crawl = [starting_page]      #Define list name 'Seed Page'
            #print(to_crawl)
            crawled = []      #List of URLs that have already been crawled
            #database = {}   #Create a dictionary
            #k = 0;
            for k in range(0, 3):
                i=0        #Initiate Variable to count No. of Iterations
                while i<3:     #Crawl up to three URLs in this pass
                    urll = to_crawl.pop(0)      #If there are elements in to_crawl then pop out the first element
                    urll,flag = url_parse(urll)
                    #print(urll)
                    flag2 = extension_scan(urll)
                    time.sleep(3)
    
                    #If flag = 1, then the URL is outside the seed domain URL
                    if flag == 1 or flag2 == 1:
                        pass        #Do Nothing
    
                    else:       
                        if urll in crawled:     #Else check if the URL is already crawled
                            pass        #Do Nothing
                        else:       #If the URL is not already crawled, then crawl it and extract all the links from it
                            print("Link = " + urll)
    
                            raw_html = download_page(urll)
                            #print(raw_html)
    
    
                            see_also,flag2 = extract_see_also(raw_html)
                            print("Related Links = " + see_also)
    
    
                            crawled.append(urll)                  
    
                            #Remove duplicates from to_crawl
                            n = 1
                            j = 0
                            #k = 0
                            while j < (len(to_crawl)-n):
                                if to_crawl[j] in to_crawl[j+1:(len(to_crawl)-1)]:
                                    to_crawl.pop(j)
                                    n = n+1
                                else:
                                    pass     #Do Nothing
                                j = j+1
                        i=i+1
    
                        #print(to_crawl)
                        #print("Iteration No. = " + str(i))
                        #print("To Crawl = " + str(len(to_crawl)))
                        #print("Crawled = " + str(len(crawled)))
            return ""
    
        print (web_crawl())
    
        t1 = time.time()
        total_time = t1-t0