Python 2.7 简单网络爬虫_Python 2.7_Beautifulsoup

Python 2.7 简单网络爬虫

python-2.7

Python 2.7 简单网络爬虫,python-2.7,beautifulsoup,Python 2.7,Beautifulsoup,我用python为非常简单的网络爬虫编写了下面的程序，但当我运行它时，它返回给我 “非类型”对象不可调用，请帮助我好吗 import BeautifulSoup import urllib2 def union(p,q): for e in q: if e not in p: p.append(e) def crawler(SeedUrl): tocrawl=[SeedUrl] crawled=[] while tocra

我用 Python 写了下面这个非常简单的网络爬虫程序，但运行时报错：'NoneType' object is not callable（“NoneType”对象不可调用）。请帮帮我，好吗？

import BeautifulSoup
import urllib2
def union(p,q):
    """Merge ``q`` into ``p`` in place, skipping values ``p`` already holds.

    Each element of ``q`` is visited in order and appended to the end of
    ``p`` only when an equal element is not already there, so repeats inside
    ``q`` are added at most once.  Nothing is returned; ``p`` is mutated.
    """
    for candidate in q:
        if candidate in p:
            continue
        p.append(candidate)

def crawler(SeedUrl):
    """Crawl outward from ``SeedUrl`` and return the list of pages processed.

    ``tocrawl`` is used as a stack (``pop()`` takes the most recently added
    entry) and ``crawled`` records each page the first time it is handled.

    NOTE(review): ``soup('a')`` yields the ``<a>`` tag objects themselves,
    not their ``href`` strings, and those objects are what ``union`` puts on
    ``tocrawl``.  Once such an entry is popped, ``page`` is a tag rather than
    a URL string when it reaches ``urllib2.urlopen`` -- presumably the source
    of the "'NoneType' object is not callable" error; confirm with the full
    traceback.
    """
    tocrawl=[SeedUrl]
    crawled=[]
    while tocrawl:
        page=tocrawl.pop()
        # The page is downloaded before the ``crawled`` check below, so an
        # entry that is popped again is fetched again.
        pagesource=urllib2.urlopen(page)
        s=pagesource.read()
        soup=BeautifulSoup.BeautifulSoup(s)
        # Calling the soup object with a tag name collects the matching tags.
        links=soup('a')        
        if page not in crawled:
            union(tocrawl,links)
            crawled.append(page)

    return crawled
# Start crawling from the Princeton home page when the script runs; the
# returned list of crawled pages is not stored or printed.
crawler('http://www.princeton.edu/main/')

[更新]以下是完整的项目代码

[ANSWER]

`soup('a')` 返回的是完整的 HTML 标签（而不是其中的 URL），例如：

<a href="http://itunes.apple.com/us/store">Buy Music Now</a>

您还需要验证url。请参阅以下步骤

我再次建议您使用 Python 的集合（set）代替列表，这样可以方便地添加 URL 并忽略（omit）重复的 URL。

尝试以下代码：

import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup

# Matches an absolute URL; the fragments below are joined, in order, into one
# case-insensitive pattern.
regex = re.compile(
    ''.join([
        r'^(?:http|ftp)s?://',  # scheme: http, https, ftp or ftps
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # host: dotted domain name ...
        r'localhost|',  # ... or the literal "localhost" ...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',  # ... or four dot-separated groups of 1-3 digits
        r'(?::\d+)?',  # optional ":port"
        r'(?:/?|[/?]\S+)$',  # optional path/query without whitespace, up to the end
    ]),
    re.IGNORECASE,
)

def isValidUrl(url):
    """Return True when ``url`` matches the module-level ``regex``, else False."""
    return regex.match(url) is not None

def crawler(SeedUrl):
    """Crawl the web starting at ``SeedUrl`` and return the fetched URLs.

    Pending URLs are kept on a stack, so the crawl is depth-first: the link
    queued last is visited first.  Only ``href`` values accepted by
    ``isValidUrl`` are followed.  The crawl has no page or domain limit and
    ends only when no unvisited URL is left.

    Args:
        SeedUrl: URL of the first page to fetch.

    Returns:
        A list of the URLs that were fetched, each appearing once, in the
        order they were first fetched.

    Raises:
        Whatever ``urllib2.urlopen`` raises for a page that cannot be
        retrieved; the crawl stops at that point.
    """
    tocrawl=[SeedUrl]
    crawled=[]
    # Set twin of ``crawled``: constant-time "already visited?" checks while
    # the list keeps the visiting order for the return value.
    visited=set()
    while tocrawl:
        page=tocrawl.pop()
        # Decide *before* downloading: the same URL can be queued many times
        # (every page that links to it adds it again), and it must not be
        # fetched or reported more than once.
        if page in visited:
            continue
        visited.add(page)
        # Parenthesised so the line is valid both as a Python 2 print
        # statement and as a Python 3 function call; the output is unchanged.
        print('Crawled:'+page)
        pagesource=urllib2.urlopen(page)
        try:
            s=pagesource.read()
        finally:
            # Release the connection even if reading fails.
            pagesource.close()
        soup=BeautifulSoup.BeautifulSoup(s)
        # href=True keeps only anchors that actually carry an href attribute.
        for l in soup.findAll('a',href=True):
            href=l['href']
            if isValidUrl(href):
                tocrawl.append(href)
        crawled.append(page)
    return crawled
# Start crawling from the Princeton home page when the script runs; the
# returned list of crawled pages is not stored or printed.
crawler('http://www.princeton.edu/main/')

[更新]以下是完整的项目代码

[ANSWER]

`soup('a')` 返回的是完整的 HTML 标签（而不是其中的 URL），例如：

<a href="http://itunes.apple.com/us/store">Buy Music Now</a>

您还需要验证url。请参阅以下步骤

我再次建议您使用 Python 的集合（set）代替列表，这样可以方便地添加 URL 并忽略（omit）重复的 URL。

尝试以下代码：

import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup

# Matches an absolute URL; the fragments below are joined, in order, into one
# case-insensitive pattern.
regex = re.compile(
    ''.join([
        r'^(?:http|ftp)s?://',  # scheme: http, https, ftp or ftps
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|',  # host: dotted domain name ...
        r'localhost|',  # ... or the literal "localhost" ...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})',  # ... or four dot-separated groups of 1-3 digits
        r'(?::\d+)?',  # optional ":port"
        r'(?:/?|[/?]\S+)$',  # optional path/query without whitespace, up to the end
    ]),
    re.IGNORECASE,
)

def isValidUrl(url):
    """Return True when ``url`` matches the module-level ``regex``, else False."""
    return regex.match(url) is not None

def crawler(SeedUrl):
    """Crawl the web starting at ``SeedUrl`` and return the fetched URLs.

    Pending URLs are kept on a stack, so the crawl is depth-first: the link
    queued last is visited first.  Only ``href`` values accepted by
    ``isValidUrl`` are followed.  The crawl has no page or domain limit and
    ends only when no unvisited URL is left.

    Args:
        SeedUrl: URL of the first page to fetch.

    Returns:
        A list of the URLs that were fetched, each appearing once, in the
        order they were first fetched.

    Raises:
        Whatever ``urllib2.urlopen`` raises for a page that cannot be
        retrieved; the crawl stops at that point.
    """
    tocrawl=[SeedUrl]
    crawled=[]
    # Set twin of ``crawled``: constant-time "already visited?" checks while
    # the list keeps the visiting order for the return value.
    visited=set()
    while tocrawl:
        page=tocrawl.pop()
        # Decide *before* downloading: the same URL can be queued many times
        # (every page that links to it adds it again), and it must not be
        # fetched or reported more than once.
        if page in visited:
            continue
        visited.add(page)
        # Parenthesised so the line is valid both as a Python 2 print
        # statement and as a Python 3 function call; the output is unchanged.
        print('Crawled:'+page)
        pagesource=urllib2.urlopen(page)
        try:
            s=pagesource.read()
        finally:
            # Release the connection even if reading fails.
            pagesource.close()
        soup=BeautifulSoup.BeautifulSoup(s)
        # href=True keeps only anchors that actually carry an href attribute.
        for l in soup.findAll('a',href=True):
            href=l['href']
            if isValidUrl(href):
                tocrawl.append(href)
        crawled.append(page)
    return crawled
# Start crawling from the Princeton home page when the script runs; the
# returned list of crawled pages is not stored or printed.
crawler('http://www.princeton.edu/main/')

你能发布完整的回溯（traceback）吗？这至少应该能缩小对 None 值进行函数调用的范围。

您可以发布完整的回溯（traceback）吗？这至少应该能缩小对 None 值进行函数调用的范围。