Python 2.7 简单网络爬虫
我用 Python 为一个非常简单的网络爬虫编写了下面的程序，但运行时它报错：'NoneType' object is not callable（“NoneType”对象不可调用）。请帮帮我，好吗？（标签：python-2.7, beautifulsoup）
import BeautifulSoup
import urllib2
def union(p, q):
    """Append to ``p`` each element of ``q`` that ``p`` does not yet contain.

    ``p`` is modified in place and nothing is returned. Elements keep the
    order in which they first appear in ``q``; a value repeated inside ``q``
    is appended only once because the membership test sees earlier appends.

    :param p: list that receives the new elements.
    :param q: iterable of candidate elements.
    """
    for e in q:
        # Linear membership test on purpose: it only needs ``==``, so it
        # also works for elements that are not hashable.
        if e not in p:
            p.append(e)
def crawler(SeedUrl):
    """Crawl pages reachable from ``SeedUrl`` and return the URLs visited.

    Pages are taken from the end of the work list, downloaded with
    ``urllib2.urlopen`` and parsed with BeautifulSoup; every ``<a href>``
    found is resolved against the current page and queued if it is an
    http(s) URL.

    :param SeedUrl: absolute URL at which the crawl starts.
    :return: list of crawled URLs in the order they were fetched.

    Errors raised by ``urllib2.urlopen`` are not caught here and end the
    crawl. There is no page limit, so the loop only stops when the work
    list runs empty.
    """
    # Python 2.7 stdlib; imported locally so the fix does not depend on the
    # file's import block.
    from urlparse import urljoin

    tocrawl = [SeedUrl]
    crawled = []
    seen = set()  # fast "already fetched" lookup; ``crawled`` keeps order
    while tocrawl:
        page = tocrawl.pop()
        # Skip duplicates *before* touching the network.
        if page in seen:
            continue
        seen.add(page)
        pagesource = urllib2.urlopen(page)
        try:
            s = pagesource.read()
        finally:
            pagesource.close()
        soup = BeautifulSoup.BeautifulSoup(s)
        # soup('a') yields Tag objects, not strings; queueing those and
        # passing them to urlopen() is what raised
        # "'NoneType' object is not callable". Queue the href text instead.
        hrefs = [urljoin(page, a['href'])
                 for a in soup.findAll('a', href=True)]
        # Drop mailto:, javascript: and similar links urlopen cannot fetch.
        union(tocrawl,
              [h for h in hrefs if h.startswith(('http://', 'https://'))])
        crawled.append(page)
    return crawled
crawler('http://www.princeton.edu/main/')
[更新] 以下是完整的项目代码。[ANSWER] soup('a') 返回的是完整的 HTML 标签（Tag 对象），而不是 URL 字符串，例如：
<a href="http://itunes.apple.com/us/store">Buy Music Now</a>
因此需要先取出 href 属性，并且还需要验证 URL。请参阅以下代码：
import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup
# Pattern for absolute http/https/ftp/ftps URLs whose host is a domain
# name, "localhost" or a dotted-quad IP, with an optional port and an
# optional path/query. Matching is case-insensitive.
regex = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def isValidUrl(url):
    """Return True if ``url`` matches the module-level ``regex`` pattern.

    :param url: candidate URL string.
    :return: ``True`` when the whole string matches, ``False`` otherwise.
        Values that are not strings (for example ``None``) are reported as
        invalid instead of raising ``TypeError``.
    """
    try:
        return regex.match(url) is not None
    except TypeError:
        # regex.match() rejects non-string input; treat it as "not a URL".
        return False
def crawler(SeedUrl):
    """Crawl pages reachable from ``SeedUrl`` and return the URLs visited.

    Each page is downloaded with ``urllib2.urlopen`` and parsed with
    BeautifulSoup; the ``href`` of every ``<a>`` tag that passes
    ``isValidUrl`` is queued for a later visit. A ``Crawled:<url>`` line is
    printed for every page that is fetched.

    :param SeedUrl: absolute URL at which the crawl starts.
    :return: list of crawled URLs in the order they were fetched.

    Errors raised by ``urllib2.urlopen`` are not caught here and end the
    crawl. There is no page limit, so the loop only stops when the work
    list runs empty.
    """
    tocrawl = [SeedUrl]
    crawled = []
    seen = set()  # fast "already fetched" lookup; ``crawled`` keeps order
    while tocrawl:
        page = tocrawl.pop()
        # Check for duplicates before downloading, not after, so a page
        # that is linked many times is fetched only once.
        if page in seen:
            continue
        seen.add(page)
        # Parenthesised form is valid both as a 2.7 print statement and as
        # a print() call.
        print('Crawled:' + page)
        pagesource = urllib2.urlopen(page)
        try:
            s = pagesource.read()
        finally:
            pagesource.close()
        soup = BeautifulSoup.BeautifulSoup(s)
        for l in soup.findAll('a', href=True):
            href = l['href']
            # Do not re-queue pages that were already fetched.
            if href not in seen and isValidUrl(href):
                tocrawl.append(href)
        crawled.append(page)
    return crawled
crawler('http://www.princeton.edu/main/')
[更新] 以下是完整的项目代码。[ANSWER] soup('a') 返回的是完整的 HTML 标签（Tag 对象），而不是 URL 字符串，例如：
<a href="http://itunes.apple.com/us/store">Buy Music Now</a>
因此需要先取出 href 属性，并且还需要验证 URL。请参阅以下代码：
import re
import httplib
import urllib2
from urlparse import urlparse
import BeautifulSoup
# Pattern for absolute http/https/ftp/ftps URLs whose host is a domain
# name, "localhost" or a dotted-quad IP, with an optional port and an
# optional path/query. Matching is case-insensitive.
regex = re.compile(
    r'^(?:http|ftp)s?://'  # http://, https://, ftp:// or ftps://
    r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
    r'localhost|'  # localhost...
    r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})'  # ...or ip
    r'(?::\d+)?'  # optional port
    r'(?:/?|[/?]\S+)$', re.IGNORECASE)


def isValidUrl(url):
    """Return True if ``url`` matches the module-level ``regex`` pattern.

    :param url: candidate URL string.
    :return: ``True`` when the whole string matches, ``False`` otherwise.
        Values that are not strings (for example ``None``) are reported as
        invalid instead of raising ``TypeError``.
    """
    try:
        return regex.match(url) is not None
    except TypeError:
        # regex.match() rejects non-string input; treat it as "not a URL".
        return False
def crawler(SeedUrl):
    """Crawl pages reachable from ``SeedUrl`` and return the URLs visited.

    Each page is downloaded with ``urllib2.urlopen`` and parsed with
    BeautifulSoup; the ``href`` of every ``<a>`` tag that passes
    ``isValidUrl`` is queued for a later visit. A ``Crawled:<url>`` line is
    printed for every page that is fetched.

    :param SeedUrl: absolute URL at which the crawl starts.
    :return: list of crawled URLs in the order they were fetched.

    Errors raised by ``urllib2.urlopen`` are not caught here and end the
    crawl. There is no page limit, so the loop only stops when the work
    list runs empty.
    """
    tocrawl = [SeedUrl]
    crawled = []
    seen = set()  # fast "already fetched" lookup; ``crawled`` keeps order
    while tocrawl:
        page = tocrawl.pop()
        # Check for duplicates before downloading, not after, so a page
        # that is linked many times is fetched only once.
        if page in seen:
            continue
        seen.add(page)
        # Parenthesised form is valid both as a 2.7 print statement and as
        # a print() call.
        print('Crawled:' + page)
        pagesource = urllib2.urlopen(page)
        try:
            s = pagesource.read()
        finally:
            pagesource.close()
        soup = BeautifulSoup.BeautifulSoup(s)
        for l in soup.findAll('a', href=True):
            href = l['href']
            # Do not re-queue pages that were already fetched.
            if href not in seen and isValidUrl(href):
                tocrawl.append(href)
        crawled.append(page)
    return crawled
crawler('http://www.princeton.edu/main/')
你能发布完整的回溯（traceback）吗？这至少能缩小范围，找出是哪一处函数调用作用在了 None 值上。