Python: How do I implement a web crawler that scrapes ad links?
To gather training data, I wrote a crawler that follows the Alexa top 500 sites to a depth of 2 and writes every link it finds to a file. Right now it finds all the links in the HTML and writes them to a file. The problem is that the crawler misses all the links that point to ads, some of which sit in iframes or in CSS files. How can I change my web crawler so that it scrapes all links, including ads? The relevant code is below.

# Imports inferred from the code below; the original post omitted them.
import re
import sys
import urlparse
import urllib2
from cgi import escape
from traceback import format_exc
from Queue import Queue, Empty as QueueEmpty
from BeautifulSoup import BeautifulSoup

AGENT = "Mozilla/5.0 (compatible; linkcrawler)"  # assumed value; never shown in the post

class Crawler(object):
    def __init__(self, root, depth, locked=True):
        self.root = root
        self.depth = depth
        self.locked = locked  # if True, stay on the root's host
        self.host = urlparse.urlparse(root)[1]  # netloc of the start URL
        self.urls = []
        self.links = 0
        self.followed = 0
    def crawl(self):
        page = Fetcher(self.root)
        q = Queue()
        try:
            page.fetch()
            if page.urls == []:
                print "Error: could not fetch urls for %s" % (self.root)
                return
            else:
                target = open("output.txt", 'w')
                for url in page.urls:
                    q.put(url)
                    target.write((url + '\n').encode('utf-8'))
                followed = [self.root]
                target.close()
        except Exception as e:
            print "Error: could not fetch urls (%s)" % e
            raise KeyboardInterrupt
        n = 0
        while True:
            try:
                # get_nowait() raises QueueEmpty when the queue is drained;
                # a plain q.get() would block forever on an empty queue.
                url = q.get_nowait()
            except QueueEmpty:
                break
            n += 1
            if url not in followed:
                try:
                    host = urlparse.urlparse(url)[1]
                    if self.locked and re.match(".*%s" % self.host, host):
                        followed.append(url)
                        self.followed += 1
                        page = Fetcher(url)
                        page.fetch()
                        for i, url in enumerate(page):
                            if url not in self.urls:
                                self.links += 1
                                q.put(url)
                                self.urls.append(url)
                                # append mode: 'w' would truncate the file on every link
                                with open("data.out", 'a') as f:
                                    f.write(url + '\n')
                        # note: n counts dequeued URLs, not true link depth
                        if n > self.depth and self.depth > 0:
                            break
                except Exception, e:
                    print "ERROR: Can't process url '%s' (%s)" % (url, e)
                    print format_exc()
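Because `n` counts URLs taken off the queue rather than their distance from the root, the `n > self.depth` test cuts off after a number of pages, not at a real link depth. A minimal sketch of depth-aware queuing, assuming the rest of the class is unchanged (the `(url, depth)` tuple layout and the `seen` set are my own additions, not from the original post):

    # Hedged sketch: queue (url, depth) pairs so the cutoff is an actual link depth.
    def crawl_by_depth(self):
        q = Queue()
        q.put((self.root, 0))
        seen = set([self.root])
        while True:
            try:
                url, depth = q.get_nowait()
            except QueueEmpty:
                break
            if depth >= self.depth:  # don't expand pages at the depth limit
                continue
            page = Fetcher(url)
            page.fetch()
            for link in page:  # Fetcher supports iteration via __getitem__
                if link not in seen:
                    seen.add(link)
                    self.urls.append(link)
                    q.put((link, depth + 1))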
class Fetcher(object):
    def __init__(self, url):
        self.url = url
        self.urls = []

    def __getitem__(self, x):
        return self.urls[x]

    def _addHeaders(self, request):
        request.add_header("User-Agent", AGENT)

    def open(self):
        url = self.url
        try:
            request = urllib2.Request(url)
            handle = urllib2.build_opener()
        except IOError:
            return None
        return (request, handle)
    def fetch(self):
        # open() returns None on IOError; unpacking None would raise TypeError
        result = self.open()
        if result is None:
            return
        request, handle = result
        self._addHeaders(request)
        if handle:
            try:
                content = unicode(handle.open(request).read(), "utf-8",
                                  errors="replace")
                soup = BeautifulSoup(content)
                # only anchor tags are collected, which is why links in
                # iframes, scripts, and CSS never show up
                tags = soup('a')
            except urllib2.HTTPError, error:
                if error.code == 404:
                    print >> sys.stderr, "ERROR: %s -> %s" % (error, error.url)
                else:
                    print >> sys.stderr, "ERROR: %s" % error
                tags = []
            except urllib2.URLError, error:
                print >> sys.stderr, "ERROR: %s" % error
                tags = []
            for tag in tags:
                href = tag.get("href")
                if href is not None:
                    url = urlparse.urljoin(self.url, escape(href))
                    if url not in self:
                        self.urls.append(url)
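Since fetch() only collects `<a>` tags, URLs that live in iframe `src` attributes, `script`/`img` sources, or `url(...)` references inside CSS never reach `self.urls`, and that is exactly where many ad links sit. A minimal sketch of a broader extractor (the tag/attribute list and the `url(...)` regex are my own choices, not from the original post):

    # Hedged sketch: harvest link-carrying attributes beyond <a href>,
    # plus url(...) references in any CSS text passed in.
    CSS_URL_RE = re.compile(r'url\(\s*[\'"]?([^\'")]+)[\'"]?\s*\)')

    def extract_all_links(base_url, soup, raw_css=""):
        links = []
        # tag -> attribute pairs that commonly carry URLs (my selection)
        for tag_name, attr in (("a", "href"), ("iframe", "src"),
                               ("frame", "src"), ("script", "src"),
                               ("img", "src"), ("link", "href")):
            for tag in soup(tag_name):
                value = tag.get(attr)
                if value:
                    links.append(urlparse.urljoin(base_url, value))
        # background images, tracking pixels, etc. referenced from CSS
        for ref in CSS_URL_RE.findall(raw_css):
            links.append(urlparse.urljoin(base_url, ref))
        return links

Downloading each stylesheet referenced by a `<link rel="stylesheet">` tag and passing its text as `raw_css` would catch the CSS-hosted ad URLs the question mentions; iframe contents would still need a second fetch of the iframe `src` itself.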
def getLinks(url):
    page = Fetcher(url)
    page.fetch()
    for i, url in enumerate(page):
        print "%d. %s" % (i, url)
def main():
    depth = 2
    file_in = []
    reload(sys)
    sys.setdefaultencoding('utf-8')
    filename = "stuff.txt"
    text = open(filename)
    for line in text:
        file_in.append(line.rstrip())
    for i in file_in:
        print "Crawling %s (Max Depth: %d)" % (i, depth)
        crawler = Crawler(i, depth)
        crawler.crawl()
        print "\n".join(crawler.urls)
Many ads are delivered by asynchronous JavaScript that runs on the page. If you only scrape the server's initial output, you won't get those links. One approach is to use a headless browser (such as PhantomJS) to render the HTML to a file and then run your script over that file. There are other possibilities as well.

You tagged [scrapy], but you aren't actually using it anywhere.

Questions seeking debugging help ("why isn't this code working?") must include the desired behavior, a specific problem or error, and the shortest code necessary to reproduce it, in the question itself. Questions without a clear problem statement are not useful to other readers.
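A minimal sketch of the headless-browser route suggested above, driving PhantomJS through Selenium (the wait time is a crude assumption, and both the selenium package and a phantomjs binary on PATH are prerequisites; none of this appears in the original post):

    # Hedged sketch: render the page so JS-injected ad links exist in the
    # DOM before the link extraction runs.
    import time
    from selenium import webdriver

    def fetch_rendered_html(url, wait_seconds=3):
        driver = webdriver.PhantomJS()    # assumes phantomjs is on PATH
        try:
            driver.get(url)
            time.sleep(wait_seconds)      # crude wait for async ad scripts to fire
            return driver.page_source     # DOM after JavaScript has run
        finally:
            driver.quit()

The returned HTML can then be handed to the existing BeautifulSoup parsing in Fetcher.fetch() in place of the raw urllib2 response.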