Python Scrapy not following internal links when crawling a website
I am trying to follow all internal links of a website while recording both its internal and external links. I have just started working with Scrapy and I cannot figure out how to crawl while following all of a site's internal links: it only fetches the links at depth 1 but does not follow them.
import os

from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor

from brs.items import BrsItem


class BRS(CrawlSpider):
    name = "brs"
    rules = (Rule(SgmlLinkExtractor(allow=()), callback='parse_obj', follow=True),)

    def __init__(self):
        global start_urls
        #settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "rt") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls

    def parse(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url
        internal = LinkExtractor(allow_domains=[response.url])
        external = LinkExtractor(deny_domains=[response.url])
        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt', 'a+')
        for link in links:
            internal.append(link.url)
        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)
        for link in internal:
            fd.write(link + "\tinternal\n")
        for link in external:
            fd.write(link + "\texternal\n")
        return brsitem
My urls.txt currently contains the following:
http://www.stackoverflow.com
Any help is greatly appreciated. I got it working using this reference, and stackoverflow blocked my IP when I forgot to set the DEPTH_LIMIT parameter. Some things are learned the hard way.
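As a side note, DEPTH_LIMIT is a standard Scrapy setting whose default of 0 means no depth limit at all. Besides the settings.overrides call used in the spider below, it can also be set project-wide; a minimal settings.py sketch, with 10 as an arbitrary example value:

# settings.py -- cap how many levels deep the crawl may follow links.
# The default of 0 imposes no limit, which is how a crawl runs away.
DEPTH_LIMIT = 10

The working spider: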
import os

import scrapy
from scrapy.conf import settings
from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
from scrapy.contrib.spiders import Rule, CrawlSpider
from scrapy.linkextractors import LinkExtractor
from urlparse import urlparse

from brs.items import BrsItem
class BRS(CrawlSpider):
    name = "brs"

    def __init__(self):
        global start_urls, rules
        settings.overrides['DEPTH_LIMIT'] = 10
        path = os.path.dirname(os.path.abspath(__file__))
        with open(os.path.join(path, "urls.txt"), "r+") as f:
            self.start_urls = filter(None, [url.strip() for url in f.readlines()])
        start_urls = self.start_urls
        # The callback is parse_items, not parse: CrawlSpider uses parse()
        # internally to apply its rules, so it must not be overridden.
        self.rules = (Rule(SgmlLinkExtractor(allow=()), callback=self.parse_items, follow=True),)
        rules = self.rules
        self._rules = rules

    def extract_domain(self, url):
        # Compare links by domain (netloc), not by full URL.
        return urlparse(url).netloc

    def parse_items(self, response):
        internal = LinkExtractor(allow_domains=[self.extract_domain(response.url)])
        external = LinkExtractor(deny_domains=[self.extract_domain(response.url)])
        links = internal.extract_links(response)
        internal = []
        fd = open('output.txt', 'a+')
        for link in links:
            internal.append(link.url)
        for link in internal:
            fd.write(link + "\tinternal\n")
        links = external.extract_links(response)
        external = []
        for link in links:
            external.append(link.url)
        for link in external:
            fd.write(link + "\texternal\n")
        fd.close()
        # Schedule every internal link so the crawl actually follows them.
        for link in internal:
            yield scrapy.Request(link.strip(), callback=self.parse_attr)

    def parse_attr(self, response):
        brsitem = BrsItem()
        brsitem['url'] = response.url.strip()
        return brsitem
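For reference, the root cause of the original problem is that CrawlSpider implements its rule-following logic inside parse(), so overriding parse() silently disables link following; the fix above routes the callback to parse_items instead. In current Scrapy versions (1.0+) the same spider can be written without the __init__ workarounds; a minimal sketch, where the spider name, callback name, and start URL are illustrative:

import scrapy
from scrapy.spiders import CrawlSpider, Rule
from scrapy.linkextractors import LinkExtractor

class LinkSpider(CrawlSpider):
    name = "links"
    start_urls = ["http://www.stackoverflow.com"]
    # Per-spider settings; caps the crawl so it cannot run without bound.
    custom_settings = {"DEPTH_LIMIT": 10}
    # rules is a plain class attribute; the callback must not be named
    # "parse", because CrawlSpider uses parse() internally for the rules.
    rules = (
        Rule(LinkExtractor(), callback="parse_item", follow=True),
    )

    def parse_item(self, response):
        yield {"url": response.url}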