Web scraping:在 Scrapy 中当 Tor IP 更改时如何同时更改用户代理(User-Agent)
我使用 Tor、Privoxy 以及 TorIpChanger,在随机数目的条目被抓取后更改 IP,目前工作正常。我还想在 IP 更改的同时更换用户代理(User-Agent)。我对实现这一目标的方法有些困惑:我参考过 scrapy-user-agents 等类似方案寻找灵感,但目前还没有太多进展。下面是我基于相关示例所做的尝试。

extensions.py
class TorRenewIdentity(object):
    """Scrapy extension that renews the Tor identity (and picks a new
    user agent) after a randomized number of scraped items.

    Relies on module-level names already present in this file:
    ``signals``, ``logger``, ``random``, ``NotConfigured`` and
    ``ip_changer`` (a configured TorIpChanger instance).
    """

    def __init__(self, crawler, item_count, user_agents):
        self.crawler = crawler
        # Randomize the item count to confound traffic analysis.
        self.item_count = self.randomize(item_count)
        # Remember the configured count for future re-randomizations.
        self._item_count = item_count
        self.items_scraped = 0
        self.user_agents = user_agents
        # Most recently selected user agent (None until the first rotation).
        self.current_user_agent = None
        # Run self.item_scraped each time the spider scrapes an item.
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @staticmethod
    def randomize(item_count, min_factor=0.5, max_factor=1.5):
        '''Return a randomized item count in [min_factor, max_factor] * item_count.

        (A similar jitter technique is applied to Scrapy's DOWNLOAD_DELAY
        setting.)  The result is clamped to at least 1 so the
        identity-change trigger can always fire — ``int(0.5 * 1) == 0``
        would otherwise disable the extension for small counts.
        '''
        low = max(1, int(min_factor * item_count))
        high = max(low, int(max_factor * item_count))
        randomized_item_count = random.randint(low, high)
        logger.info("The crawler will scrape the following (randomized) number of items before changing identity (again): {}".format(randomized_item_count))
        return randomized_item_count

    @classmethod
    def from_crawler(cls, crawler):
        '''Instantiate from crawler settings; disabled unless TOR_RENEW_IDENTITY_ENABLED.'''
        if not crawler.settings.getbool('TOR_RENEW_IDENTITY_ENABLED'):
            raise NotConfigured
        item_count = crawler.settings.getint('TOR_ITEMS_TO_SCRAPE_PER_IDENTITY', 10)
        # NOTE(review): USER_AGENT is expected to be a *list* of agent
        # strings here (see settings.py), not Scrapy's usual
        # single-string setting — confirm no other middleware reads it.
        user_agents = crawler.settings['USER_AGENT']
        return cls(crawler=crawler, item_count=item_count, user_agents=user_agents)

    def item_scraped(self, item, spider):
        '''After item_count items: pause the engine, change the Tor IP and rotate the user agent.'''
        self.items_scraped += 1
        # '>=' (was '==') so the trigger cannot be skipped if the counter
        # ever overshoots a freshly re-randomized, smaller threshold.
        if self.items_scraped >= self.item_count:
            logger.info("Scraped {item_count} items. Pausing engine while changing identity...".format(item_count=self.item_count))
            self.crawler.engine.pause()
            # Change IP address with toripchanger
            # https://github.com/DusanMadar/TorIpChanger
            ip_changer.get_new_ip()
            self.items_scraped = 0  # Reset the counter
            # New random threshold before the next identity change.
            self.item_count = self.randomize(self._item_count)
            if self.user_agents:
                self.current_user_agent = random.choice(self.user_agents)
                logger.info('Load {} user_agents from settings. New user agent is {}.'.format(
                    len(self.user_agents), self.current_user_agent))
                # Settings are frozen once the crawl starts, so the
                # rotation must happen at the request level: expose the
                # choice on the spider and have a downloader middleware
                # set it per request, e.g.
                #   request.headers['User-Agent'] = spider.current_user_agent
                spider.current_user_agent = self.current_user_agent
            self.crawler.engine.unpause()
# Pool of user agents the TorRenewIdentity extension samples from
# (read via crawler.settings['USER_AGENT']).
# NOTE(review): this is a list, whereas Scrapy's built-in USER_AGENT
# setting is a single string — confirm nothing else consumes this key.
USER_AGENT = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0'
]
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapydevua.middlewares.ScrapydevuaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapydevua.middlewares.ScrapydevuaDownloaderMiddleware': 543,
#}
# Register the identity-rotation extension (priority value 1).
EXTENSIONS = {
'scrapydevua.extensions.TorRenewIdentity': 1,
}
settings.py
class TorRenewIdentity(object):
    """Extension that pauses the crawl every (jittered) N scraped items,
    renews the Tor exit IP and picks a candidate user agent.

    Uses the module-level ``signals``, ``logger``, ``random``,
    ``NotConfigured`` and ``ip_changer`` objects from this file.
    """

    def __init__(self, crawler, item_count, user_agents):
        self.crawler = crawler
        # Jitter the threshold so identity changes are hard to predict.
        self.item_count = self.randomize(item_count)
        self._item_count = item_count  # base value for later re-jittering
        self.items_scraped = 0
        self.user_agents = user_agents
        # Hook the per-item signal so scraped items can be counted.
        self.crawler.signals.connect(self.item_scraped, signal=signals.item_scraped)

    @staticmethod
    def randomize(item_count, min_factor=0.5, max_factor=1.5):
        """Pick a random threshold in [min_factor, max_factor] * item_count.

        (Mirrors the jitter Scrapy applies to DOWNLOAD_DELAY.)
        """
        lower = int(min_factor * item_count)
        upper = int(max_factor * item_count)
        threshold = random.randint(lower, upper)
        logger.info("The crawler will scrape the following (randomized) number of items before changing identity (again): {}".format(threshold))
        return threshold

    @classmethod
    def from_crawler(cls, crawler):
        """Build the extension from settings; inactive unless TOR_RENEW_IDENTITY_ENABLED."""
        settings = crawler.settings
        if not settings.getbool('TOR_RENEW_IDENTITY_ENABLED'):
            raise NotConfigured
        return cls(
            crawler=crawler,
            item_count=settings.getint('TOR_ITEMS_TO_SCRAPE_PER_IDENTITY', 10),
            user_agents=settings['USER_AGENT'],
        )

    def item_scraped(self, item, spider):
        """Count items; on reaching the threshold, pause, renew the Tor IP,
        choose a new user agent and resume."""
        self.items_scraped += 1
        if self.items_scraped != self.item_count:
            return  # threshold not reached yet
        logger.info("Scraped {item_count} items. Pausing engine while changing identity...".format(item_count=self.item_count))
        self.crawler.engine.pause()
        # Renew the Tor circuit via TorIpChanger
        # (https://github.com/DusanMadar/TorIpChanger).
        ip_changer.get_new_ip()
        self.items_scraped = 0  # start counting from scratch
        self.item_count = self.randomize(self._item_count)  # re-jitter
        # Get new user agent from list
        if self.user_agents:
            new_user_agent = random.choice(self.user_agents)
            logger.info('Load {} user_agents from settings. New user agent is {}.'.format(
                len(self.user_agents) if self.user_agents else 0, new_user_agent))
            # Change user agent here ?
            # For next self.item_count items
            # headers.setdefault('User-Agent', new_user_agent)
        self.crawler.engine.unpause()
# User-agent pool consumed by TorRenewIdentity
# (fetched through crawler.settings['USER_AGENT']).
# NOTE(review): a list, unlike Scrapy's stock single-string USER_AGENT
# setting — verify no built-in middleware also reads this key.
USER_AGENT = [
'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.36 Safari/535.7',
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.12; rv:56.0) Gecko/20100101 Firefox/56.0'
]
# Enable or disable spider middlewares
# See https://docs.scrapy.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
# 'scrapydevua.middlewares.ScrapydevuaSpiderMiddleware': 543,
#}
# Enable or disable downloader middlewares
# See https://docs.scrapy.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
# 'scrapydevua.middlewares.ScrapydevuaDownloaderMiddleware': 543,
#}
# Enable the identity-rotation extension (priority value 1).
EXTENSIONS = {
'scrapydevua.extensions.TorRenewIdentity': 1,
}