Python 我正在删除rightmove scraper网站,但在代码中出现此错误
这是我的代码:Python 我正在删除rightmove scraper网站,但在代码中出现此错误,python,scrapy,Python,Scrapy,这是我的代码: #packages import scrapy from scrapy.crawler import CrawlerProcess from scrapy.selector import Selector import json import urllib import datetime import re #sold houses class class soldhouses(scrapy.Spider): #scraper name name = '
#packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import json
import urllib
import datetime
import re
#sold houses class
class soldhouses(scrapy.Spider):
#scraper name
name = 'rightmove2'
#base url
base_url = 'https://www.rightmove.co.uk/house-prices/CR0.html?'
#page index
page_index = 0
#string query parameters
params = {
'country': 'england',
'locationIdentifier': '',
'searchLocation': '',
'referrer' : 'landingPage'
}
#headers
headers = {
'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
}
#custom setting
#custom_settings = {
#uncomment to set accordingly
#"CONCURRENT_REQUESTS_PER_DOMAIN":2,
#"DOWNLOAD_TIMEOUT":1 #250 ms of delay
# }
#on create
def __init__(self):
#inti postcodes
postcodes = ''
# open "postcodes.json"
with open('postcodes.json', 'r') as json_file:
for line in json_file.read():
postcodes += line
#parse postcodes
self.postcodes = json.loads(postcodes)
#crawler's entry point
def start_requests(self):
#init filename
filename = './output/Sold_houses_'+datetime.datetime.today().strftime('%Y-%m-%d-%H-%M')+'.jsonl'
#postcodes count
count = 0
#init string query parameters
for item in self.postcodes:
self.page_index = 0
self.params['locationIdendifier'] = item['locationId']
self.params['searchLocation'] = item['postcode']
url = self.base_url +urllib.parse.urlencode(self.params)
#print(url)
yield scrapy.Request(url=url,headers=self.headers,meta={'postcode':item['postcode'],'filename':filename, 'count':count}, callback = self.parse_links)
count +=1
break
def parse_links(self, res):
# response = Selector(res= text)
# with open('res.html', 'w' ) as f:
# f.write(res.text)
postcode = res.meta.get('postcode')
filename = res.meta.get('filename')
count = res.meta.get('count')
# #extract links
# for card in res.css('div.propertyCard-content')
# #debug selectors
# content = ''
# with open('res2.html', 'r' ) as f:
# for line in f.read():
# content += line
# res = Selector(text=content)
#extract basic features
properties = ''.join([script for script in res.css("script::text").getall() if 'window.__PRELOADED_STATE__ = {"results":' in script])
properties = json.loads(properties.split('window.__PRELOADED_STATE__ = ')[-1])
properties = properties['results']['properties']
for property in properties:
if property['detailUrl']:
prop_id = property['detailUrl'].split('?prop=')[-1].split('&')[0]
links = 'https://www.rightmove.co.uk/property-for-sale/property-'+ prop_id +'.html'
yield res.follow(url = property['detailUrl'],headers = self.headers, meta = {'property':properties,'postcode': postcode, 'filename':filename, 'count':count}, callback = self.parse_listing)
break
def parse_listing(self,res):
# content = ''
# with open('res.html', 'r' ) as f:
# for line in f.read():
# content += line
# res = Selector(text=content)
#extract features
features = {
'id' : ''.join(re.findall('\d+',res.url.split('/')[-1])),
'url' : res.url,
'postcode' : res.meta.get('postcode'),
'title' : res.css('div.lef')
.css('h1.fs-22::text')
.get(),
'address' : res.css('div.cell')
.css('address')
.css('meta::attr(content)')
.get(),
'price' : res.css('div.cell')
.css('p.property-header-price')
.css('strong::text')
.get(),
'agent_link' : res.css('div.agent-details-display')
.css('a::attr(href)')
.get(),
'agent_name' : res.css('div.agent-details-display')
.css('strong::text')
.get(),
'agent_address' : res.css('div.agent-details-display')
.css('address::text')
.get(),
'agent_phone' : res.css('div.request-property-details')
.css('strong::text')
.get(),
'image_urls' : res.css('div.no-js-hidden')
.css('img::attr(src)')
.getall(),
'floor_area' : res.css('div.sect')
.css('ul.list-style-square')
.css('li::text')
.get(),
'key_features' : res.css('div.key-features')
.css('ul.list-style-square')
.css('li::text')
.getall(),
'full_descriptions' : ''.join([feature.replace('\r').replace('\n').strip() for feature in
res.css('p[itemprop = "description"]::text')
.getall() if feature != ''
])
}
#extract price description
# price_descr = res.css('div.cell').css('p.property-header-price').css('small::text').get(),
#add text value to the price if available
#features['price'] = price_descr.replace('\n').replace('\r').replace('\t')
filename = res.meta.get('filename')
with open(filename, 'a') as json_file:
json_file.write(json.dumps(features,indent = 2))
#main driver
if __name__ == '__main__':
#run scrapers
process = CrawlerProcess()
process.crawl(soldhouses)
process.start()
#soldhouses.parse_links(soldhouses,'')
输出为以下错误:
Traceback (most recent call last):
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/utils/defer.py", line 55, in mustbe_deferred
result = f(*args, **kw)
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 60, in process_spider_input
return scrape_func(response, request, spider)
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/core/scraper.py", line 148, in call_spider
warn_on_generator_with_return_value(spider, callback)
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/utils/misc.py", line 202, in warn_on_generator_with_return_value
if is_generator_with_return_value(callable):
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/utils/misc.py", line 187, in is_generator_with_return_value
tree = ast.parse(dedent(inspect.getsource(callable)))
File "/home/danish-khan/miniconda3/lib/python3.7/ast.py", line 35, in parse
return compile(source, filename, mode, PyCF_ONLY_AST)
File "<unknown>", line 1
def parse_links(self, res):
^
IndentationError: unexpected indent
回溯(最近一次呼叫最后一次):
文件“/home/danish khan/miniconda3/lib/python3.7/site packages/scrapy/utils/defer.py”,第55行,必须延迟
结果=f(*参数,**kw)
文件“/home/danish khan/miniconda3/lib/python3.7/site packages/scrapy/core/spidermw.py”,第60行,进程中输入
返回scrape_func(响应、请求、spider)
文件“/home/danish khan/miniconda3/lib/python3.7/site packages/scrapy/core/scraper.py”,第148行,在call_spider中
使用返回值警告\u生成器上的\u(爬行器,回调)
文件“/home/danish khan/miniconda3/lib/python3.7/site packages/scrapy/utils/misc.py”,第202行,带返回值
如果是带有返回值的生成器(可调用):
文件“/home/danish khan/miniconda3/lib/python3.7/site packages/scrapy/utils/misc.py”,第187行,带返回值
tree=ast.parse(dedent(inspect.getsource(可调用)))
文件“/home/danish khan/miniconda3/lib/python3.7/ast.py”,第35行,解析
返回编译(源、文件名、模式、仅PyCF\u AST)
文件“”,第1行
def解析_链接(self,res):
^
缩进错误:意外缩进
您最有可能需要检查代码缩进,确保正确计算空格,并且不要将空格与制表符混在一起。使用像PyCharm或Visual Studio代码这样的IDE可能会更容易避免这些错误。很可能需要检查代码缩进,确保正确计算空格,并且不要将空格与选项卡混合。使用像PyCharm或visualstudio这样的IDE代码可能会更容易避免这些错误。