Python: I am scraping the Rightmove website, but this error appears in my code


Here is my code:

#packages
import scrapy
from scrapy.crawler import CrawlerProcess
from scrapy.selector import Selector
import json
import urllib
import datetime
import re

#sold houses class
class soldhouses(scrapy.Spider):
      #scraper name
      name = 'rightmove2'
      #base url 
      base_url = 'https://www.rightmove.co.uk/house-prices/CR0.html?' 
      
      #page index
      page_index = 0
      
      #string query parameters
      params = {
         'country': 'england',
         'locationIdentifier': '',
         'searchLocation': '',
         'referrer' : 'landingPage'
           }
      
      #headers
      headers = {
             'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/87.0.4280.66 Safari/537.36'
        }

       #custom setting
       #custom_settings = {
             #uncomment to set accordingly
             #"CONCURRENT_REQUESTS_PER_DOMAIN":2,
             #"DOWNLOAD_TIMEOUT":1   #250 ms  of delay
        #    }     
         
      #on create 
      def __init__(self):

          #inti postcodes
          postcodes = ''
          
          # open "postcodes.json" 
          with open('postcodes.json', 'r') as json_file:
               for line in json_file.read():
                    postcodes += line
          #parse postcodes
          self.postcodes = json.loads(postcodes)

     
        

      #crawler's entry point
      def start_requests(self):
           #init filename
           filename = './output/Sold_houses_'+datetime.datetime.today().strftime('%Y-%m-%d-%H-%M')+'.jsonl'
           #postcodes count
           count = 0
           #init string query parameters
           for item in self.postcodes:
                self.page_index = 0
                self.params['locationIdendifier'] = item['locationId']
                self.params['searchLocation'] = item['postcode']
                url = self.base_url +urllib.parse.urlencode(self.params) 
                #print(url)
                yield scrapy.Request(url=url,headers=self.headers,meta={'postcode':item['postcode'],'filename':filename, 'count':count}, callback = self.parse_links)
                count +=1
                break


      def parse_links(self, res):

#         response = Selector(res= text)
#          with open('res.html', 'w' ) as f:
#                   f.write(res.text)
      
          postcode = res.meta.get('postcode')
          filename = res.meta.get('filename')
          count = res.meta.get('count')
#         #extract links
#         for card in res.css('div.propertyCard-content')
#         #debug selectors
          
#          content = ''
#          with open('res2.html', 'r' ) as f:
#              for line in f.read():
#                 content += line
#          res = Selector(text=content) 
          
         #extract basic features
          properties = ''.join([script for script in res.css("script::text").getall() if 'window.__PRELOADED_STATE__ = {"results":' in script]) 
          properties = json.loads(properties.split('window.__PRELOADED_STATE__ = ')[-1])
          properties = properties['results']['properties'] 
          
          for property in properties:
            if property['detailUrl']: 
               prop_id = property['detailUrl'].split('?prop=')[-1].split('&')[0]
               links = 'https://www.rightmove.co.uk/property-for-sale/property-'+ prop_id +'.html'
             
               yield res.follow(url = property['detailUrl'],headers = self.headers, meta = {'property':properties,'postcode': postcode, 'filename':filename, 'count':count}, callback = self.parse_listing)
               break
      def parse_listing(self,res):
         
#          content = ''
#          with open('res.html', 'r' ) as f:
#              for line in f.read():
#                 content += line
#          res = Selector(text=content)    
          #extract features
          features = {
                   'id' : ''.join(re.findall('\d+',res.url.split('/')[-1])),
                   'url' : res.url,
                   'postcode' : res.meta.get('postcode'),
                   'title' : res.css('div.lef')
                                 .css('h1.fs-22::text')
                                  .get(),

                   'address' : res.css('div.cell')
                                   .css('address')
                                   .css('meta::attr(content)')
                                   .get(),
                           
                   'price' :   res.css('div.cell')
                                  .css('p.property-header-price')
                                  .css('strong::text')
                                  .get(),

                   'agent_link' : res.css('div.agent-details-display')
                                     .css('a::attr(href)')
                                     .get(),

                   'agent_name' : res.css('div.agent-details-display')
                                     .css('strong::text')
                                     .get(),

                   'agent_address' : res.css('div.agent-details-display')
                                     .css('address::text')
                                     .get(),

                   'agent_phone' : res.css('div.request-property-details')
                                      .css('strong::text')
                                      .get(),

                   'image_urls' : res.css('div.no-js-hidden')
                                     .css('img::attr(src)')
                                     .getall(),

                   'floor_area' : res.css('div.sect')
                                     .css('ul.list-style-square')
                                     .css('li::text')
                                     .get(),
                                
                   'key_features' : res.css('div.key-features')
                                       .css('ul.list-style-square')
                                       .css('li::text')
                                       .getall(),

                   'full_descriptions' : ''.join([feature.replace('\r').replace('\n').strip() for feature in 
                                            res.css('p[itemprop = "description"]::text')
                                            .getall() if feature != ''
                                                 ])
                     } 
           #extract price description
          # price_descr = res.css('div.cell').css('p.property-header-price').css('small::text').get(),
           #add text value to the price if available
           #features['price'] = price_descr.replace('\n').replace('\r').replace('\t')


          filename = res.meta.get('filename')
          with open(filename, 'a') as json_file:
                 json_file.write(json.dumps(features,indent = 2))

#main driver
if __name__ == '__main__':
    #run scrapers
    process = CrawlerProcess()
    process.crawl(soldhouses)
    process.start()   

     #soldhouses.parse_links(soldhouses,'')
The output is the following error:

Traceback (most recent call last):
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/utils/defer.py", line 55, in mustbe_deferred
    result = f(*args, **kw)
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/core/spidermw.py", line 60, in process_spider_input
    return scrape_func(response, request, spider)
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/core/scraper.py", line 148, in call_spider
    warn_on_generator_with_return_value(spider, callback)
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/utils/misc.py", line 202, in warn_on_generator_with_return_value
    if is_generator_with_return_value(callable):
File "/home/danish-khan/miniconda3/lib/python3.7/site-packages/scrapy/utils/misc.py", line 187, in is_generator_with_return_value
    tree = ast.parse(dedent(inspect.getsource(callable)))
File "/home/danish-khan/miniconda3/lib/python3.7/ast.py", line 35, in parse
    return compile(source, filename, mode, PyCF_ONLY_AST)
File "<unknown>", line 1
    def parse_links(self, res):
    ^
IndentationError: unexpected indent

You most likely need to check your code's indentation: make sure the spaces add up consistently from line to line and that you don't mix spaces with tabs. The traceback shows Scrapy reading the source of your parse_links callback with inspect.getsource and parsing it via ast.parse(dedent(...)); dedent can only strip whitespace that is common to every line, so when lines inside the method sit at a different depth than the def itself (for example the commented-out debug lines that start at the left margin), nothing is stripped and the parser reports an unexpected indent. Using an IDE such as PyCharm or Visual Studio Code makes these errors much easier to avoid.
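
As a minimal sketch of the same class laid out with one consistent indentation unit (four spaces per level here). The trimmed method bodies and the json.load call are simplifications for illustration, not a drop-in replacement for your full spider:

#packages
import scrapy
from scrapy.crawler import CrawlerProcess
import json

#sold houses class
class soldhouses(scrapy.Spider):
    #every class-level attribute sits at exactly one level (4 spaces)
    name = 'rightmove2'
    base_url = 'https://www.rightmove.co.uk/house-prices/CR0.html?'

    #on create
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        #method bodies sit one level deeper (8 spaces), spaces only, never tabs
        #json.load reads the whole file in one call (simpler than the line-by-line read)
        with open('postcodes.json', 'r') as json_file:
            self.postcodes = json.load(json_file)

    def parse_links(self, res):
        #keep commented-out debug lines at the same depth as the surrounding body,
        #not at the left margin, so the method's source still dedents and parses cleanly
        pass

#main driver
if __name__ == '__main__':
    process = CrawlerProcess()
    process.crawl(soldhouses)
    process.start()

Once every line inside a method shares the body's indentation, the ast.parse(dedent(inspect.getsource(...))) check shown in your traceback can parse the callback without raising IndentationError.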