Python 无法访问派生类中的新方法
标签:python, scrapy, web-crawler

我编写了以下代码,用于抓取网页,然后将其存储在 Solr 索引中:
# Links that have already been handed to the spider; parse() checks
# membership here before yielding a new Request and appends each new link.
crawledLinks = []
# Shared Solr client; add_to_index() uses it to add the processed documents.
solr = pysolr.Solr('some url', timeout=10)
class MySpider(Spider):
    """Follow the links found on each page and push the page text to Solr."""

    name = "tutsplus"
    start_urls = ["some url"]
    allowed_domains = ["some domain"]
    custom_settings = {
        'CONCURRENT_REQUESTS': 100,
        'CONCURRENT_REQUESTS_PER_DOMAIN': 100,
        'DEPTH_LIMIT': 100,
        'LOG_ENABLED': True,
    }

    def parse(self, response):
        """Schedule indexing of the page, then yield follow-up requests.

        Yields a ``Request`` (with this method as callback) for every link
        that resolves to a URL with a scheme, host and path and that is not
        yet in ``crawledLinks``, followed by one ``TutsplusItem`` carrying
        the URL of the current page.
        """
        links = response.xpath('//a/@href').extract()
        current_url = response.url
        # add_to_index is a method of this class, so it has to be reached
        # through self; the bare name is not in scope inside another method.
        # NOTE(review): ensure_future only schedules the coroutine. It runs
        # once an asyncio event loop is running, and the returned task is
        # not kept -- confirm the crawler is started with an asyncio loop.
        asyncio.ensure_future(self.add_to_index(response.body, current_url))
        for link in links:
            # If it is a proper link and is not checked yet, yield it to
            # the Spider.
            internal_link = urljoin(current_url, link)
            result = urlparse(internal_link)
            is_proper_link = result.scheme and result.netloc and result.path
            if is_proper_link and internal_link not in crawledLinks:
                crawledLinks.append(internal_link)
                yield Request(internal_link, self.parse)
        item = TutsplusItem()
        item["url"] = current_url
        yield item

    async def add_to_index(self, body, current_url):
        """Extract the visible text of a page and add it to the Solr index.

        The text is tokenized, English stop words are dropped, and the
        result is POSTed to a processing service. When the service answers
        with status 200 its JSON word list replaces the local text;
        otherwise the locally produced text is indexed as-is.

        Args:
            body: Raw page markup, passed to BeautifulSoup.
            current_url: URL of the page; stored next to the text in Solr.
        """
        soup = BeautifulSoup(body)
        texts = soup.find_all(text=True)
        # Text nodes under these parents, and HTML comments, are skipped.
        hidden_parents = ('style', 'script', 'meta', '[document]')
        visible_texts = [
            text for text in texts
            if text.parent.name not in hidden_parents
            and not isinstance(text, Comment)
        ]
        fetched_text = u" ".join(t.strip() for t in visible_texts)
        words = nltk.word_tokenize(fetched_text)
        stop = set(stopwords.words('english'))
        stopwordsfree_words = [word for word in words if word not in stop]
        detokenizer = MosesDetokenizer()
        doc = detokenizer.detokenize(stopwordsfree_words, return_str=True)
        url = "some url"
        # res stays None when the request fails, so the status check below
        # cannot hit an unbound name.
        res = None
        try:
            # Only the request payload is encoded; doc itself stays text.
            res = requests.post(url, data=doc.encode('utf-8'))
        except requests.RequestException as e:
            self.logger.warning("Text service request failed: %s", e)
        # Fall back to the page title when no text survived filtering;
        # pages without a <title> keep the empty text.
        if not doc and soup.title is not None and soup.title.string:
            doc = soup.title.string
        if res is not None and res.status_code == 200:
            words = json.loads(res.text)
            doc = detokenizer.detokenize(words, return_str=True)
        solr.add([{"doc": doc, "url": str(current_url)}])
我想以“即发即弃”(fire and forget)的方式调用函数 add_to_index()。但我遇到的问题是,在 parse 方法中出现了错误:未定义的名称 'add_to_index'(undefined name 'add_to_index')。也就是说,该函数没有被识别。我是 Python 新手。你能帮我解决这个问题吗?
谢谢
Nilesh.

评论:您是否尝试过像这样调用 add_to_index:self.add_to_index(response.body, current_url)?
评论:我刚才试过。IDE 现在没有显示错误了。谢谢,让我试着运行一下。
评论:可能还需要把 self 作为 add_to_index 函数的第一个参数。
评论:是的,必须加上它。现在可以运行了。谢谢 :)