Python: Scrapy item after multiple callbacks
I'm new to Scrapy and I'm having problems with it. I don't think I've fully grasped how to write code in a non-sequential (asynchronous) way yet. Here is what the code is trying to achieve:

1. Visit the state parks URL and get all of the individual park URLs. Create a Scrapy item for each park (or some other way of storing the data) so that each park is yielded as a separate item.
2. Visit each park URL and scrape the park's content based on keyword matches (the append_data function). Add the data to that park's Scrapy item.
3. On the park page, find links that match certain keywords (the get_matching_links function). Visit each such link and scrape it, again based on keywords. Add the data to that park's Scrapy item.
4. Yield that Scrapy item.
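As I understand it, the usual Scrapy pattern for this is to create one item per park and carry it through the callback chain via Request.meta. A minimal sketch of that pattern (the spider, selectors and field names here are made up for illustration, not my real code):

import scrapy

class SketchSpider(scrapy.Spider):
    name = 'sketch'
    start_urls = ['https://www.alapark.com/parks']

    def parse(self, response):
        # One fresh item per park, handed to the next callback via meta
        for href in response.css('a::attr(href)').getall():
            item = {'url': response.urljoin(href), 'content': []}
            yield response.follow(href, callback=self.parse_detail,
                                  meta={'item': item})

    def parse_detail(self, response):
        # Retrieve the same item, enrich it, and finally yield it
        item = response.meta['item']
        item['content'].append(response.css('title::text').get())
        yield item

Here is my actual attempt: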
import logging
import re

import scrapy as sc
from bs4 import BeautifulSoup as bs
from scrapy import Field, Item
from tqdm import tqdm

# cols_data (an index of column names), append_data and get_matching_links
# are helpers defined elsewhere in my project.

logging.basicConfig(level=logging.WARNING)

def generate_item(fields):
    # Build an Item with one list-valued field per column name
    item = Item()
    for f in fields:
        item.fields[f] = Field()
        item[f] = []
    return item

Parkdata = generate_item(cols_data.index)

class parkcrawler(sc.Spider):
    name = 'parkcrawler'

    def __init__(self):
        self.park_cols = {k: [] for k in cols_data.index}
        self.base_url = 'https://www.alapark.com'

    def start_requests(self):
        url = 'https://www.alapark.com/parks'
        yield sc.Request(url, callback = self.parse)

    def parse(self, r):
        # Collect the individual park URLs from the listing page
        soup = bs(r.text, 'lxml')
        for s1 in tqdm(soup.findAll('div', class_ = 'col col-xs-12 col-sm-6 col-md-6 col-lg-4')[:2]):
            try:
                for div in s1.findAll(class_ = 'field-content parks-list-title'):
                    #self.name = div.find(href=True).text
                    Parkdata['Park Image or Video'] = self.base_url + s1.find('img', src = True)['src']
                    Parkdata['Protected Area/Park Name'] = div.find(href=True).text
                    Parkdata['Website'] = self.base_url + div.find(href=True)['href']
                    yield r.follow(div.find(href=True)['href'], callback = self.ppagecontent, dont_filter = True, meta = {'item': Parkdata})
                    yield r.follow(div.find(href=True)['href'], callback = self.pparklinkcontent, dont_filter = True, meta = {'item': Parkdata})
            except AttributeError:
                print(div)

    def ppagecontent(self, r):
        # Scrape the park page itself: phone, hours, description, links, images
        soup = bs(r.text, 'lxml')
        Parkdata = r.meta['item']
        aside = soup.find('aside')
        content = soup.find(class_ = 'columns')
        phone = aside.find('h4', text = re.compile("phone", re.IGNORECASE))
        for item in phone.find_next_siblings():
            if item.name == 'h4':
                break
            Parkdata['Telephone Number - Information'].append(item.text)
        hours = aside.find('h4', text = re.compile("hours", re.IGNORECASE))
        for item in hours.find_next_siblings():
            if item.name == 'h4':
                break
            Parkdata['Hours of Operation Detail'].append(item.text)
        for c in content.findAll('p'):
            append_data(c.text, Parkdata, 'searchcontent')
            Parkdata['Description'].append(c.text)
        for l in content.findAll(href = True):
            if not l['href'].startswith('http'):
                append_data(self.base_url + l['href'], Parkdata, 'searchlink')
            else:
                append_data(l['href'], Parkdata, 'searchlink')
        for img in content.findAll('img', src = True):
            if not img['src'].startswith('http'):
                append_data(self.base_url + img['src'], Parkdata, 'searchlink')
            else:
                append_data(img['src'], Parkdata, 'searchlink')
        yield Parkdata

    def pparklinkcontent(self, r):
        # Find sidebar links that match keywords and follow them
        soup = bs(r.text, 'lxml')
        Parkdata = r.meta['item']
        content = soup.find('div', class_ = "column is-4 content-sidebar")
        all_links = [link['href'] for link in content.findAll(href = True) if link['href'].startswith('/parks/')]
        matching_links = [get_matching_links(link, Parkdata) for link in all_links]
        for i, d in enumerate(matching_links):
            if d != {}:
                link = list(d.keys())[0]
                cols = list(d.values())[0]
                yield r.follow(link, callback = self.plinkpagecontent, meta = {'cols': cols, 'item': Parkdata})

    def plinkpagecontent(self, r):
        # Scrape a matched sub-page into the columns it matched on
        soup = bs(r.text, 'lxml')
        cols = r.meta['cols']
        Parkdata = r.meta['item']
        content = soup.find(class_ = re.compile('main-content'))
        for c in content.findAll('p'):
            for col in cols:
                Parkdata[col].append(c.text)
            append_data(c.text, Parkdata, 'searchcontent')
        for l in content.findAll(href = True):
            if not l['href'].startswith('http'):
                append_data(self.base_url + l['href'], Parkdata, 'searchlink')
            else:
                append_data(l['href'], Parkdata, 'searchlink')
        for img in content.findAll('img', src = True):
            if not img['src'].startswith('http'):
                append_data(self.base_url + img['src'], Parkdata, 'searchlink')
            else:
                append_data(img['src'], Parkdata, 'searchlink')
        yield Parkdata
It turns out the Scrapy item has to be created inside each iteration of the loop. I'd love some clarification on why that is. Likewise, moving the yield of the item outside the for loop worked for me. The first for loop now looks like this:
def parse(self, r):
    soup = bs(r.text, 'lxml')
    for s1 in tqdm(soup.findAll('div', class_ = 'col col-xs-12 col-sm-6 col-md-6 col-lg-4')[:2]):
        try:
            for div in s1.findAll(class_ = 'field-content parks-list-title'):
                Parkdata = generate_item(cols_data.index)  # fresh item for each park
                Parkdata['Park Image or Video'] = self.base_url + s1.find('img', src = True)['src']
                Parkdata['Protected Area/Park Name'] = div.find(href=True).text
                Parkdata['Website'] = self.base_url + div.find(href=True)['href']
                yield r.follow(div.find(href=True)['href'], callback = self.ppagecontent, dont_filter = True, meta = {'item': Parkdata})
                yield r.follow(div.find(href=True)['href'], callback = self.pparklinkcontent, dont_filter = True, meta = {'item': Parkdata})
        except AttributeError:
            print(div)
    yield Parkdata
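My best understanding of why this matters: Scrapy schedules requests asynchronously, so every request yielded from parse keeps a reference to whatever object was passed in meta. With a single module-level Parkdata, all of those requests point at the same object, and each callback overwrites the fields the previous one set. A plain-Python illustration of the shared-reference problem (no Scrapy involved, just made-up park names):

shared = {'name': None}
queued = []
for name in ['Cheaha', 'Gulf', 'Oak Mountain']:
    shared['name'] = name      # mutates the one shared object
    queued.append(shared)      # every entry is the same dict
print([d['name'] for d in queued])
# ['Oak Mountain', 'Oak Mountain', 'Oak Mountain'] -- last write wins

fresh = []
for name in ['Cheaha', 'Gulf', 'Oak Mountain']:
    fresh.append({'name': name})   # new object per iteration
print([d['name'] for d in fresh])
# ['Cheaha', 'Gulf', 'Oak Mountain']

Creating the item inside the loop, as above, gives each pair of requests its own object to fill in.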