Python Web抓取时出现索引错误
这是我的密码-Python Web抓取时出现索引错误,python,Python,这是我的密码- # coding: utf-8 # ## Extracting just the links from the Security home page # In[126]: base_url = "https://www.cnet.com" additional_url = "/topics/security/how-to/" import re import numpy as np import requests from bs4 import BeautifulSoup
# coding: utf-8
# ## Extracting just the links from the Security home page
# In[126]:
base_url = "https://www.cnet.com"
additional_url = "/topics/security/how-to/"
import re
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep
# To keep a count of the number of articles to be scrapped
limit = 0;
next_page = base_url + additional_url
# List to store the links
list_of_links = []
# Change the limit as per requirements
while next_page and limit <= 200:
temp_list_of_links = []
# Load and extract the content of the page
page = requests.get(next_page)
#sleep(15)
soup = BeautifulSoup(page.content, 'html.parser')
# Find the 'news' links of the page
for link in soup.find_all('a', href=True):
if link['href'].startswith('/news/'):
temp_list_of_links.append(link['href'])
# Save the unique links
link_list = set(temp_list_of_links)
# Find the length of the list of unique links
length = len(link_list)
#print(length)
# Add the links to the final list
list_of_links.extend(link_list)
#sleep(120)
# Increment the limit
limit = limit + length
# Find the links of the Show More page
next_page = soup.find('a', class_='load-more')
# Change the href to the Show More page link
if next_page :
next_page = base_url + next_page['href']
# In[127]:
# Final list with unique links
link_list = set(list_of_links)
# Remove the lone '/news'/ link
link_list.remove('/news/')
# Converting the set into a list
link_list = list(link_list)
# ## Extracting the data from each link
# In[128]:
all_articles = []
for item in link_list:
new_page = base_url + item
page = requests.get(new_page)
soup = BeautifulSoup(page.content, 'html.parser')
sleep(120)
article = []
article_title = soup.title.text
article.append(article_title)
#print(soup.prettify())
article_content = []
content = soup.find("div", {"class":"col-7 article-main-body row"}).findAll('p')
# Writing the content found in the list in its text form
for item in content:
article_content.append(item.text)
# Joining the list elements to form a proper paragraph
article_content = " ".join(article_content)
article.append(article_content)
all_articles.append(article)
# In[129]:
import pandas as pd
df = pd.DataFrame()
df = df.append(all_articles)
df.to_csv('cnet.csv',encoding='utf-8')
# In[1181]:
#编码:utf-8
###仅从安全主页提取链接
#在[126]中:
基本url=”https://www.cnet.com"
附加_url=“/topics/security/how-to/”
进口稀土
将numpy作为np导入
导入请求
从bs4导入BeautifulSoup
从时间上导入睡眠
#记录要报废的物品数量
极限=0;
下一页=基本url+附加url
#列表以存储链接
链接列表=[]
#根据要求更改限制
而“下一页”和“限制”似乎是熊猫身上的一只虫子。如果试图将空列表附加到数据帧,则会出现此错误
这将引发相同的错误。因此,它不是代码中的错误
pandas.DataFrame().append([])
确保您的所有文章
不是空列表,并且不会出现此错误。是否df.append()
返回任何内容?可能值得向pandas github问题跟踪器提交错误。但可能已经有报道了。目前,该图书馆有近2000期公开发行,考虑到该图书馆的受欢迎程度,这是一个非常高的数字。
pandas.DataFrame().append([])