Python Web抓取时出现索引错误_Python

Python Web抓取时出现索引错误

python

Python Web抓取时出现索引错误,python,Python,这是我的密码- # coding: utf-8 # ## Extracting just the links from the Security home page # In[126]: base_url = "https://www.cnet.com" additional_url = "/topics/security/how-to/" import re import numpy as np import requests from bs4 import BeautifulSoup

这是我的密码-

# coding: utf-8

# ## Extracting just the links from the Security home page

# In[126]:

base_url = "https://www.cnet.com"
additional_url = "/topics/security/how-to/"

import re
import numpy as np
import requests
from bs4 import BeautifulSoup
from time import sleep

# To keep a count of the number of articles to be scrapped
limit = 0;

next_page = base_url + additional_url

# List to store the links
list_of_links = []

# Change the limit as per requirements
while next_page and limit <= 200:

    temp_list_of_links = []
    # Load and extract the content of the page
    page = requests.get(next_page)
    #sleep(15)
    soup = BeautifulSoup(page.content, 'html.parser')

    # Find the 'news' links of the page
    for link in soup.find_all('a', href=True):
        if link['href'].startswith('/news/'):
            temp_list_of_links.append(link['href'])

    # Save the unique links
    link_list = set(temp_list_of_links)

    # Find the length of the list of unique links
    length = len(link_list)
    #print(length)

    # Add the links to the final list
    list_of_links.extend(link_list)

    #sleep(120)

    # Increment the limit
    limit = limit + length

    # Find the links of the Show More page
    next_page = soup.find('a', class_='load-more')

    # Change the href to the Show More page link
    if next_page : 
        next_page = base_url + next_page['href']




# In[127]:

# Final list with unique links
link_list = set(list_of_links)

# Remove the lone '/news'/ link
link_list.remove('/news/')

# Converting the set into a list
link_list = list(link_list)


# ## Extracting the data from each link

# In[128]:

all_articles = []
for item in link_list:

    new_page = base_url + item
    page = requests.get(new_page)
    soup = BeautifulSoup(page.content, 'html.parser')

    sleep(120)

    article = []
    article_title = soup.title.text
    article.append(article_title)

    #print(soup.prettify())

    article_content = []
    content = soup.find("div", {"class":"col-7 article-main-body row"}).findAll('p')

    # Writing the content found in the list in its text form
    for item in content:
        article_content.append(item.text)

    # Joining the list elements to form a proper paragraph
    article_content = " ".join(article_content)

    article.append(article_content)
    all_articles.append(article)


# In[129]:

import pandas as pd
df = pd.DataFrame()
df = df.append(all_articles)
df.to_csv('cnet.csv',encoding='utf-8')


# In[1181]:

#编码：utf-8
###仅从安全主页提取链接
#在[126]中：
基本url=”https://www.cnet.com"
附加_url=“/topics/security/how-to/”
进口稀土
将numpy作为np导入
导入请求
从bs4导入BeautifulSoup
从时间上导入睡眠
#记录要报废的物品数量
极限=0；
下一页=基本url+附加url
#列表以存储链接
链接列表=[]
#根据要求更改限制
而“下一页”和“限制”似乎是熊猫身上的一只虫子。如果试图将空列表附加到数据帧，则会出现此错误
这将引发相同的错误。因此，它不是代码中的错误
pandas.DataFrame().append([])

确保您的所有文章
不是空列表，并且不会出现此错误。
是否df.append（）返回任何内容？可能值得向pandas github问题跟踪器提交错误。但可能已经有报道了。目前，该图书馆有近2000期公开发行，考虑到该图书馆的受欢迎程度，这是一个非常高的数字。
pandas.DataFrame().append([])