Python: my script won't go on to the next page when scraping

Tags: python, web-scraping, beautifulsoup

I wrote a web scraping script and everything works except the next-page part. When I run the code to scrape data from the website, it only scrapes the first page and never moves on to the data on the other pages. I'm still new to web scraping with Python, so please guide me. Could you help me fix my code? Please take a look at it and help me out, thank you.

Here is my code:

import requests
from bs4 import BeautifulSoup
#import pandas as pd
#import pandas as pd
import csv

def get_page(url):
    response = requests.get(url)
    if not response.ok:
        print('server responded:', response.status_code)
    else:
        soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
    return soup

def get_detail_page(soup):

     try:
        title = (soup.find('h1',class_="cdm_style",id=False).text)
     except:
         title = 'Empty Title'
     try:
         collection = (soup.find('td',id="metadata_collec").find('a').text)
     except:
         collection = "Empty Collection"
     try:
         author = (soup.find('td',id="metadata_creato").text)
     except:
         author = "Empty Author"
     try:
         abstract = (soup.find('td',id="metadata_descri").text)
     except:
         abstract = "Empty Abstract"
     try:
         keywords = (soup.find('td',id="metadata_keywor").text)
     except:
         keywords = "Empty Keywords"
     try:
         publishers = (soup.find('td',id="metadata_publis").text)
     except:
         publishers = "Empty Publishers"
     try:
         date_original = (soup.find('td',id="metadata_contri").text)
     except:
         date_original = "Empty Date original"
     try:
        date_digital = (soup.find('td',id="metadata_date").text)
     except:
        date_digital = "Empty Date digital"
     try:
        formatt = (soup.find('td',id="metadata_source").text)
     except:
        formatt = "Empty Format"
     try:
        release_statement = (soup.find('td',id="metadata_rights").text)
     except:
        release_statement = "Empty Realease Statement"
     try:
        library = (soup.find('td',id="metadata_librar").text)
     except:
        library = "Empty Library"
     try:
        date_created = (soup.find('td',id="metadata_dmcreated").text)
     except:
        date_created = "Empty date Created"
     data = {
         'Title'        : title.strip(),
         'Collection'   : collection.strip(),
         'Author'       : author.strip(),
         'Abstract'     : abstract.strip(),
         'Keywords'     : keywords.strip(),
         'Publishers'   : publishers.strip(),
         'Date_original': date_original.strip(),
         'Date_digital' : date_digital.strip(),
         'Format'       : formatt.strip(),
         'Release-st'   : release_statement.strip(),
         'Library'      : library.strip(),
         'Date_created' : date_created.strip()


     }
     return data
def get_index_data(soup):
    try:
        titles_link = soup.find_all('a',class_="body_link_11")
    except:
        titles_link = []
    else:
        titles_link_output = []
        for link in titles_link:
            try:
                item_id = link.attrs.get('item_id', None) #All titles with valid links will have an item_id
                if item_id:
                    titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com",link.attrs.get('href', None)))
            except:
                continue
    return titles_link_output
def write_csv(data,url):
    with open('11_to_55.csv','a') as csvfile:
        writer = csv.writer(csvfile)
        row = [data['Title'], data['Collection'], data['Author'],
        data['Abstract'], data['Keywords'], data['Publishers'], data['Date_original'],
        data['Date_digital'], data['Format'], data['Release-st'], data['Library'],
        data['Date_created'], url]
        writer.writerow(row)
def main():
    #url = "http://cgsc.cdmhost.com/cdm/singleitem/collection/p4013coll8/id/2653/rec/1"
    #get_page(url)
    for x in range(1,4):
        mainurl = ("http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/order/nosort/page/")
        print(x)
        url = (mainurl + str(x))
        products = get_index_data(get_page(url))
        for product in products:
            data1 = get_detail_page(get_page(product))
            write_csv(data1,product)
    #write_csv(data,url)


if __name__ == '__main__':
    main()

I started out trying to figure out why the next page wasn't loading correctly, but before I found an answer I found another way to get the data you're looking for. The page has an option to change the number of results it returns. I changed it to 10000, and now every item in the collection loads on a single page.

If that isn't what you want and you'd still like to solve the page-changing problem, let me know and I'll take another look.

cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/display/10000/order/nosort/ad/asc
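 
For comparison, here is a minimal sketch of how that URL differs from the paged URL the question loops over. The segment names are taken directly from the two URLs in this post, not from any official documentation, so treat them as assumptions:

    # Both URLs share the same search prefix; only the trailing segments differ.
    base = "http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and"

    # Question's approach: request one result page at a time (page/1, page/2, ...).
    paged_url = base + "/order/nosort/page/1"

    # Answer's approach: ask for up to 10000 results in one go, so no page loop is needed.
    all_items_url = base + "/display/10000/order/nosort/ad/asc"

With the second URL, get_page() only needs to be called once to fetch the whole index.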

I tested loading the index page, but not the detail pages, since I didn't want to download the entire collection.

Here are the changes I made, along with a few suggestions:

  • There's no longer any need to loop over index pages; the page now returns every item at once.
  • Be specific about which exceptions you catch. In this case it's AttributeError.
  • Add a User-Agent to your requests; many sites will block requests that don't send one.
  • Good luck.

    import requests
    from bs4 import BeautifulSoup
    #import pandas as pd
    #import pandas as pd
    import csv
    
    def get_page(url):
        response = requests.get(url, headers={'User-Agent': "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.88 Safari/537.37"})
        if not response.ok:
            print('server responded:', response.status_code)
        else:
            soup = BeautifulSoup(response.text, 'html.parser') # 1. html , 2. parser
        return soup
    
    def get_detail_page(soup):
    
        # Be specific with your exception capturing. 
        try:
            title = (soup.find('h1',class_="cdm_style",id=False).text)
        except AttributeError:
            title = 'Empty Title'
        try:
            collection = (soup.find('td',id="metadata_collec").find('a').text)
        except AttributeError:
            collection = "Empty Collection"
        try:
            author = (soup.find('td',id="metadata_creato").text)
        except AttributeError:
            author = "Empty Author"
        try:
            abstract = (soup.find('td',id="metadata_descri").text)
        except AttributeError:
            abstract = "Empty Abstract"
        try:
            keywords = (soup.find('td',id="metadata_keywor").text)
        except AttributeError:
            keywords = "Empty Keywords"
        try:
            publishers = (soup.find('td',id="metadata_publis").text)
        except AttributeError:
            publishers = "Empty Publishers"
        try:
            date_original = (soup.find('td',id="metadata_contri").text)
        except AttributeError:
            date_original = "Empty Date original"
        try:
            date_digital = (soup.find('td',id="metadata_date").text)
        except AttributeError:
            date_digital = "Empty Date digital"
        try:
            formatt = (soup.find('td',id="metadata_source").text)
        except AttributeError:
            formatt = "Empty Format"
        try:
            release_statement = (soup.find('td',id="metadata_rights").text)
        except AttributeError:
            release_statement = "Empty Realease Statement"
        try:
            library = (soup.find('td',id="metadata_librar").text)
        except AttributeError:
            library = "Empty Library"
        try:
            date_created = (soup.find('td',id="metadata_dmcreated").text)
        except AttributeError:
            date_created = "Empty date Created"
        data = {
        'Title'        : title.strip(),
        'Collection'   : collection.strip(),
        'Author'       : author.strip(),
        'Abstract'     : abstract.strip(),
        'Keywords'     : keywords.strip(),
        'Publishers'   : publishers.strip(),
        'Date_original': date_original.strip(),
        'Date_digital' : date_digital.strip(),
        'Format'       : formatt.strip(),
        'Release-st'   : release_statement.strip(),
        'Library'      : library.strip(),
        'Date_created' : date_created.strip()
        }
        return data
    
    def get_index_data(soup):
        try:
            titles_link = soup.find_all('a',class_="body_link_11")
        except:
            titles_link = []
        else:
            titles_link_output = []
            for link in titles_link:
                try:
                    item_id = link.attrs.get('item_id', None) #All titles with valid links will have an item_id
                    if item_id:
                        titles_link_output.append("{}{}".format("http://cgsc.cdmhost.com",link.attrs.get('href', None)))
                except:
                    continue
        return titles_link_output
    
    def write_csv(data,url):
        with open('11_to_55.csv','a') as csvfile:
            writer = csv.writer(csvfile)
            row = [data['Title'], data['Collection'], data['Author'],
            data['Abstract'], data['Keywords'], data['Publishers'], data['Date_original'],
            data['Date_digital'], data['Format'], data['Release-st'], data['Library'],
            data['Date_created'], url]
            writer.writerow(row)
    
    def main():
        main_url = ("http://cgsc.cdmhost.com/cdm/search/collection/p4013coll8/searchterm/1/field/all/mode/all/conn/and/display/10000/order/nosort/ad/asc")
        products = get_index_data(get_page(main_url))
        print(products)
    #     for product in products:
    #         data1 = get_detail_page(get_page(product))
    #         write_csv(data1,product)
    #     write_csv(data,url)
    
    
    if __name__ == '__main__':
        main()
    

  • Maybe change it to url = f"{mainurl}{x}"? And drop the parentheses around it.
  • Hi, thanks, but it doesn't work. Please help me and guide me. Could you take a look at my code and help me fix it? Thank you.
  • Stack Overflow isn't a free debugging service; it's for specific technical questions. Also, don't use bare except clauses like that.
  • @AMC I think they asked a valid question. They got stuck on a problem, tried to solve it, asked for help, and gave us their code. Sure, they could have worded the question a little better, but it's a valid question.
  • Thank you, sir! Displaying 10000 records solved my problem. If a website only shows 20 records and there's no option to change how many are displayed, can we change that through code or some other way?