Python: I am trying to scrape an aspx site but am unable to get past page 2

Tags: python, web-scraping, urllib, python-3.5

I am trying to scrape an aspx site: https://www.aae.org/patients/find.aspx. For testing purposes, please use 33133 as the zipcode and 100 as the radius.

Initially, I collect the profile links by iterating through the search results pages. I successfully got the first 20 links from the first page, but I cannot get beyond page 1; the returned page source says 'Sorry, the page or file you are looking for cannot be found'.

Please see my code below:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv

profile_links = []

def result_checker(self):
    No_results = self.xpath('//td[@colspan="3"]//p//text()')
    if "No results" in str(No_results):
        print (str(No_results).replace("['","").replace(".']","")+" for other zipcodes")
        time.sleep(10)
        sys.exit()
    else:
        pass

def Get_data(zipcode, radius):
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'en-US,en;q=0.8,pt;q=0.6',
                'Connection':'keep-alive',
                'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
                'Host':'www.tcms.com',
                'Origin':'https://www.aae.org',
                'Referer':'https://www.aae.org/patients/find.aspx'}

    class MyOpener(urllib.request.FancyURLopener):
        version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'

    myopener = MyOpener()
    url = 'https://www.aae.org/patients/find.aspx'
    f = myopener.open(url)
    soup = BeautifulSoup(f,'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode',zipcode),
        ('EktronClientManager',EktronClientManager),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind','SEARCH'))

    encodedFields = urllib.parse.urlencode(formData)
    f1 = myopener.open(url, encodedFields)
    source = f1.read()
    target = open('sample.txt','w')
    target.write(str(source))
    target.close()
    source1 = html.fromstring(source)
    result_checker(source1)
    links = source1.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" and "AddressID" in each:
            print (each)
            profile_links.append("https://www.aae.org/patients/"+str(each))
        else:
            pass

    j = 2
    soup2 = BeautifulSoup(source,'lxml')
    viewstate = soup2.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']

    while j < 5:
        pages = 'Page$'+str(j)
        print (pages,'\n---------------')
        formData1 = (('__EVENTTARGET','ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
                    ('__EVENTARGUMENT',pages),
                    ('__VIEWSTATE',viewstate),
                    ('__EVENTVALIDATION',eventvalidation),
                    ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))

        encodedFields1 = urllib.parse.urlencode(formData1)
        f2 = myopener.open(url, encodedFields1)
        source2 = f2.read()
        target = open('sample.txt','w')
        target.write(str(source2))
        target.close()
        source3 = html.fromstring(source2)
        links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links2:
            if "MemberID" and "AddressID" in each1:
                print (each1)
                profile_links.append("https://www.aae.org/patients/"+str(each1))
            else:
                pass
        soup3 = BeautifulSoup(source2,'lxml')
        viewstate = soup3.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
        j+=1

if __name__ == "__main__":
    #Get_data('38132', 5)
    Get_data('33133', 100)

Yes, Greg Sadetsky, you were completely right about the cookies: create a session, then send every POST request with the required data parameters.

With the help of the Requests library, I was able to create a session that stores cookies which can be reused across requests:

import requests
from bs4 import BeautifulSoup
from lxml import html

def Get_data(zipcode, radius):
    All_links = []
    url = 'https://www.aae.org/patients/find.aspx'
    s = requests.Session()
    r = s.get(url)
    #print (r.text.encode('utf-8'))
    soup = BeautifulSoup(r.content,'lxml')
    # ASP.NET hidden form fields that must be echoed back with every postback
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']
    params = {'EktronClientManager':EktronClientManager,
              '__VIEWSTATE':viewstate,
              '__EVENTVALIDATION':eventvalidation,
              'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search',
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius':radius,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode':zipcode,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind':'SEARCH'}
    r2 = s.post(url,data=params)  # the session carries the cookies set by the GET
    source = html.fromstring(r2.content)
    links = source.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        # test both substrings explicitly; `"MemberID" and "AddressID" in each`
        # only ever checks the second one
        if "MemberID" in each and "AddressID" in each:
            print (each)
            All_links.append("https://www.aae.org/patients/"+str(each))
    #print (r2.content)
    soup1 = BeautifulSoup(r2.content,'lxml')
    viewstate = soup1.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup1.select("#EktronClientManager")[0]['value']
    j = 2
    while j < 7:
        page = 'Page$'+str(j)
        print (page)
        # paging is an ASP.NET postback: target the results grid and request 'Page$N'
        params1 = {'__EVENTTARGET':'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
                   '__EVENTARGUMENT':page,
                   'EktronClientManager':EktronClientManager,
                   '__VIEWSTATE':viewstate,
                   '__EVENTVALIDATION':eventvalidation,
                   'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search'}
        r3 = s.post(url,data=params1)
        source1 = html.fromstring(r3.content)
        links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links1:
            if "MemberID" in each1 and "AddressID" in each1:  # same explicit check as above
                print (each1)
                All_links.append("https://www.aae.org/patients/"+str(each1))
        # refresh the hidden fields from each response; the server issues new tokens per page
        soup2 = BeautifulSoup(r3.content,'lxml')
        viewstate = soup2.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
        EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
        j+=1
    # return the collected profile links so the caller can use them
    return All_links

all_links = Get_data(33133, 100)
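
Side note: the three hidden ASP.NET fields get re-read from every response, so that extraction can be factored into a small helper. Below is a minimal sketch; the helper name get_hidden_fields is my own and is not part of the code above:

from bs4 import BeautifulSoup

# Hypothetical helper: collect the ASP.NET postback tokens from a response body.
def get_hidden_fields(content):
    soup = BeautifulSoup(content, 'lxml')
    fields = {}
    for field_id in ('__VIEWSTATE', '__EVENTVALIDATION', 'EktronClientManager'):
        element = soup.select_one('#' + field_id)
        if element is not None:  # tolerate pages where a field is missing
            fields[field_id] = element['value']
    return fields

Since the dictionary keys match the form field names, each block of three soup.select(...) lines above collapses to something like params.update(get_hidden_fields(r2.content)).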