Python: I am trying to scrape an aspx site but am unable to get past page 2

Tags: python, web-scraping, urllib, python-3.5

I am trying to scrape an aspx site: https://www.aae.org/patients/find.aspx. For testing purposes, please use 33133 as the zipcode and 100 as the radius.

Initially, I collect the profile links by iterating through the search results pages. I successfully got the first 20 links from the first page, but I cannot get beyond page 1; the returned page source says 'Sorry, the page or file you are looking for cannot be found'.

Please see my code below:

#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys, re
import urllib.request, urllib.parse, time, csv
from bs4 import BeautifulSoup
from lxml import html
from sys import argv

profile_links = []

def result_checker(self):
    No_results = self.xpath('//td[@colspan="3"]//p//text()')
    if "No results" in str(No_results):
        print (str(No_results).replace("['","").replace(".']","")+" for other zipcodes")
        time.sleep(10)
        sys.exit()
    else:
        pass

def Get_data(zipcode, radius):
    headers = {'Accept':'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Encoding':'gzip, deflate',
                'Accept-Language':'en-US,en;q=0.8,pt;q=0.6',
                'Connection':'keep-alive',
                'Content-Type':'application/x-www-form-urlencoded; charset=UTF-8',
                'Host':'www.tcms.com',
                'Origin':'https://www.aae.org',
                'Referer':'https://www.aae.org/patients/find.aspx'}

    class MyOpener(urllib.request.FancyURLopener):
        version = 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.17 (KHTML, like Gecko) Chrome/24.0.1312.57 Safari/537.17'

    myopener = MyOpener()
    url = 'https://www.aae.org/patients/find.aspx'
    f = myopener.open(url)
    soup = BeautifulSoup(f,'lxml')
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']

    formData = (
        ('__EVENTVALIDATION', eventvalidation),
        ('__VIEWSTATE', viewstate),
        ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius', radius),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode',zipcode),
        ('EktronClientManager',EktronClientManager),
        ('ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind','SEARCH'))

    encodedFields = urllib.parse.urlencode(formData)
    f1 = myopener.open(url, encodedFields)
    source = f1.read()
    target = open('sample.txt','w')
    target.write(str(source))
    target.close()
    source1 = html.fromstring(source)
    result_checker(source1)
    links = source1.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        if "MemberID" and "AddressID" in each:
            print (each)
            profile_links.append("https://www.aae.org/patients/"+str(each))
        else:
            pass

    j = 2
    soup2 = BeautifulSoup(source,'lxml')
    viewstate = soup2.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']

    while j < 5:
        pages = 'Page$'+str(j)
        print (pages,'\n---------------')
        formData1 = (('__EVENTTARGET','ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults'),
                    ('__EVENTARGUMENT',pages),
                    ('__VIEWSTATE',viewstate),
                    ('__EVENTVALIDATION',eventvalidation),
                    ('ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch','Search'))

        encodedFields1 = urllib.parse.urlencode(formData1)
        f2 = myopener.open(url, encodedFields1)
        source2 = f2.read()
        target = open('sample.txt','w')
        target.write(str(source2))
        target.close()
        source3 = html.fromstring(source2)
        links2 = source3.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links2:
            if "MemberID" and "AddressID" in each1:
                print (each1)
                profile_links.append("https://www.aae.org/patients/"+str(each1))
            else:
                pass
        soup3 = BeautifulSoup(source2,'lxml')
        viewstate = soup3.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup3.select("#__EVENTVALIDATION")[0]['value']
        j+=1

if __name__ == "__main__":
    #Get_data('38132', 5)
    Get_data('33133', 100)

Yes, Greg Sadetsky, you were completely right about the cookies: create a session, then send every POST request with the required data parameters.

With the help of the Requests library, I was able to create a session that stores cookies which can be reused across requests:

import requests
from bs4 import BeautifulSoup
from lxml import html

def Get_data(zipcode, radius):
    All_links = []
    url = 'https://www.aae.org/patients/find.aspx'
    s = requests.Session()
    r = s.get(url)
    #print (r.text.encode('utf-8'))
    soup = BeautifulSoup(r.content,'lxml')
    # ASP.NET hidden form fields that must be echoed back with every postback
    viewstate = soup.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup.select("#EktronClientManager")[0]['value']
    params = {'EktronClientManager':EktronClientManager,
              '__VIEWSTATE':viewstate,
              '__EVENTVALIDATION':eventvalidation,
              'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search',
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$ddlRadius':radius,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$txtZipCode':zipcode,
              'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$btnFind':'SEARCH'}
    r2 = s.post(url,data=params)  # the session carries the cookies set by the GET
    source = html.fromstring(r2.content)
    links = source.xpath("//table[@class='Results']//tr//a//@href")
    for each in links:
        # test both substrings explicitly; `"MemberID" and "AddressID" in each`
        # only ever checks the second one
        if "MemberID" in each and "AddressID" in each:
            print (each)
            All_links.append("https://www.aae.org/patients/"+str(each))
    #print (r2.content)
    soup1 = BeautifulSoup(r2.content,'lxml')
    viewstate = soup1.select("#__VIEWSTATE")[0]['value']
    eventvalidation = soup1.select("#__EVENTVALIDATION")[0]['value']
    EktronClientManager = soup1.select("#EktronClientManager")[0]['value']
    j = 2
    while j < 7:
        page = 'Page$'+str(j)
        print (page)
        # paging is an ASP.NET postback: target the results grid and request 'Page$N'
        params1 = {'__EVENTTARGET':'ctl00$ctl00$cphContentTypes$cphPageContent$aaeFindEndo$grdResults',
                   '__EVENTARGUMENT':page,
                   'EktronClientManager':EktronClientManager,
                   '__VIEWSTATE':viewstate,
                   '__EVENTVALIDATION':eventvalidation,
                   'ctl00$ctl00$aaeUtilitySiteSearchWidget$tbxSiteSearch':'Search'}
        r3 = s.post(url,data=params1)
        source1 = html.fromstring(r3.content)
        links1 = source1.xpath("//table[@class='Results']//tr//a//@href")
        for each1 in links1:
            if "MemberID" in each1 and "AddressID" in each1:  # same explicit check as above
                print (each1)
                All_links.append("https://www.aae.org/patients/"+str(each1))
        # refresh the hidden fields from each response; the server issues new tokens per page
        soup2 = BeautifulSoup(r3.content,'lxml')
        viewstate = soup2.select("#__VIEWSTATE")[0]['value']
        eventvalidation = soup2.select("#__EVENTVALIDATION")[0]['value']
        EktronClientManager = soup2.select("#EktronClientManager")[0]['value']
        j+=1
    # return the collected profile links so the caller can use them
    return All_links

all_links = Get_data(33133, 100)
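
Side note: the three hidden ASP.NET fields get re-read from every response, so that extraction can be factored into a small helper. Below is a minimal sketch; the helper name get_hidden_fields is my own and is not part of the code above:

from bs4 import BeautifulSoup

# Hypothetical helper: collect the ASP.NET postback tokens from a response body.
def get_hidden_fields(content):
    soup = BeautifulSoup(content, 'lxml')
    fields = {}
    for field_id in ('__VIEWSTATE', '__EVENTVALIDATION', 'EktronClientManager'):
        element = soup.select_one('#' + field_id)
        if element is not None:  # tolerate pages where a field is missing
            fields[field_id] = element['value']
    return fields

Since the dictionary keys match the form field names, each block of three soup.select(...) lines above collapses to something like params.update(get_hidden_fields(r2.content)).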