Python 3: how to scrape search results from a website that uses CSRF?

Tags: python, web-scraping, beautifulsoup, csrf

I am trying to scrape the results on French crowdlending fintech companies from a website:

When doing it manually, I select IFP in the radio buttons, and the site then gives me 13 pages of results with 10 results per page. Each result has a hyperlink that I also want to get information from and put into the final table.

My main problem seems to come from CSRF: the results address contains p_auth=8mxk0SsK, so I cannot simply loop over the result pages by changing p=2 into p=13 in the link:
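To illustrate, here is a minimal sketch of the naive loop I have in mind (only the relevant URL parameters are shown; the p_auth value is the one copied from my browser session, so it is not reusable):

import requests
from bs4 import BeautifulSoup

# naive idea: walk the 13 result pages by changing the page parameter in the URL;
# this fails because the p_auth token is tied to my browser session and expires
base = ("https://www.orias.fr/search?p_auth=8mxk0SsK"
        "&p_p_id=intermediaryDetailedSearch_WAR_oriasportlet"
        "&_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p={page}")

for page in range(1, 14):
    r = requests.get(base.format(page=page))
    soup = BeautifulSoup(r.text, 'html.parser')
    # the page comes back as if no results were requested, so there is no results table
    print(page, soup.find('table', attrs={'class': 'table'}))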

If I try manually through a VPN, the website address becomes stable:

So I tried to use it in my Python code:

import requests
from bs4 import BeautifulSoup

k = 1  # test: k goes from 1 to 13
url = ("http://www.orias.fr/search?p_p_id=intermediaryDetailedSearch_WAR_oriasportlet"
       "&p_p_lifecycle=0&p_p_state=normal&p_p_mode=view&p_p_col_id=column-1&p_p_col_count=1"
       "&_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p=" + str(k) +
       "&_intermediaryDetailedSearch_WAR_oriasportlet_implicitModel=true"
       "&_intermediaryDetailedSearch_WAR_oriasportlet_spring_render=searchResult")
response = requests.get(url, proxies=proxies)  # proxies: the VPN proxy it goes through (defined elsewhere)
soup = BeautifulSoup(response.text, 'html.parser')
table = soup.find('table', attrs={'class': 'table table-condensed table-striped table-bordered'})
table_rows = table.find_all('tr')
l = []
for tr in table_rows:
    td = tr.find_all('td')
    row = [cell.text for cell in td]
    l.append(row)
It doesn't work the way it does in a web browser; it just returns a page as if no results had been requested. Do you know how to make it work?

I would change the page parameter in the post request during a loop. Do an initial request to find the number of pages.

from bs4 import BeautifulSoup as bs
import requests, re, math
import pandas as pd

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Referer': 'https://www.orias.fr/web/guest/search'
}

params = [['p_p_id', 'intermediaryDetailedSearch_WAR_oriasportlet'],
    ['p_p_lifecycle', '0'],
    ['p_p_state', 'normal'],
    ['p_p_mode', 'view'],
    ['p_p_col_id', 'column-1'],
    ['p_p_col_count', '1'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p', '1'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_implicitModel', 'true'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_spring_render', 'searchResult']]

data = {
  'searchString': '',
  'address': '',
  'zipCodeOrCity': '',
  '_coa': 'on',
  '_aga': 'on',
  '_ma': 'on',
  '_mia': 'on',
  '_euIAS': 'on',
  'mandatorDenomination': '',
  'wantsMandator': 'no',
  '_cobsp': 'on',
  '_mobspl': 'on',
  '_mobsp': 'on',
  '_miobsp': 'on',
  '_bankActivities': '1',
  '_euIOBSP': 'on',
  '_cif': 'on',
  '_alpsi': 'on',
  '_cip': 'on',
  'ifp': 'true',
  '_ifp': 'on',
  'submit': 'Search'
}

p = re.compile(r'(\d+)\s+intermediaries found')

with requests.Session() as s:
    r = s.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
    soup = bs(r.content, 'lxml') 
    num_results = int(p.findall(r.text)[0])
    results_per_page = 20
    num_pages = math.ceil(num_results/results_per_page)
    df = pd.read_html(str(soup.select_one('.table')))[0]

    for i in range(2, num_pages + 1):
        params[6][1] = str(i)
        r = s.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
        soup = bs(r.content, 'lxml') 
        df_next = pd.read_html(str(soup.select_one('.table')))[0]
        df = pd.concat([df, df_next])

df.drop('Unnamed: 6', axis = 1, inplace = True)
df = df.reset_index(drop=True)
Check:
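A quick sanity check on the combined frame (just a sketch; the exact row count depends on the live site):

print(num_results, len(df))  # row count should match the number of intermediaries found
print(df.head())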


Here is the full code, which also takes into account that each result has a hyperlink I want to get information from and put into the final table. So for each firm I update the request parameters and headers, then scrape the date it was registered or deleted. There may be a more elegant way to present the code.

from bs4 import BeautifulSoup as bs
import requests, re, math
import pandas as pd

headers = {
    'Content-Type': 'application/x-www-form-urlencoded',
    'User-Agent': 'Mozilla/5.0',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
    'Referer': 'https://www.orias.fr/web/guest/search'
}

params = [['p_p_id', 'intermediaryDetailedSearch_WAR_oriasportlet'],
    ['p_p_lifecycle', '0'],
    ['p_p_state', 'normal'],
    ['p_p_mode', 'view'],
    ['p_p_col_id', 'column-1'],
    ['p_p_col_count', '1'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_d-16544-p', '1'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_implicitModel', 'true'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_spring_render', 'searchResult']]

data = {
  'searchString': '',
  'address': '',
  'zipCodeOrCity': '',
  '_coa': 'on',
  '_aga': 'on',
  '_ma': 'on',
  '_mia': 'on',
  '_euIAS': 'on',
  'mandatorDenomination': '',
  'wantsMandator': 'no',
  '_cobsp': 'on',
  '_mobspl': 'on',
  '_mobsp': 'on',
  '_miobsp': 'on',
  '_bankActivities': '1',
  '_euIOBSP': 'on',
  '_cif': 'on',
  '_alpsi': 'on',
  '_cip': 'on',
  'ifp': 'true',
  '_ifp': 'on',
  'submit': 'Search'
}

p = re.compile(r'(\d+)\s+intermediaries found')

def webdata(soup):
    # parse the results table: keep each cell's hyperlink when there is one, otherwise the cell text
    parsed_table = soup.find_all('table')[0]
    dataweb = [[td.a['href'] if td.find('a') else 
             ''.join(td.stripped_strings)
             for td in row.find_all('td')]
            for row in parsed_table.find_all('tr')]
    dfweb = pd.DataFrame(dataweb[1:], columns=['SIREN','ID','website','category','zipcode','city','website2'])
    dfweb = dfweb.loc[:,['ID','website']]
    dfweb.ID = dfweb.ID.astype(int)
    return dfweb

with requests.Session() as s:
    r = s.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
    soup = bs(r.content, 'lxml') 
    num_results = int(p.findall(r.text)[0])
    results_per_page = 20
    num_pages = math.ceil(num_results/results_per_page)
    df = pd.read_html(str(soup.select_one('.table')))[0]
    dfweb = webdata(soup)
    df = pd.merge(df,dfweb, on='ID')

    for i in range(2, num_pages + 1):
        params[6][1] = str(i)
        r = s.post('https://www.orias.fr/search', headers=headers, params=params, data=data)
        soup = bs(r.content, 'lxml') 
        df_next = pd.read_html(str(soup.select_one('.table')))[0]
        dfweb = webdata(soup)
        df_next  = pd.merge(df_next ,dfweb, on='ID')
        df = pd.concat([df, df_next])

df.drop('Unnamed: 6', axis = 1, inplace = True)
df = df.reset_index(drop=True) 

# list the ORIAS identity number given to firms
# take the last 6 characters of the link (the very last character of the link is a space)
df['oriasID'] = df.website.apply(lambda x: x[-7:][:6])
# strip the '=' sign and any other non-digit characters
df['oriasID'] = df.oriasID.apply(lambda y: ''.join(i for i in y if i.isdigit()))


# new parameters
def paramsub(IDi):
    return [['p_p_id', 'intermediaryDetailedSearch_WAR_oriasportlet'],
    ['p_p_lifecycle', '1'],
    ['p_p_state', 'normal'],
    ['p_p_mode', 'view'],
    ['p_p_col_id', 'column-1'],
    ['p_p_col_count', '1'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_myaction', 'viewDetails'],
    ['_intermediaryDetailedSearch_WAR_oriasportlet_partyId', IDi]]


df['date in'] = False
df['date out'] = False
with requests.Session() as s:
    for i in df.index:
        IDi = df.loc[i,'oriasID']
        r = s.post('https://www.orias.fr/search', headers=headers, params=paramsub(IDi), data=data)
        soup = bs(r.content, 'lxml') 
        # keep data inside blocint3 if "(IFP)" is in the text
        for rowi in soup.find_all('div',{'class':'blocint3'}):
            if 'IFP' in rowi.text:
                if 'Deleted' in rowi.text:
                    # identify date
                    df.loc[i,'date out'] = re.search(r'\d{2}-\d{2}-\d{4}', rowi.text).group()
                elif 'Registered' in rowi.text:
                    df.loc[i,'date in'] = re.search(r'\d{2}-\d{2}-\d{4}', rowi.text).group()

# dates to date format
df['date in'] = pd.to_datetime(df['date in'], format="%d-%m-%Y", errors='coerce')
df['date out'] = pd.to_datetime(df['date out'], format="%d-%m-%Y", errors='coerce')

# sort by dates
df = df.sort_values(by='date out',ascending=True)
df = df.sort_values(by='date in',ascending=True)

df = df.reset_index(drop=True) 
# export
df.to_csv('20190817_ORIAS_in_out.csv')
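
If the exported file is read back later (a small usage sketch assuming the same file name and column names), the date columns can be parsed straight back into datetimes:

import pandas as pd

# the first csv column is the saved index; parse both date columns on load
df = pd.read_csv('20190817_ORIAS_in_out.csv', index_col=0,
                 parse_dates=['date in', 'date out'])
print(df.dtypes)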

Thank you QHarr, that's exactly what I was trying to do! The second step for me now is to go to each company's extra page and get the date it was granted the right to be an IFP, or the date it was deleted. Unfortunately, they don't give us the date it was granted when it has been deleted. Adding these as columns of the dataframe would be perfect; I'll use the headers trick!