Python:每发出 4-5 个请求后出现 403 状态码,且每个请求之间已加入延时
我正在试图弄清楚我的请求头(headers)
或 cookies 是否有问题,因为我不知道为什么在 3-8 次请求之后会出现 403 状态码。
我正在尝试从此url访问数据:
用ticker
表示我感兴趣的股票代码
这是我的代码:
import os
import sys
sys.path.append(os.path.abspath(r'..\..\..\.'))

import glob
import math
import random
import re
import time

import numpy as np
import pandas as pd
import requests
import requests_html
from bs4 import BeautifulSoup

from STOCK_DATA.utils import *
# Pool of User-Agent strings (one is drawn at random per request) loaded
# via the project helper load_agents() from STOCK_DATA.utils.
USER_AGENTS = load_agents()
# Stock symbols to query against the Seeking Alpha symbol_data API.
TICKERS = ['aapl', 'tsla', 'fvrr', 'upwk', 'xone', 'ptmn', 'gogl', 'old',
'urov', 'mlpc', 'nss', 'pine', 'sxi', 'cai', 'pcgu', 'lcnb',
'psl', 'adm', 'nkx', 'len.b', 'ndp', 'eyesw', 'metpe', 'play',
'eig', 'dx', 'lvhd', 'rnmc', 'zyxi', 'qtec', 'cfx', 'mua',
'nvmi', 'tqqq', 'dcp', 'unma', 'nymt', 'dxyn', 'fthm', 'abg',
'bsl', 'eqrr', 'gnus', 'ccb', 'thmo', 'chmipb', 'uonek', 'agope',
'kl', 'rdhl', 'vctr', 'rndv', 'bg', 'tse', 'bgh', 'swm', 'acbi',
'brgpd', 'fedu', 'mdna', 'tzacu', 'nid', 'happ', 'bkpc', 'e',
'glre', 'infu', 'cffa', 'rdy', 'wrbpb', 'ctic', 'itub', 'all',
'mgrc', 'mfin', 'dsxpb', 'ameh', 'ulta', 'ahtpd', 'bap', 'cbpo',
'csa', 'holx', 'tlk', 'opy', 'phk', 'bprn', 'mlp', 'bscm', 'dzz',
'tslx', 'ce', 'ibth',
'sbpc', 'hsbcpa', 'ainv', 'xpo', 'dfphu', 'pdi', 'dgly']
下面的函数发出两个请求:
第一个请求只是获取cookie,因为在没有任何cookie的情况下直接请求api会导致403
def sa_test3(ticker):
    """Fetch symbol data for *ticker* from the Seeking Alpha API.

    Makes two requests on a fresh session: a plain page request that only
    populates the session's cookie jar (the API returns 403 without any
    cookies), then the JSON API call itself.

    Args:
        ticker: Stock symbol slug, e.g. 'aapl'.

    Returns:
        The requests.Response of the API call (its status code is also
        printed). Existing callers that ignore the return value are
        unaffected.
    """
    # random.choice is the idiomatic way to draw a single element.
    ua = random.choice(USER_AGENTS).strip()
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Host': 'seekingalpha.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua,
    }
    s = requests.Session()
    s.headers.update(headers)
    # Non-API page request: acquires the cookies the API endpoint requires.
    url = f'https://seekingalpha.com/symbol/{ticker}'
    s.get(url)
    # Prep for the real API call.
    params = (
        ('fields[]', 'long_desc'),
        ('slugs', f'{ticker}'),
    )
    # BUG FIX: dict.update() returns None, so the original
    # `headers2 = headers.update({...})` passed headers=None and the
    # Referer header was never sent with the API request. Build a new
    # dict instead of relying on the mutating call's return value.
    headers2 = {**headers, 'Referer': f'https://seekingalpha.com/symbol/{ticker}'}
    response = s.get('https://seekingalpha.com/api/v3/symbol_data',
                     params=params, headers=headers2)
    # Example url = https://seekingalpha.com/api/v3/symbol_data?fields[]=long_desc&slugs=TSLA
    print(response.status_code)
    return response
现在,我们只需循环浏览每个股票代码并添加睡眠:
def scraper():
    """Run sa_test3 once for every configured ticker.

    Sleeps a randomised 5-10 seconds between requests so the traffic
    looks less mechanical.
    """
    for symbol in TICKERS:
        sa_test3(symbol)
        pause = random.randint(5, 10)
        time.sleep(pause)
我能够成功完成 3-10 个请求,但之后 403 就开始出现了。
我不太清楚是什么引发了反机器人技术。
我尝试增加sleep
参数,但没有成功
多谢各位
以下是便于复制和粘贴的完整代码:
import os
import sys
sys.path.append(os.path.abspath(r'..\..\..\.'))
import pandas as pd
import numpy as np
import requests_html
from bs4 import BeautifulSoup
import math
import glob
import re
import random
from STOCK_DATA.utils import *
# Pool of User-Agent strings (one is drawn at random per request) loaded
# via the project helper load_agents() from STOCK_DATA.utils.
USER_AGENTS = load_agents()
# Stock symbols to query against the Seeking Alpha symbol_data API.
TICKERS = ['aapl', 'tsla', 'fvrr', 'upwk', 'xone', 'ptmn', 'gogl', 'old',
'urov', 'mlpc', 'nss', 'pine', 'sxi', 'cai', 'pcgu', 'lcnb',
'psl', 'adm', 'nkx', 'len.b', 'ndp', 'eyesw', 'metpe', 'play',
'eig', 'dx', 'lvhd', 'rnmc', 'zyxi', 'qtec', 'cfx', 'mua',
'nvmi', 'tqqq', 'dcp', 'unma', 'nymt', 'dxyn', 'fthm', 'abg',
'bsl', 'eqrr', 'gnus', 'ccb', 'thmo', 'chmipb', 'uonek', 'agope',
'kl', 'rdhl', 'vctr', 'rndv', 'bg', 'tse', 'bgh', 'swm', 'acbi',
'brgpd', 'fedu', 'mdna', 'tzacu', 'nid', 'happ', 'bkpc', 'e',
'glre', 'infu', 'cffa', 'rdy', 'wrbpb', 'ctic', 'itub', 'all',
'mgrc', 'mfin', 'dsxpb', 'ameh', 'ulta', 'ahtpd', 'bap', 'cbpo',
'csa', 'holx', 'tlk', 'opy', 'phk', 'bprn', 'mlp', 'bscm', 'dzz',
'tslx', 'ce', 'ibth',
'sbpc', 'hsbcpa', 'ainv', 'xpo', 'dfphu', 'pdi', 'dgly']
def sa_test3(ticker):
    """Fetch symbol data for *ticker* from the Seeking Alpha API.

    Makes two requests on a fresh session: a plain page request that only
    populates the session's cookie jar (the API returns 403 without any
    cookies), then the JSON API call itself.

    Args:
        ticker: Stock symbol slug, e.g. 'aapl'.

    Returns:
        The requests.Response of the API call (its status code is also
        printed). Existing callers that ignore the return value are
        unaffected.
    """
    # random.choice is the idiomatic way to draw a single element.
    ua = random.choice(USER_AGENTS).strip()
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'en-US,en;q=0.5',
        'Connection': 'keep-alive',
        'Host': 'seekingalpha.com',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': ua,
    }
    s = requests.Session()
    s.headers.update(headers)
    # Non-API page request: acquires the cookies the API endpoint requires.
    url = f'https://seekingalpha.com/symbol/{ticker}'
    s.get(url)
    # Prep for the real API call.
    params = (
        ('fields[]', 'long_desc'),
        ('slugs', f'{ticker}'),
    )
    # BUG FIX: dict.update() returns None, so the original
    # `headers2 = headers.update({...})` passed headers=None and the
    # Referer header was never sent with the API request. Build a new
    # dict instead of relying on the mutating call's return value.
    headers2 = {**headers, 'Referer': f'https://seekingalpha.com/symbol/{ticker}'}
    response = s.get('https://seekingalpha.com/api/v3/symbol_data',
                     params=params, headers=headers2)
    # Example url = https://seekingalpha.com/api/v3/symbol_data?fields[]=long_desc&slugs=TSLA
    print(response.status_code)
    return response
def scraper():
    """Run sa_test3 once for every configured ticker.

    Sleeps a randomised 8-10 seconds between requests so the traffic
    looks less mechanical.
    """
    for symbol in TICKERS:
        sa_test3(symbol)
        pause = random.randint(8, 10)
        time.sleep(pause)
# Script entry point: scrape every configured ticker once.
if __name__ == '__main__':
scraper()
我也被类似的问题困住了。你弄明白了吗?——没有,抱歉。如果我弄明白了,我会发帖说明的。