Web scraping 使用BeautifulSoup检索Google Scholar结果时遇到的问题
我将继续从我的这篇文章开始的分析。我在一个由四列组成的数据框架中获得了关于具体工作文件出版物的信息:出版年份、出版顺序(出版物每年的顺序,在这种情况下毫无用处)、标题和作者。因此,我想使用这个数据框来抓取Google Scholar并检索有关引用数量的信息。 因为一些论文的标题有点泛化,在某些情况下,谷歌学者的第一个成果实际上不是我感兴趣的。因此,为了进行更具针对性的研究,在创建执行研究的链接时,我已包括每篇论文的标题和作者。我已经按照线程编写了代码 注意:因为执行此刮取需要真实名称,所以我不希望创建示例数据帧。我将.csv文件上传到我的GitHub上Web scraping 使用BeautifulSoup检索Google Scholar结果时遇到的问题,web-scraping,beautifulsoup,google-scholar,Web Scraping,Beautifulsoup,Google Scholar,我将继续从我的这篇文章开始的分析。我在一个由四列组成的数据框架中获得了关于具体工作文件出版物的信息:出版年份、出版顺序(出版物每年的顺序,在这种情况下毫无用处)、标题和作者。因此,我想使用这个数据框来抓取Google Scholar并检索有关引用数量的信息。 因为一些论文的标题有点泛化,在某些情况下,谷歌学者的第一个成果实际上不是我感兴趣的。因此,为了进行更具针对性的研究,在创建执行研究的链接时,我已包括每篇论文的标题和作者。我已经按照线程编写了代码 注意:因为执行此刮取需要真实名称,所以我不希
import pandas as pd
import requests
from bs4 import BeautifulSoup as bs
from random import randint
from time import sleep
# Query Google Scholar once per working paper ("<title> <author>") and record
# the first hit's title and its "Cited by N" link text.
url = 'https://raw.githubusercontent.com/nicolacaravaggio/working_paper_roma3/master/rm3_working_paper_list.csv'
# `error_bad_lines` was removed in pandas 2.0; `on_bad_lines='skip'` is its
# replacement (pandas >= 1.3) and likewise drops malformed rows.
df = pd.read_csv(url, on_bad_lines='skip')

# One search string per row. Missing titles/authors are treated as empty text
# so the concatenation cannot fail on NaN.
papers = (df['title'].fillna('') + ' ' + df['author'].fillna('')).str.strip().tolist()

title_list_gs = []
citations_list_gs = []

with requests.Session() as s:
    for paper in papers:
        # Random pause between requests to space out the queries.
        sleep(randint(1, 3))
        # Pass the query through `params` so requests percent-encodes it;
        # titles containing spaces, '&' or '#' would otherwise corrupt the URL.
        r = s.get(
            'https://scholar.google.com/scholar',
            params={'q': paper, 'ie': 'UTF-8', 'oe': 'UTF-8', 'hl': 'en', 'btnG': 'Search'},
        )
        soup = bs(r.content, 'html.parser')

        # Title link of the first result, if the page has any result.
        title_tag = soup.select_one('h3.gs_rt a')
        title_gs = title_tag.text if title_tag is not None else 'No title'
        title_list_gs.append(title_gs)

        # `:contains` is deprecated in Soup Sieve; `:-soup-contains` is the
        # supported spelling of the same text-matching pseudo-class.
        cited_tag = soup.select_one('a:-soup-contains("Cited by")')
        citations_gs = cited_tag.text if cited_tag is not None else 'No citation count'
        citations_list_gs.append(citations_gs)

        print('Title:', title_gs, '; Citations:', citations_gs)
但是,我从这个脚本中得到的结果只是以下列表:
Title: No title ; Citations: No citation count
我不确定问题是出在我笨拙的脚本上(很有可能),还是谷歌阻止了我对 Google Scholar 的过多抓取。事实上,即使是我用作起点的那个帖子中的代码,也并不总能返回预期的结果。希望有人能给我一些建议。提前谢谢。听起来你像是触发了机器人检测。根据我抓取 Google Scholar 的个人经验,请求之间间隔 45 秒足以避免验证码和机器人检测。我有一个爬虫程序在未被检测到的情况下运行了 3 天以上。如果你确实被标记了,等待大约 2 小时就足以重新开始。
页面返回的是验证码,这就是它不返回任何数据的原因。
class ScholarScrape():
    """Rate-limited Google Scholar search helper.

    `search` fetches one result page and stores the parsed document on
    `self.page`; `get_results_count` then reads the result count from it.
    Consecutive searches are spaced at least `min_time_between_scrape`
    seconds apart.
    """

    # Text looked for in the fetched page to recognise that Google refused
    # the request, and the message raised when it is found.
    _BLOCKED_MARKER = "Our systems have detected unusual traffic from your computer network"
    _BLOCKED_MESSAGE = "Google has blocked this computer for a short time because it has detected this scraping script."

    def __init__(self):
        """Read the delay and User-Agent from the 'scholar' config section."""
        self.page = None  # parsed document of the most recent search
        self.last_url = None  # URL of the most recent search
        self.last_time = time.time()  # timestamp used to space out requests
        self.min_time_between_scrape = int(ConfigFile.instance().config.get('scholar','bot_avoidance_time'))
        self.header = {'User-Agent':ConfigFile.instance().config.get('scholar','user_agent')}
        self.session = requests.Session()

    def search(self, query=None, year_lo=None, year_hi=None, title_only=False, publication_string=None, author_string=None, include_citations=True, include_patents=True):
        """Fetch the result page for the given criteria into `self.page`.

        Sleeps first when less than `min_time_between_scrape` seconds have
        passed since the previous request. Parameters are forwarded
        unchanged to `get_url`.

        Raises:
            BotDetectionException: the returned page contains Google's
                "unusual traffic" notice.
        """
        url = self.get_url(query, year_lo, year_hi, title_only, publication_string, author_string, include_citations, include_patents)
        # The original wrapped this in `while True: ... return`, which only
        # ever ran once; the straight-line form is equivalent.
        wait_time = self.min_time_between_scrape - (time.time() - self.last_time)
        if wait_time > 0:
            logger.info("Delaying search by %s seconds to avoid bot detection.", wait_time)
            time.sleep(wait_time)
        self.last_time = time.time()
        logger.info("SCHOLARSCRAPE: " + url)
        self.page = BeautifulSoup(self.session.get(url, headers=self.header).text, 'html.parser')
        self.last_url = url
        self._raise_if_blocked()

    def get_url(self, query=None, year_lo=None, year_hi=None, title_only=False, publication_string=None, author_string=None, include_citations=True, include_patents=True):
        """Build the search URL for the given criteria.

        Args:
            query: search words; `None` yields an empty `as_q`.
            year_lo, year_hi: year bounds, added only when they contain a
                four-digit number starting with 1-3.
            title_only: restrict word matching to titles (`as_occt=title`)
                instead of anywhere (`as_occt=any`).
            publication_string: quoted into `as_publication` when given.
            author_string: quoted into `as_sauthors` when given.
            include_citations: emits `as_vis=0` when true, else `as_vis=1`.
            include_patents: emits `as_sdt=0` when true, else `as_sdt=1`.

        Returns:
            The complete URL as a string.
        """
        base_url = "https://scholar.google.com.au/scholar?"
        # `quote(None)` raises TypeError, so the default query maps to "".
        url = base_url + "as_q=" + urllib.parse.quote("" if query is None else str(query))
        if self._looks_like_year(year_lo):
            url += "&as_ylo=" + str(year_lo)
        if self._looks_like_year(year_hi):
            url += "&as_yhi=" + str(year_hi)
        # The match scope belongs in `as_occt`. The original wrote it into
        # `as_yhi`, which duplicated/overrode the upper year bound.
        url += "&as_occt=title" if title_only else "&as_occt=any"
        if publication_string is not None:
            url += "&as_publication=" + urllib.parse.quote('"' + str(publication_string) + '"')
        if author_string is not None:
            url += "&as_sauthors=" + urllib.parse.quote('"' + str(author_string) + '"')
        url += "&as_vis=0" if include_citations else "&as_vis=1"
        url += "&as_sdt=0" if include_patents else "&as_sdt=1"
        return url

    def get_results_count(self):
        """Return the result count shown on the last fetched page.

        Looks at the second `div.gs_ab_mdw` first, then at every such div,
        and returns the count taken from the first text containing a digit.

        Returns:
            The count as a digit string with commas removed, or the integer
            0 when no candidate text contains a digit.

        Raises:
            BotDetectionException: fewer than two candidate divs exist and
                the page contains Google's "unusual traffic" notice.
            IndexError: fewer than two candidate divs exist otherwise.
        """
        candidates = self.page.findAll("div", {"class": "gs_ab_mdw"})
        try:
            text = candidates[1].text.strip()
        except IndexError:
            # A missing summary div may simply mean the request was refused.
            self._raise_if_blocked()
            raise
        if self.has_numbers(text):
            return self.get_results_count_from_soup_string(text)
        for candidate in candidates:
            text = candidate.text.strip()
            if self.has_numbers(text):
                return self.get_results_count_from_soup_string(text)
        return 0

    def _raise_if_blocked(self):
        """Raise BotDetectionException if `self.page` holds the block notice."""
        if self._BLOCKED_MARKER in str(self.page):
            raise BotDetectionException(self._BLOCKED_MESSAGE)

    @staticmethod
    def _looks_like_year(value):
        """Return True when `value` is not None and contains a 1000-3999 style year."""
        return value is not None and bool(re.match(r'.*([1-3][0-9]{3})', str(value)))

    @staticmethod
    def get_results_count_from_soup_string(element):
        """Extract the count token from a result-summary string.

        The count is the second space-separated token when the text contains
        "About", otherwise the first; commas are removed.
        """
        position = 1 if "About" in element else 0
        return element.split(" ")[position].strip().replace(",", "")

    @staticmethod
    def has_numbers(input_string):
        """Return True when `input_string` contains at least one digit."""
        return any(char.isdigit() for char in input_string)
class BotDetectionException(Exception):
    """Raised when a fetched page contains Google's "unusual traffic" notice."""
if __name__ == "__main__":
    # Demo: count results for the exact phrase "policy shaping" by author
    # "gilboa" between 1995 and 2005. A `publication_string` (e.g. "JMLR")
    # can be passed as well to narrow the search to one venue.
    scraper = ScholarScrape()
    scraper.search(
        query="\"policy shaping\"",
        author_string="gilboa",
        year_lo="1995",
        year_hi="2005",
    )
    result_count = scraper.get_results_count()
    print(result_count)