Python 全局变量重置在Google应用程序引擎中不起作用
我在 GAE 的请求处理程序中调用一个网页爬虫函数，它会抓取一些图片并显示出来。第一次调用时一切正常，但之后再次调用时，显示的全是与上一次相同的图片，而且爬虫会从上次调用结束的位置继续抓取。我认为问题在于我的全局变量没有被正确重置。每次重新部署应用后，第一次运行都是正确的，但随后问题就会出现。下面是我的代码；如果有需要我澄清的地方请告诉我，不过我认为代码应该不难理解。标签：python, google-app-engine, global-variables, webapp2。下面是爬虫（scraper）函数：
# Module-level crawl state used by scrape_pages and scrape_bing_src.
# These objects belong to the module, so they keep their contents for as
# long as the process that imported the module stays alive.
visited_pages = list()    # URLs that scrape_pages has already processed
visit_queue = deque()     # links waiting to be crawled, consumed from the left
collected_pages = list()  # results returned by find_pages
collected_pics = list()   # results returned by add_pics
count, pic_count = 0, 0   # pages visited so far / value checked against pic_num
def scrape_pages(url, root_url, keywords=None, recurse=True):
    """Process one page, then keep crawling queued links until a limit is hit.

    The crawl state lives in module globals: ``visit_queue`` (links still to
    crawl), ``visited_pages``, ``collected_pages``, ``collected_pics``,
    ``count`` and ``pic_count``. Nothing is returned; results accumulate in
    ``collected_pics`` and ``collected_pages``.

    Args:
        url: Address of the page to fetch via ``soupify_url``.
        root_url: Passed to ``categorize_links`` together with ``url``.
        keywords: Optional list of keywords. When non-empty, the page is
            offered to ``find_pages``. Defaults to no keywords.
        recurse: When true, links found on the page are appended to
            ``visit_queue``.

    Returns:
        None.
    """
    # Only ``count`` is rebound here; the other globals are read or mutated
    # in place and therefore need no ``global`` declaration.
    global count

    # Use a fresh list per call rather than one default list object that
    # would be shared by every call of the function.
    if keywords is None:
        keywords = []

    max_count = 16  # stop crawling once this many pages were visited
    pic_num = 100   # stop crawling once pic_count exceeds this value

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('the keywords and url are')
    print(keywords)
    print(url)

    soup = soupify_url(url)

    # Only add new pages onto the queue if the recursion argument is true.
    if recurse:
        # Collect the href of every anchor on the page. An AttributeError
        # (e.g. ``soup`` has no ``findAll``) abandons this page.
        try:
            the_links = [tag.get('href') for tag in soup.findAll('a')]
        except AttributeError:
            return

        # A TypeError here (from the call or from unpacking its result)
        # also abandons this page.
        try:
            external_links, internal_links, root_links, _primary_links = (
                categorize_links(the_links, url, root_url))
        except TypeError:
            return

        # TODO: make the selection of link categories depend on the input.
        links_to_visit = external_links + internal_links + root_links

        # Queue every link that was neither visited nor queued already.
        for link in links_to_visit:
            if link not in visited_pages and link not in visit_queue:
                visit_queue.append(link)

    visited_pages.append(url)
    count = count + 1

    # Pages are only collected when keywords were supplied.
    if keywords:
        page_to_add = find_pages(url, soup, keywords)
        if page_to_add and page_to_add not in collected_pages:
            collected_pages.append(page_to_add)

    pics_to_add = add_pics(url, soup)
    if pics_to_add:
        collected_pics.extend(pics_to_add)

    # The actual recursion: drain the queue until a limit is reached.
    # NOTE(review): nothing in this function updates pic_count; presumably
    # add_pics does -- confirm, otherwise the pic_num limit never triggers.
    while visit_queue:
        if count >= max_count:
            return
        if pic_count > pic_num:
            return
        link = visit_queue.popleft()
        scrape_pages(link, root_url, keywords)
###done with the recursive scraping function here
#here I just get a list of links from Bing, add them to the queue and go through them then reset all the global variables
def scrape_bing_src(keywords):
    """Crawl the Bing results for ``keywords`` and return the collected pictures.

    The links returned by ``scrape_bing.get_links`` become the global
    ``visit_queue``; its first entry seeds ``scrape_pages``. The module-level
    crawl state is reset afterwards so that the next call starts clean.

    Args:
        keywords: List of keywords forwarded to ``scrape_bing.get_links`` and
            ``scrape_pages``.

    Returns:
        The list of pictures gathered during this crawl; an empty list when
        Bing returned no links.
    """
    # Declared before the first use of these names: assigning a name ahead
    # of its ``global`` statement is a SyntaxWarning on Python 2 and a
    # SyntaxError on Python 3.
    global collected_pics, collected_pages, pic_count, count
    global visited_pages, visit_queue

    visit_queue, the_url = scrape_bing.get_links(keywords, a_list=False)
    try:
        # With no links there is nothing to crawl, and popleft() on an empty
        # deque would raise IndexError.
        if visit_queue:
            scrape_pages(visit_queue.popleft(), the_url, keywords, recurse=True)
        # The return value is captured before ``finally`` rebinds the global,
        # so the caller receives this crawl's list.
        return collected_pics
    finally:
        # Reset on success *and* on failure: the module stays loaded between
        # requests, so state left behind by a crawl that raised would leak
        # into the next call. collected_pages is reset as well so it does
        # not grow without bound across calls.
        pic_count = 0
        count = 0
        visited_pages = []
        visit_queue = deque([])
        collected_pages = []
        collected_pics = []
#this just simply displays the images
class Try(BlogHandler):
    """Handler that displays the images scraped for a keyword string."""

    # Characters that must be escaped inside a single-quoted HTML attribute.
    # '&' comes first so the entities produced by later pairs are not
    # escaped a second time.
    _ATTR_ESCAPES = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )

    def get(self, keyword):
        """Scrape images for ``keyword`` and write one <img> tag per result.

        Args:
            keyword: Keyword string; it is split on whitespace into the list
                passed to ``scraper.scrape_bing_src``.
        """
        keyword_list = str(keyword).split()
        img_list = scraper.scrape_bing_src(keyword_list)
        for img in img_list:
            # Image URLs come from crawled pages, i.e. untrusted input:
            # escape them so a crafted URL cannot break out of the attribute.
            safe_src = img
            for raw, entity in self._ATTR_ESCAPES:
                safe_src = safe_src.replace(raw, entity)
            self.response.write("<br><img src='" + safe_src + "'>")
        self.response.write('we are done here')
下面是调用scraper函数的处理程序
# Module-level crawl state used by scrape_pages and scrape_bing_src.
# These objects belong to the module, so they keep their contents for as
# long as the process that imported the module stays alive.
visited_pages = list()    # URLs that scrape_pages has already processed
visit_queue = deque()     # links waiting to be crawled, consumed from the left
collected_pages = list()  # results returned by find_pages
collected_pics = list()   # results returned by add_pics
count, pic_count = 0, 0   # pages visited so far / value checked against pic_num
def scrape_pages(url, root_url, keywords=None, recurse=True):
    """Process one page, then keep crawling queued links until a limit is hit.

    The crawl state lives in module globals: ``visit_queue`` (links still to
    crawl), ``visited_pages``, ``collected_pages``, ``collected_pics``,
    ``count`` and ``pic_count``. Nothing is returned; results accumulate in
    ``collected_pics`` and ``collected_pages``.

    Args:
        url: Address of the page to fetch via ``soupify_url``.
        root_url: Passed to ``categorize_links`` together with ``url``.
        keywords: Optional list of keywords. When non-empty, the page is
            offered to ``find_pages``. Defaults to no keywords.
        recurse: When true, links found on the page are appended to
            ``visit_queue``.

    Returns:
        None.
    """
    # Only ``count`` is rebound here; the other globals are read or mutated
    # in place and therefore need no ``global`` declaration.
    global count

    # Use a fresh list per call rather than one default list object that
    # would be shared by every call of the function.
    if keywords is None:
        keywords = []

    max_count = 16  # stop crawling once this many pages were visited
    pic_num = 100   # stop crawling once pic_count exceeds this value

    # Single-argument print calls behave the same on Python 2 and Python 3.
    print('the keywords and url are')
    print(keywords)
    print(url)

    soup = soupify_url(url)

    # Only add new pages onto the queue if the recursion argument is true.
    if recurse:
        # Collect the href of every anchor on the page. An AttributeError
        # (e.g. ``soup`` has no ``findAll``) abandons this page.
        try:
            the_links = [tag.get('href') for tag in soup.findAll('a')]
        except AttributeError:
            return

        # A TypeError here (from the call or from unpacking its result)
        # also abandons this page.
        try:
            external_links, internal_links, root_links, _primary_links = (
                categorize_links(the_links, url, root_url))
        except TypeError:
            return

        # TODO: make the selection of link categories depend on the input.
        links_to_visit = external_links + internal_links + root_links

        # Queue every link that was neither visited nor queued already.
        for link in links_to_visit:
            if link not in visited_pages and link not in visit_queue:
                visit_queue.append(link)

    visited_pages.append(url)
    count = count + 1

    # Pages are only collected when keywords were supplied.
    if keywords:
        page_to_add = find_pages(url, soup, keywords)
        if page_to_add and page_to_add not in collected_pages:
            collected_pages.append(page_to_add)

    pics_to_add = add_pics(url, soup)
    if pics_to_add:
        collected_pics.extend(pics_to_add)

    # The actual recursion: drain the queue until a limit is reached.
    # NOTE(review): nothing in this function updates pic_count; presumably
    # add_pics does -- confirm, otherwise the pic_num limit never triggers.
    while visit_queue:
        if count >= max_count:
            return
        if pic_count > pic_num:
            return
        link = visit_queue.popleft()
        scrape_pages(link, root_url, keywords)
###done with the recursive scraping function here
#here I just get a list of links from Bing, add them to the queue and go through them then reset all the global variables
def scrape_bing_src(keywords):
    """Crawl the Bing results for ``keywords`` and return the collected pictures.

    The links returned by ``scrape_bing.get_links`` become the global
    ``visit_queue``; its first entry seeds ``scrape_pages``. The module-level
    crawl state is reset afterwards so that the next call starts clean.

    Args:
        keywords: List of keywords forwarded to ``scrape_bing.get_links`` and
            ``scrape_pages``.

    Returns:
        The list of pictures gathered during this crawl; an empty list when
        Bing returned no links.
    """
    # Declared before the first use of these names: assigning a name ahead
    # of its ``global`` statement is a SyntaxWarning on Python 2 and a
    # SyntaxError on Python 3.
    global collected_pics, collected_pages, pic_count, count
    global visited_pages, visit_queue

    visit_queue, the_url = scrape_bing.get_links(keywords, a_list=False)
    try:
        # With no links there is nothing to crawl, and popleft() on an empty
        # deque would raise IndexError.
        if visit_queue:
            scrape_pages(visit_queue.popleft(), the_url, keywords, recurse=True)
        # The return value is captured before ``finally`` rebinds the global,
        # so the caller receives this crawl's list.
        return collected_pics
    finally:
        # Reset on success *and* on failure: the module stays loaded between
        # requests, so state left behind by a crawl that raised would leak
        # into the next call. collected_pages is reset as well so it does
        # not grow without bound across calls.
        pic_count = 0
        count = 0
        visited_pages = []
        visit_queue = deque([])
        collected_pages = []
        collected_pics = []
#this just simply displays the images
class Try(BlogHandler):
    """Handler that displays the images scraped for a keyword string."""

    # Characters that must be escaped inside a single-quoted HTML attribute.
    # '&' comes first so the entities produced by later pairs are not
    # escaped a second time.
    _ATTR_ESCAPES = (
        ('&', '&amp;'),
        ('<', '&lt;'),
        ('>', '&gt;'),
        ('"', '&quot;'),
        ("'", '&#39;'),
    )

    def get(self, keyword):
        """Scrape images for ``keyword`` and write one <img> tag per result.

        Args:
            keyword: Keyword string; it is split on whitespace into the list
                passed to ``scraper.scrape_bing_src``.
        """
        keyword_list = str(keyword).split()
        img_list = scraper.scrape_bing_src(keyword_list)
        for img in img_list:
            # Image URLs come from crawled pages, i.e. untrusted input:
            # escape them so a crafted URL cannot break out of the attribute.
            safe_src = img
            for raw, entity in self._ATTR_ESCAPES:
                safe_src = safe_src.replace(raw, entity)
            self.response.write("<br><img src='" + safe_src + "'>")
        self.response.write('we are done here')
#这只是简单地显示图像
class Try(BlogHandler):
    def get(self, keyword):
        keyword = str(keyword)
        keyword_list = keyword.split()
        img_list = scraper.scrape_bing_src(keyword_list)
        for img in img_list:
            self.response.write("""<br><img src='""" + img + """'>""")
        self.response.write('we are done here')
您的代码并不是只在一台“服务器”的单个实例中运行——您可能已经注意到管理控制台中的“实例”选项卡。因此，即使在两次调用之间，请求也可能被切换到另一台服务器，或者进程被“重新启动”（详情可参阅相关文档）。在预热过程中，应用会从磁盘读入内存，然后才开始处理请求。因此，每次请求都可能落到一个已预先加载的 Python 实例上，而该实例持有它自己的一套全局变量值。
在您的情况下，最好使用（原文此处的链接文字已丢失）。可能与另一个问题重复。很棒的资料，不过相关信息似乎相当多，其中有些还相互矛盾。您认为是把这些变量设置为 None 更好，还是把调用它的函数放到原来的类之外、并让它返回结果更好？毫无疑问：在函数定义中使用
keywords=None
。不，我不希望全局变量在实例之间保留。我想我只需要修改代码，不再使用全局变量。全局变量并不会在实例之间“保留”。然而，一旦您的应用被加载到某个实例上，该进程就会一直运行直到被关闭——这意味着在处理程序处理完一个请求之后，全局变量依然存在。听起来您在这里肯定不需要全局数据。