python程序在运行几天时会冻结
问题是: 我正在编写一个python程序,其目的是不断地从RSS提要收集新闻。我想让程序收集一周的数据。问题是,该计划永远不会持续到周末。有时在运行几天、有时几个小时甚至几分钟后会结冰。它总是冻结,没有错误。当我说冻结时,我的意思是解释器似乎还在运行,因为我不能给它任何额外的命令。我怎样才能解决这个问题 我将在下面发布代码。谢谢大家python程序在运行几天时会冻结,python,freeze,Python,Freeze,问题是: 我正在编写一个python程序,其目的是不断地从RSS提要收集新闻。我想让程序收集一周的数据。问题是,该计划永远不会持续到周末。有时在运行几天、有时几个小时甚至几分钟后会结冰。它总是冻结,没有错误。当我说冻结时,我的意思是解释器似乎还在运行,因为我不能给它任何额外的命令。我怎样才能解决这个问题 我将在下面发布代码。谢谢大家 from goose import Goose from requests import get import urllib2 import feedparser
from goose import Goose
from requests import get
import urllib2
import feedparser
from urllib2 import urlopen
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import re
import datetime as dt
import time
import os
Symbols=['AAPL','T','BA','XOM','GOOG','JPM','PG','WMT']
url='http://finance.yahoo.com/rss/headline?s='
for t in xrange(7):
AAPL=pd.DataFrame()
AAPL['Published']=""
AAPL['Title']=""
AAPL['link']=""
AAPL['ID']=""
AAPL['News']=""
T=pd.DataFrame()
T['Published']=""
T['Title']=""
T['link']=""
T['ID']=""
T['News']=""
BA=pd.DataFrame()
BA['Published']=""
BA['Title']=""
BA['link']=""
BA['ID']=""
BA['News']=""
XOM=pd.DataFrame()
XOM['Published']=""
XOM['Title']=""
XOM['link']=""
XOM['ID']=""
XOM['News']=""
GOOG=pd.DataFrame()
GOOG['Published']=""
GOOG['Title']=""
GOOG['link']=""
GOOG['ID']=""
GOOG['News']=""
JPM=pd.DataFrame()
JPM['Published']=""
JPM['Title']=""
JPM['link']=""
JPM['ID']=""
JPM['News']=""
PG=pd.DataFrame()
PG['Published']=""
PG['Title']=""
PG['link']=""
PG['ID']=""
PG['News']=""
WMT=pd.DataFrame()
WMT['Published']=""
WMT['Title']=""
WMT['link']=""
WMT['ID']=""
WMT['News']=""
DaysIDsAAPL=[]
DaysIDsT=[]
DaysIDsBA=[]
DaysIDsXOM=[]
DaysIDsGOOG=[]
DaysIDsJPM=[]
DaysIDsPG=[]
DaysIDsWMT=[]
count=0
AAPLCount=0
TCount=0
BACount=0
XOMCount=0
GOOGCount=0
JPMCount=0
PGCount=0
WMTCount=0
date=dt.date.today()
newpathAAPL = r'D:\News Data\AAPL\\'+str(t)
newpathT = r'D:\News Data\T\\'+str(t)
newpathBA = r'D:\News Data\BA\\'+str(t)
newpathXOM = r'D:\News Data\XOM\\'+str(t)
newpathGOOG = r'D:\News Data\GOOG\\'+str(t)
newpathJPM = r'D:\News Data\JPM\\'+str(t)
newpathPG = r'D:\News Data\PG\\'+str(t)
newpathWMT = r'D:\News Data\WMT\\'+str(t)
os.makedirs(newpathAAPL)
os.makedirs(newpathT)
os.makedirs(newpathBA)
os.makedirs(newpathXOM)
os.makedirs(newpathGOOG)
os.makedirs(newpathJPM)
os.makedirs(newpathPG)
os.makedirs(newpathWMT)
while dt.date.today()==date:
print "Loop"
try:
#AAPL inner most loop
d1=feedparser.parse(url+Symbols[0])
for x in xrange(len(d1['entries'])):
if int(d1.entries[x]['id'][14:]) not in DaysIDsAAPL:
DaysIDsAAPL.append(int(d1.entries[x]['id'][14:]))
y = len(AAPL.index.tolist())
m=re.search(r'\*(.*)',d1.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
AAPL.loc[y,'Title'] =d1.entries[x]['title'].encode('utf8')
AAPL.loc[y,'link'] =m.encode('utf8')
AAPL.loc[y,'Published'] =d1.entries[x]['published'].encode('utf8')
AAPL.loc[y,'ID'] =int(d1.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
AAPL.loc[y,'News'] = AAPLCount
AAPLCount+=1
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
except:
print m
print "AAPL"
else:
Text_file = open(newpathAAPL+r"\\"+str(AAPLCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
AAPL.loc[y,'News'] =AAPLCount
AAPLCount+=1
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
print "AAPL"
#T inner most loop
d2=feedparser.parse(url+Symbols[1])
for x in xrange(len(d2['entries'])):
if int(d2.entries[x]['id'][14:]) not in DaysIDsT:
DaysIDsT.append(int(d2.entries[x]['id'][14:]))
y = len(T.index.tolist())
m=re.search(r'\*(.*)',d2.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
T.loc[y,'Title'] =d2.entries[x]['title'].encode('utf8')
T.loc[y,'link'] =m.encode('utf8')
T.loc[y,'Published'] =d2.entries[x]['published'].encode('utf8')
T.loc[y,'ID'] =int(d2.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
T.loc[y,'News'] = TCount
TCount+=1
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
except:
print m
print "T"
else:
Text_file = open(newpathT+r"\\"+str(TCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
T.loc[y,'News'] =TCount
TCount+=1
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
print "T"
#BA inner most loop
d3=feedparser.parse(url+Symbols[2])
for x in xrange(len(d3['entries'])):
if int(d3.entries[x]['id'][14:]) not in DaysIDsBA:
DaysIDsBA.append(int(d3.entries[x]['id'][14:]))
y = len(BA.index.tolist())
m=re.search(r'\*(.*)',d3.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
BA.loc[y,'Title'] =d3.entries[x]['title'].encode('utf8')
BA.loc[y,'link'] =m.encode('utf8')
BA.loc[y,'Published'] =d3.entries[x]['published'].encode('utf8')
BA.loc[y,'ID'] =int(d3.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
BA.loc[y,'News'] = BACount
BACount+=1
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
except:
print m
print "BA"
else:
Text_file = open(newpathBA+r"\\"+str(BACount)+".txt", "w")
Text_file.write(text)
Text_file.close()
BA.loc[y,'News'] =BACount
BACount+=1
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
print "BA"
#XOM inner most loop
d4=feedparser.parse(url+Symbols[3])
for x in xrange(len(d4['entries'])):
if int(d4.entries[x]['id'][14:]) not in DaysIDsXOM:
DaysIDsXOM.append(int(d4.entries[x]['id'][14:]))
y = len(XOM.index.tolist())
m=re.search(r'\*(.*)',d4.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
XOM.loc[y,'Title'] =d4.entries[x]['title'].encode('utf8')
XOM.loc[y,'link'] =m.encode('utf8')
XOM.loc[y,'Published'] =d4.entries[x]['published'].encode('utf8')
XOM.loc[y,'ID'] =int(d4.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
XOM.loc[y,'News'] = XOMCount
XOMCount+=1
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
except:
print m
print "XOM"
else:
Text_file = open(newpathXOM+r"\\"+str(XOMCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
XOM.loc[y,'News'] =XOMCount
XOMCount+=1
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
#GOOG inner most loop
d5=feedparser.parse(url+Symbols[4])
for x in xrange(len(d5['entries'])):
if int(d5.entries[x]['id'][14:]) not in DaysIDsGOOG:
DaysIDsGOOG.append(int(d5.entries[x]['id'][14:]))
y = len(GOOG.index.tolist())
m=re.search(r'\*(.*)',d5.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
GOOG.loc[y,'Title'] =d5.entries[x]['title'].encode('utf8')
GOOG.loc[y,'link'] =m.encode('utf8')
GOOG.loc[y,'Published'] =d5.entries[x]['published'].encode('utf8')
GOOG.loc[y,'ID'] =int(d5.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
GOOG.loc[y,'News'] = GOOGCount
GOOGCount+=1
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
except:
print m
print "GOOG"
else:
Text_file = open(newpathGOOG+r"\\"+str(GOOGCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
GOOG.loc[y,'News'] =GOOGCount
GOOGCount+=1
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
print "GOOG"
#JPM inner most loop
d6=feedparser.parse(url+Symbols[5])
for x in xrange(len(d6['entries'])):
if int(d6.entries[x]['id'][14:]) not in DaysIDsJPM:
DaysIDsJPM.append(int(d6.entries[x]['id'][14:]))
y = len(JPM.index.tolist())
m=re.search(r'\*(.*)',d6.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
JPM.loc[y,'Title'] =d6.entries[x]['title'].encode('utf8')
JPM.loc[y,'link'] =m.encode('utf8')
JPM.loc[y,'Published'] =d6.entries[x]['published'].encode('utf8')
JPM.loc[y,'ID'] =int(d6.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == '':
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
JPM.loc[y,'News'] = JPMCount
JPMCount+=1
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
except:
print m
print "JPM"
else:
Text_file = open(newpathJPM+r"\\"+str(JPMCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
JPM.loc[y,'News'] =JPMCount
JPMCount+=1
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
print "JPM"
#PG inner most loop
d7=feedparser.parse(url+Symbols[6])
for x in xrange(len(d7['entries'])):
if int(d7.entries[x]['id'][14:]) not in DaysIDsPG:
DaysIDsPG.append(int(d7.entries[x]['id'][14:]))
y = len(PG.index.tolist())
m=re.search(r'\*(.*)',d7.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
PG.loc[y,'Title'] =d7.entries[x]['title'].encode('utf8')
PG.loc[y,'link'] =m.encode('utf8')
PG.loc[y,'Published'] =d7.entries[x]['published'].encode('utf8')
PG.loc[y,'ID'] =int(d7.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == "":
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
PG.loc[y,'News'] = PGCount
PGCount+=1
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
except:
print m
print "PG"
else:
Text_file = open(newpathPG+r"\\"+str(PGCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
PG.loc[y,'News'] =PGCount
PGCount+=1
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
print "PG"
#WMT inner most loop
d8=feedparser.parse(url+Symbols[7])
for x in xrange(len(d8['entries'])):
if int(d8.entries[x]['id'][14:]) not in DaysIDsWMT:
DaysIDsWMT.append(int(d8.entries[x]['id'][14:]))
y = len(WMT.index.tolist())
m=re.search(r'\*(.*)',d8.entries[x]['link'])
z=re.search(r'\?ru=yahoo\?mod=yahoo_itp',m.group(1))
if type(z) is not None:
m=re.sub(r'\?ru=yahoo\?mod=yahoo_itp', '', m.group(1))
WMT.loc[y,'Title'] =d8.entries[x]['title'].encode('utf8')
WMT.loc[y,'link'] =m.encode('utf8')
WMT.loc[y,'Published'] =d8.entries[x]['published'].encode('utf8')
WMT.loc[y,'ID'] =int(d8.entries[x]['id'][14:])
hdr = {'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36'}
page = get(m,headers=hdr)
extractor = Goose()
article = extractor.extract(raw_html=page.text)
text = article.cleaned_text.encode('utf8')
if text == "":
try:
url2 = m
req = urllib2.Request(url2, None, hdr)
html2 = urlopen(req).read().decode('utf8')
raw = BeautifulSoup(html2,"lxml").get_text().encode('utf8')
Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
Text_file.write(raw)
Text_file.close()
WMT.loc[y,'News'] = WMTCount
WMTCount+=1
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
except:
print m
print "WMT"
else:
Text_file = open(newpathWMT+r"\\"+str(WMTCount)+".txt", "w")
Text_file.write(text)
Text_file.close()
WMT.loc[y,'News'] =WMTCount
WMTCount+=1
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
print "WMT"
count+=1
print count
time.sleep(1)
except:
print "Error"
AAPL=AAPL.fillna("")
AAPL.to_csv(newpathAAPL+r'\Key.csv')
T=T.fillna("")
T.to_csv(newpathT+r'\Key.csv')
BA=BA.fillna("")
BA.to_csv(newpathBA+r'\Key.csv')
XOM=XOM.fillna("")
XOM.to_csv(newpathXOM+r'\Key.csv')
GOOG=GOOG.fillna("")
GOOG.to_csv(newpathGOOG+r'\Key.csv')
JPM=JPM.fillna("")
JPM.to_csv(newpathJPM+r'\Key.csv')
PG=PG.fillna("")
PG.to_csv(newpathPG+r'\Key.csv')
WMT=WMT.fillna("")
WMT.to_csv(newpathWMT+r'\Key.csv')
在这种情况下,如果程序收集的提要太多,或者系统上有其他活动进程(这就是冻结时间不同的原因),则会消耗太多RAM,请参阅 程序运行的进程将用于计算的数组和变量存储在进程内存(ram)中 您可以通过强制程序使用硬盘内存来修复此问题 有关解决方法(
搁置
,定期将收集的提要保存到文本文件(将信息从ram移动到rom并释放ram),…)请参阅以下链接
您有很多重复的代码;您可以将粗糙的代码提取到助手方法(小函数)中,然后传入参数。如果您的代码没有bug,请尝试禁用gc并手动收集垃圾(如果您使用的是2.6)。可能尝试分解代码以隔离问题。在不同的函数中分隔所有这些部分,并添加一个负责跟踪的装饰器,例如
时间戳-func name start-parameters
和时间戳-func name end-duration
。您所呼叫的服务可能存在一个问题,即您处理得不优雅,但在500行程序中很难判断!嘿,伙计们,谢谢你们的建议,我会尝试一下,然后报告发生了什么。对不起,代码太混乱了,我知道里面有很多冗余,我更多的是为了时间而不是清洁。事后看来,这显然很糟糕,哈哈