PySpark: why does my DataStax Spark application use an unusual amount of memory?
Tags: pyspark, spark-streaming, datastax, datastax-enterprise

My system has 32 GB of memory. I currently have 3 machines: one is already down to 4 GB, another to only 10 GB, and the third to 14 GB, and the free memory keeps shrinking. The system has now been running for 44 hours; memory usage is already very high and still climbing, which will eventually cause a crash. Here is a screenshot of my Spark executors: [executor screenshot]. Below is my PySpark code. The longer it runs, the more memory it consumes, and I don't want that because my system will crash. Any suggestions on how to change this?
begin = datetime.now(tz)
print 'begin:%s' % (begin)
article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][1]['name']}))
speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
    .reduceByKey(lambda x,y:x+y) \
    .filter(lambda x:len(x[1])>=2) \
    .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[1],reverse = True)[0],sorted(x[1],key=lambda y:y[1],reverse = True)[1])) \
    .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
    .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':5*300*((x[1][1]-x[2][1])/((x[1][0]-x[2][0]).seconds/60.0))})) \
    .filter(lambda x:x[1]['comments']>0)
statistics = article_channels.join(speed_rdd) \
    .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})
timeone=datetime.now()-timedelta(hours=1)
timethree = datetime.now()-timedelta(hours=3)
timesix = datetime.now()-timedelta(hours=6)
timetwelve = datetime.now()-timedelta(hours=12)
timetwentyfour = datetime.now()-timedelta(hours=24)
result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
if result1.count()>0:
    print 'result1 insert=======>',result1.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'1'))
    resultschema1 = sqlContext.createDataFrame(result1)
    resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
if result3.count()>0:
    print 'result3 insert=======>',result3.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'3'))
    resultschema3 = sqlContext.createDataFrame(result3)
    resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
if result6.count()>0:
    print 'result6 insert=======>',result6.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'6'))
    resultschema6 = sqlContext.createDataFrame(result6)
    resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
if result12.count()>0:
    print 'result12 insert=======>',result12.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'12'))
    resultschema12 = sqlContext.createDataFrame(result12)
    resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
if result24.count()>0:
    print 'result24 insert=======>',result24.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'24'))
    resultschema24 = sqlContext.createDataFrame(result24)
    resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
end = datetime.now(tz)
print 'timeone:%s timethree:%s timesix:%s timetwelve:%s timetwentyfour:%s' % (timeone,timethree,timesix,timetwelve,timetwentyfour)
print 'all is done==================================:time:%s' % (datetime.now(tz))
print 'TOTAL TIME IS:%s seconds' % ((end-begin).seconds)
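For reference, the 'speed' value computed above is a comment-rate estimate: the change in comments between two snapshots of an article, divided by the minutes elapsed between them, scaled by 5*300 = 1500. A small worked example (all snapshot values below are invented for illustration and are not from my data):

from datetime import datetime

# Two snapshots of one article in the [at, comments, likes, reads, shares]
# format used above, five minutes apart.
newest = [datetime(2016, 1, 1, 12, 5), 30, 100, 0, 0]
previous = [datetime(2016, 1, 1, 12, 0), 20, 90, 0, 0]

elapsed_minutes = (newest[0] - previous[0]).seconds / 60.0   # 5.0
speed = 5 * 300 * ((newest[1] - previous[1]) / elapsed_minutes)
print 'speed:%s' % (speed)  # 1500 * (10 comments / 5.0 min) = 3000.0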
EDIT:
Here is my code, in two parts:
1) Read messages from Kafka and save them to Cassandra.
2) Read the data back from Cassandra, analyze it, and then save the analyzed data back to Cassandra.
import sys
import json
from pyspark import SparkContext, SparkConf, rddsampler
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.streaming.kafka import OffsetRange, KafkaUtils, TopicAndPartition
from datetime import datetime, timedelta
from dateutil.parser import parse
import pickle
from cassandra.cluster import Cluster
from expiringdict import ExpiringDict
import pytz
from dateutil.tz import tzutc
tz = pytz.timezone('')
appname = str(sys.argv[1])
source = str(sys.argv[2])
cluster = Cluster(['localhost'])
session_statis = cluster.connect('keyspace')
def read_json(x):
    # Parse a JSON string; return 0 if it is not valid JSON.
    try:
        y = json.loads(x)
    except:
        y = 0
    return y

def categoryTransform(x):
    # Extract the 'article' field from the message body; return 0 on failure.
    try:
        body = json.loads(x['body'])
        return (body['article'])
    except:
        return 0

def TransformInData(x):
    # Extract the 'articles' list from the message body; return 0 on failure.
    try:
        body = json.loads(x['body'])
        return (body['articles'])
    except:
        return 0

def axesTransformData(x):
    # Attach the message id to the parsed body; return 0 on failure.
    try:
        body = json.loads(x['body'])
        body['id'] = x['attrs']['id']
        return (body)
    except:
        return 0
def articleInCache(rdd):
    rdd_channel=rdd.map(lambda x:(x[1]['channel'],{'id':x[0],'title':x[1]['title'],'thumbnail':x[1]['thumbnail'],'url':x[1]['url'],'created_at':x[1]['created_at']})) \
        .join(channels).map(lambda x:{'id':x[1][0]['id'],'title':x[1][0]['title'],'thumbnail':x[1][0]['thumbnail'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'author':x[1][1]['name']})
    rdd_cassandra = rdd.map(lambda x:(x[0],(x[0],x[1]['thumbnail'] if x[1]['thumbnail'] else '',x[1]['title'],x[1]['url'],datetime.strptime(parse(x[1]['created_at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S")+timedelta(hours=8),source,x[1]['category'] if x[1]['category'] else '',x[1]['channel']))) \
        .subtract(articles)
    rdd_article = rdd_cassandra.map(lambda x:Row(id=x[1][0],source=x[1][5],thumbnail=x[1][1],title=x[1][2],url=x[1][3],created_at=x[1][4],category=x[1][6],channel=x[1][7]))
    rdd_schedule = rdd_cassandra.map(lambda x:Row(source=x[1][5],type='article',scheduled_for=x[1][4]+timedelta(minutes=5),id=x[1][0]))
    rdd_article_by_created_at = rdd_cassandra.map(lambda x:Row(source=x[1][5],created_at=x[1][4],article=x[1][0]))
    rdd_article_by_url = rdd_cassandra.map(lambda x:Row(url=x[1][3],article=x[1][0]))
    if rdd_article.count()>0:
        result_rdd_article = sqlContext.createDataFrame(rdd_article)
        result_rdd_article.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_schedule.count()>0:
        result_rdd_schedule = sqlContext.createDataFrame(rdd_schedule)
        result_rdd_schedule.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_article_by_created_at.count()>0:
        result_rdd_article_by_created_at = sqlContext.createDataFrame(rdd_article_by_created_at)
        result_rdd_article_by_created_at.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_article_by_url.count()>0:
        result_rdd_article_by_url = sqlContext.createDataFrame(rdd_article_by_url)
        result_rdd_article_by_url.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
def categoryInCache(x):
    article_update=articles.join(x).map(lambda x:Row(id=x[1][0][0],source=x[1][0][5],thumbnail=x[1][0][1],title=x[1][0][2],url=x[1][0][3],created_at=x[1][0][4],category=x[1][1]['category'],channel=x[1][0][7]))
    if article_update.count()>0:
        result_article_update = sqlContext.createDataFrame(article_update)
        result_article_update.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

def axesInCache(rdd):
    if rdd.count()>0:
        axes_rdd=rdd.map(lambda x:Row(article=x[0],at=datetime.strptime(parse(x[1]['at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S")+timedelta(hours=8),comments=x[1]['comments'],likes=x[1]['likes'],reads=0,shares=0))
        axesresult = sqlContext.createDataFrame(axes_rdd)
        axesresult.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
def statistics(rdd):
    article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][1]['name']}))
    speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
        .reduceByKey(lambda x,y:x+y) \
        .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],sorted(x[1],key=lambda y:y[0],reverse = True)[1]) if len(x[1])>=2 else (x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],[sorted(x[1],key=lambda y:y[0],reverse = True)[0][0]-timedelta(seconds=300),0,0,0,0])) \
        .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
        .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':5*300*((x[1][1]-x[2][1])/((x[1][0]-x[2][0]).seconds/60.0))})) \
        .filter(lambda x:x[1]['comments']>0)
    statistics = article_channels.join(speed_rdd) \
        .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})
    timeone=datetime.now()-timedelta(hours=1)
    timethree = datetime.now()-timedelta(hours=3)
    timesix = datetime.now()-timedelta(hours=6)
    timetwelve = datetime.now()-timedelta(hours=12)
    timetwentyfour = datetime.now()-timedelta(hours=24)
    result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    if result1.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'1'))
        resultschema1 = sqlContext.createDataFrame(result1)
        resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result3.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'3'))
        resultschema3 = sqlContext.createDataFrame(result3)
        resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result6.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'6'))
        resultschema6 = sqlContext.createDataFrame(result6)
        resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result12.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'12'))
        resultschema12 = sqlContext.createDataFrame(result12)
        resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result24.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'24'))
        resultschema24 = sqlContext.createDataFrame(result24)
        resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc,30)
sqlContext = SQLContext(sc)
channels = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,{'name':x.name}))
articles = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,(x.id,x.thumbnail,x.title,x.url,x.created_at+timedelta(hours=8),x.source,x.category,x.channel)))
articlestat = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.channel,{'id':x.id,'thumbnail':x.thumbnail,'title':x.title,'url':x.url,'created_at':x.created_at,'source':x.source,'category':x.category,'channel':x.channel}))
axes = sc.cassandraTable('keyspace','tablename')
topic = 'topic'
kafkaParams = {"metadata.broker.list": "localhost:9092"}
category = 'category_topic'
category_stream = KafkaUtils.createDirectStream(ssc, [category], kafkaParams)
category_join_stream = category_stream.map(lambda x:read_json(x[1])).filter(lambda x:x!=0).map(lambda x:categoryTransform(x)).filter(lambda x:x!=0).map(lambda x:(x['id'],x))
category_join_stream.transform(categoryInCache).pprint()
article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
article_join_stream.transform(articleInCache).pprint()
axes_topic = 'axes_topic'
axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
axes_join_stream = axes_stream.filter(lambda x:'delete' not in str(x)).map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).map(lambda x:(str(x['id']),x))#.map(lambda x:(x[0],{'id':x[0], 'attitudes':x[1]['likes'],'reposts':0,'comments':x[1]['comments'],'speed':x[1]['comments']}))
axes_join_stream.transform(axesInCache).pprint()
stat = article_join_stream.map(lambda x:x['id']).window(5*60,5*60)
stat.transform(statistics).pprint()
ssc.start()  # start the computation
ssc.awaitTermination()  # wait for the computation to terminate
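One pattern worth noting in the code above: each result RDD is evaluated at least twice per window, once by count() and again by createDataFrame(...).write, and nothing is persisted in between, so the full lineage (including the cassandraTable reads) is recomputed for every action. Below is a minimal, self-contained sketch of the persist()/unpersist() pattern that bounds that recomputation per batch; the RDD and its contents are invented for illustration and are not part of the application above:

from pyspark import SparkContext, StorageLevel

sc = SparkContext(appName="persist-sketch")
# A derived RDD that several actions will reuse in the same batch.
derived = sc.parallelize(range(1000)).map(lambda n: n * 2).filter(lambda n: n % 3 == 0)
derived.persist(StorageLevel.MEMORY_AND_DISK)  # materialized once, reused below
try:
    if derived.count() > 0:                    # first action computes and caches
        print 'sample:%s' % (derived.take(1))  # second action reads the cache
finally:
    derived.unpersist()                        # free executor memory before the next batch
sc.stop()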
Thank you very much for your reply.
Where do you read the data from, Kafka and Cassandra? Could you also add that code?
I have just updated the question with the code you asked for, thank you. Could you take another look? Thanks.