PySpark: why does my DataStax Spark application use an unusual amount of memory?


My system memory is 32 GB. I currently have 3 machines: one is already down to 4 GB of free memory, another has only 10 GB left, and the third has 14 GB, and the free memory keeps shrinking. The system has now been running for 44 hours, memory usage is already very high and still climbing, and this will eventually cause a crash.

Here is a screenshot of my Spark executors:

Here is my PySpark code. The longer it runs, the more memory it consumes, which I don't want because eventually my system will crash. Any suggestions on how to change this?

begin = datetime.now(tz)
print 'begin:%s' % (begin)

article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][1]['name']}))

speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
            .reduceByKey(lambda x,y:x+y) \
            .filter(lambda x:len(x[1])>=2) \
            .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[1],reverse = True)[0],sorted(x[1],key=lambda y:y[1],reverse = True)[1])) \
            .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
            .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':5*300*((x[1][1]-x[2][1])/((x[1][0]-x[2][0]).seconds/60.0))})) \
            .filter(lambda x:x[1]['comments']>0)

statistics = article_channels.join(speed_rdd)  \
            .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})


timeone=datetime.now()-timedelta(hours=1)
timethree = datetime.now()-timedelta(hours=3)
timesix = datetime.now()-timedelta(hours=6)
timetwelve = datetime.now()-timedelta(hours=12)
timetwentyfour = datetime.now()-timedelta(hours=24)

result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at'],genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))

if result1.count()>0:
    print 'result1 insert=======>',result1.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'1'))
    resultschema1 = sqlContext.createDataFrame(result1)
    resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
if result3.count()>0:   
    print 'result3 insert=======>',result3.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'3'))
    resultschema3 = sqlContext.createDataFrame(result3)
    resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")

if result6.count()>0:
    print 'result6 insert=======>',result6.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'6'))
    resultschema6 = sqlContext.createDataFrame(result6)
    resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")

if result12.count()>0:
    print 'result12 insert=======>',result12.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'12'))
    resultschema12 = sqlContext.createDataFrame(result12)
    resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")

if result24.count()>0:
    print 'result24 insert=======>',result24.take(1)
    session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'24'))
    resultschema24 = sqlContext.createDataFrame(result24)
    resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="statistics", keyspace = "statistics").save(mode ="append")
end = datetime.now(tz)
print 'timeone:%s  timethree:%s timesix:%s timetwelve:%s timetwentyfour:%s' % (timeone,timethree,timesix,timetwelve,timetwentyfour)
print 'all is done==================================:time:%s' % (datetime.now(tz))
print 'TOTAL TIME IS:%s seconds' % ((end-begin).seconds)
Edit:

Here is my code. It has two parts: 1) read messages from Kafka and save them to Cassandra; 2) read the data back from Cassandra, analyze it, and save the analyzed results to Cassandra again.
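To make the full script easier to follow, here is a simplified, hypothetical outline of that two-part structure. It assumes the same Spark 1.x APIs the real script uses (SQLContext, KafkaUtils.createDirectStream, and the spark-cassandra-connector DataFrame writer); the topic, keyspace, and table names are placeholders, and the outline uses foreachRDD for the per-batch write, whereas my real code below does the same thing inside transform(...).pprint().

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils

# Simplified outline only; names are placeholders, not my real tables.
conf = SparkConf().setAppName("outline")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc, 30)                      # 30-second batches

kafkaParams = {"metadata.broker.list": "localhost:9092"}
stream = KafkaUtils.createDirectStream(ssc, ['topic'], kafkaParams)

# Part 1: persist each Kafka micro-batch to Cassandra.
def save_batch(rdd):
    rows = rdd.map(lambda kv: Row(id=str(kv[0]), body=kv[1]))  # illustrative mapping
    if rows.count() > 0:
        sqlContext.createDataFrame(rows).write \
            .format("org.apache.spark.sql.cassandra") \
            .options(table="tablename", keyspace="keyspace") \
            .save(mode="append")

stream.foreachRDD(save_batch)

# Part 2: every 5 minutes, read the stored rows back from Cassandra,
# aggregate them, and write the results to another Cassandra table
# (this is what the statistics() function in the full script does).

ssc.start()
ssc.awaitTermination()

In the real script, part 2 is the statistics() function, which is driven by a 5-minute window over the article stream. The actual code follows: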

import sys
import json
from pyspark import SparkContext, SparkConf, rddsampler
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext, Row
from pyspark.streaming.kafka import OffsetRange, KafkaUtils, TopicAndPartition
from datetime import datetime, timedelta 
from dateutil.parser import parse 
import pickle
from cassandra.cluster import Cluster
from expiringdict import ExpiringDict
import pytz
from dateutil.tz import tzutc
tz = pytz.timezone('')
appname = str(sys.argv[1])
source = str(sys.argv[2])
cluster = Cluster(['localhost']);
session_statis = cluster.connect('keyspace')
def read_json(x):
    try:
        y = json.loads(x)
    except:
        y = 0
    return y
def categoryTransform(x):
    try:
        body = json.loads(x['body'])
        return (body['article'])
    except:
        return 0
def TransformInData(x):
    try:
        body = json.loads(x['body'])
        return (body['articles'])
    except:
        return 0
def axesTransformData(x):
    try:
        body = json.loads(x['body'])
        body['id'] = x['attrs']['id']
        return (body)
    except:
        return 0

def articleInCache(rdd):
    rdd_channel=rdd.map(lambda x:(x[1]['channel'],{'id':x[0],'title':x[1]['title'],'thumbnail':x[1]['thumbnail'],'url':x[1]['url'],'created_at':x[1]['created_at']})) \
        .join(channels).map(lambda x:{'id':x[1][0]['id'],'title':x[1][0]['title'],'thumbnail':x[1][0]['thumbnail'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'author':x[1][1]['name']})
    rdd_cassandra = rdd.map(lambda x:(x[0],(x[0],x[1]['thumbnail'] if x[1]['thumbnail'] else '',x[1]['title'],x[1]['url'],datetime.strptime(parse(x[1]['created_at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S")+timedelta(hours=8),source,x[1]['category'] if x[1]['category'] else '',x[1]['channel']))) \
                        .subtract(articles)
    rdd_article = rdd_cassandra.map(lambda x:Row(id=x[1][0],source=x[1][5],thumbnail=x[1][1],title=x[1][2],url=x[1][3],created_at=x[1][4],category=x[1][6],channel=x[1][7]))
    rdd_schedule = rdd_cassandra.map(lambda x:Row(source=x[1][5],type='article',scheduled_for=x[1][4]+timedelta(minutes=5),id=x[1][0]))
    rdd_article_by_created_at = rdd_cassandra.map(lambda x:Row(source=x[1][5],created_at=x[1][4],article=x[1][0]))
    rdd_article_by_url = rdd_cassandra.map(lambda x:Row(url=x[1][3],article=x[1][0]))
    if rdd_article.count()>0:
        result_rdd_article = sqlContext.createDataFrame(rdd_article)
        result_rdd_article.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_schedule.count()>0:   
        result_rdd_schedule = sqlContext.createDataFrame(rdd_schedule)
        result_rdd_schedule.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_article_by_created_at.count()>0:  
        result_rdd_article_by_created_at = sqlContext.createDataFrame(rdd_article_by_created_at)
        result_rdd_article_by_created_at.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if rdd_article_by_url.count()>0:   
        result_rdd_article_by_url = sqlContext.createDataFrame(rdd_article_by_url)
        result_rdd_article_by_url.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")



def categoryInCache(x):
    article_update=articles.join(x).map(lambda x:Row(id=x[1][0][0],source=x[1][0][5],thumbnail=x[1][0][1],title=x[1][0][2],url=x[1][0][3],created_at=x[1][0][4],category=x[1][1]['category'],channel=x[1][0][7]))
    if article_update.count()>0:
        result_article_update = sqlContext.createDataFrame(article_update)
        result_article_update.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

def axesInCache(rdd):
    if rdd.count()>0:
        axes_rdd=rdd.map(lambda x:Row(article=x[0],at=datetime.strptime(parse(x[1]['at']).strftime('%Y-%m-%d %H:%M:%S'), "%Y-%m-%d %H:%M:%S")+timedelta(hours=8),comments=x[1]['comments'],likes=x[1]['likes'],reads=0,shares=0))
        axesresult = sqlContext.createDataFrame(axes_rdd)
        axesresult.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
def statistics(rdd):
    article_channels = articlestat.join(channels).map(lambda x:(x[1][0]['id'],{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][1]['name']}))
    speed_rdd = axes.map(lambda x:(x.article,[[x.at,x.comments,x.likes,x.reads,x.shares]])) \
                .reduceByKey(lambda x,y:x+y) \
                .map(lambda x:(x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],sorted(x[1],key=lambda y:y[0],reverse = True)[1]) if len(x[1])>=2 else (x[0],sorted(x[1],key=lambda y:y[0],reverse = True)[0],[sorted(x[1],key=lambda y:y[0],reverse = True)[0][0]-timedelta(seconds=300),0,0,0,0])) \
                .filter(lambda x:(x[1][0]-x[2][0]).seconds>0) \
                .map(lambda x:(x[0],{'id':x[0],'comments':x[1][1],'likes':x[1][2],'reads':x[1][3],'shares':x[1][4],'speed':5*300*((x[1][1]-x[2][1])/((x[1][0]-x[2][0]).seconds/60.0))})) \
                .filter(lambda x:x[1]['comments']>0)
    statistics = article_channels.join(speed_rdd)  \
                .map(lambda x:{'id':x[1][0]['id'],'thumbnail':x[1][0]['thumbnail'],'title':x[1][0]['title'],'url':x[1][0]['url'],'created_at':x[1][0]['created_at'],'source':x[1][0]['source'],'category':x[1][0]['category'],'author':x[1][0]['author'],'comments':x[1][1]['comments'],'likes':x[1][1]['likes'],'reads':x[1][1]['reads'],'shares':x[1][1]['shares'],'speed':x[1][1]['speed']})
    timeone=datetime.now()-timedelta(hours=1)
    timethree = datetime.now()-timedelta(hours=3)
    timesix = datetime.now()-timedelta(hours=6)
    timetwelve = datetime.now()-timedelta(hours=12)
    timetwentyfour = datetime.now()-timedelta(hours=24)
    result1 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timeone).map(lambda x:Row(timespan='1',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result3 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timethree).map(lambda x:Row(timespan='3',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result6 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timesix).map(lambda x:Row(timespan='6',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result12 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwelve).map(lambda x:Row(timespan='12',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    result24 = statistics.filter(lambda x:x['created_at']+timedelta(hours=8)>=timetwentyfour).map(lambda x:Row(timespan='24',source=source,id=x['id'],title=x['title'],thumbnail=x['thumbnail'],url=x['url'],created_at=x['created_at']+timedelta(hours=8),genre='',reads=0,likes=x['likes'],comments=x['comments'],shares=x['shares'],speed=x['speed'],category=x['category'],author=x['author']))
    if result1.count()>0:
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'1'))
        resultschema1 = sqlContext.createDataFrame(result1)
        resultschema1.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
    if result3.count()>0:   
        session_statis.execute('DELETE FROM tablename WHERE source = %s and timespan= %s', (source,'3'))
        resultschema3 = sqlContext.createDataFrame(result3)
        resultschema3.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

    if result6.count()>0:
        session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'6'))
        resultschema6 = sqlContext.createDataFrame(result6)
        resultschema6.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

    if result12.count()>0:
        session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'12'))
        resultschema12 = sqlContext.createDataFrame(result12)
        resultschema12.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")

    if result24.count()>0:
        session_statis.execute('DELETE FROM statistics WHERE source = %s and timespan= %s', (source,'24'))
        resultschema24 = sqlContext.createDataFrame(result24)
        resultschema24.write.format("org.apache.spark.sql.cassandra").options(table="tablename", keyspace = "keyspace").save(mode ="append")
conf = SparkConf().setAppName(appname)
sc = SparkContext(conf=conf)
ssc = StreamingContext(sc,30)
sqlContext = SQLContext(sc)
channels = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,{'name':x.name}))
articles = sc.cassandraTable("keyspace","tablename").map(lambda x:(x.id,(x.id,x.thumbnail,x.title,x.url,x.created_at+timedelta(hours=8),x.source,x.category,x.channel)))
articlestat = sc.cassandraTable('keyspace','tablename').map(lambda x:(x.channel,{'id':x.id,'thumbnail':x.thumbnail,'title':x.title,'url':x.url,'created_at':x.created_at,'source':x.source,'category':x.category,'channel':x.channel}))
axes = sc.cassandraTable('keyspace','tablename')
topic = 'topic'
kafkaParams = {"metadata.broker.list": "localhost:9092"}
category = 'category_topic'
category_stream = KafkaUtils.createDirectStream(ssc, [category], kafkaParams)
category_join_stream = category_stream.map(lambda x:read_json(x[1])).filter(lambda x:x!=0).map(lambda x:categoryTransform(x)).filter(lambda x:x!=0).map(lambda x:(x['id'],x))
category_join_stream.transform(categoryInCache).pprint()
article_stream = KafkaUtils.createDirectStream(ssc, [topic], kafkaParams)
article_join_stream=article_stream.map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:TransformInData(x)).filter(lambda x: x!=0).flatMap(lambda x:(a for a in x)).map(lambda x:(x['id'].encode("utf-8") ,x))
article_join_stream.transform(articleInCache).pprint()
axes_topic = 'axes_topic'
axes_stream = KafkaUtils.createDirectStream(ssc, [axes_topic], kafkaParams)
axes_join_stream = axes_stream.filter(lambda x:'delete' not in str(x)).map(lambda x:read_json(x[1])).filter(lambda x: x!=0).map(lambda x:axesTransformData(x)).filter(lambda x: x!=0).map(lambda x:(str(x['id']),x))#.map(lambda x:(x[0],{'id':x[0], 'attitudes':x[1]['likes'],'reposts':0,'comments':x[1]['comments'],'speed':x[1]['comments']}))
axes_join_stream.transform(axesInCache).pprint()
stat = article_join_stream.map(lambda x:x['id']).window(5*60,5*60)
stat.transform(statistics).pprint()
ssc.start()    # Start the computation
ssc.awaitTermination()

Thank you very much for your replies.

Where are you reading the data from? From Kafka and from Cassandra? Could you also add that code? I have just updated the question with the code you asked for, thank you. Could you take another look at it? Thanks.