Google Colaboratory 代码占用了大量 Colab 磁盘空间,我不知道为什么

Google Colaboratory 代码占用了大量 Colab 磁盘空间,我不知道为什么。标签:google-colaboratory、disk、diskspace。我正在 Colab 上做 n-gram 建模。代码中没有任何内容会把数据保存到 Colab 的磁盘上,但运行代码时,它已经占用了超过 10GB 的磁盘空间。我完全不明白为什么会使用额外的磁盘空间——它本应只使用内存。完整代码见下文。

我正在 Colab 上做 n-gram 建模。代码中没有任何内容会把数据保存到 Colab 的磁盘上,但运行代码时,它已经占用了超过 10GB 的磁盘空间。我完全不明白为什么会使用额外的磁盘空间——它本应只使用内存。

这是代码。我真的很蠢,错过了一些显而易见的事情吗?发生了什么事

# Mount Google Drive into the Colab VM and define the corpus paths.
# NOTE(review): files read through the /content/drive mount are streamed and
# cached on the Colab VM's local disk by the Drive FUSE client — this is the
# likely source of the "mystery" disk usage; confirm with `du -sh /root/.config`
# and the Drive cache directories.
drive.mount('/content/drive')
i_path = '/content/drive/My Drive/corpora/reddit_comments/preprocessed'
r_path = '/content/drive/My Drive/corpora/reddit_comments/results'

def n_grams(sentence, n):
    """Return all n-grams of *sentence* as lists of tokens.

    Tokens are produced by whitespace split.  A sentence with fewer than
    *n* tokens yields an empty list.
    """
    tokens = sentence.split()
    # Every start index i that leaves room for a full n-gram.
    # (The original also had a dead `n_grams = []` that shadowed the
    # function name; removed.)
    return [tokens[i:i + n] for i in range(len(tokens)) if i + n <= len(tokens)]


    # For some date, return Pr(sequence)
def seq_prob(sequence, n, date, i_path):
    """Return the n-gram probability of *sequence* for one *date*.

    Counts are gathered from every pickle file in *i_path* whose name
    contains *date*; each file must hold a list of sentence strings.
    The result is the product of conditional MLE estimates
    count(n-gram) / count(context) with no smoothing, so any unseen
    context makes the result 0.
    """

    def _token_ngrams(tokens, k):
        # All k-grams of a token list, as tuples so they can key a dict.
        return [tuple(tokens[i:i + k])
                for i in range(len(tokens)) if i + k <= len(tokens)]

    pattern = '[^A-Za-z0-9 ]+'

    # Clamp n so the sequence itself produces at least one n-gram.
    if n > len(sequence.split()):
        n = len(sequence.split())

    sequence = sequence.lower()
    seq_ngrams = _token_ngrams(sequence.split(), n)

    # count1: occurrences of each full n-gram in the corpus;
    # count2: occurrences of its (n-1)-token context.
    count1 = {g: 0 for g in seq_ngrams}
    count2 = {g[:-1]: 0 for g in seq_ngrams}

    for fname in os.listdir(i_path):
        if date not in fname:
            continue
        with open(i_path + '/' + fname, 'rb') as f:
            sentences = pickle.load(f)
        sentences = [re.sub(pattern, '', s).lower() for s in sentences]

        for sentence in sentences:
            pad = n - 1
            tokens = (pad * ' <s> ' + sentence + pad * ' </s> ').split()

            # BUGFIX: the original counted contexts by raw substring
            # search ('cat' also matched inside 'catalog') and scanned
            # sent_ngrams twice per n-gram (`in` + `.count`).  Count
            # token-level n-grams in a single pass instead.
            for g in _token_ngrams(tokens, n):
                if g in count1:
                    count1[g] += 1
            for g in _token_ngrams(tokens, n - 1):
                if g in count2:
                    count2[g] += 1

    print('\n')
    print(count1)
    print(count2)
    print('\n')

    if 0 in count2.values():
        # An unseen context makes the MLE product undefined; report 0.
        return 0

    c_probs = [count1[g] / count2[g[:-1]] for g in seq_ngrams]
    return reduce(lambda x, y: x * y, c_probs)

# Collect the distinct dates encoded as the filename prefix before the
# first underscore, then score the target sequence for each date.
date_set = set()
for fname in os.listdir(i_path):
    date_set.add(fname.split('_')[0])
dates = sorted(date_set)

seq_prob_d = {}
sequence = 'austerity measures'

for date in dates:
    print('Analysing ' + date + '...')
    seq_prob_d[date] = seq_prob(sequence, 2, date, i_path)


print(seq_prob_d)
drive.mount('/content/drive')
i_path = '/content/drive/My Drive/corpora/reddit_comments/preprocessed'
r_path = '/content/drive/My Drive/corpora/reddit_comments/results'

def n_grams(sentence, n):

    n_grams = []
    sentence = sentence.split()
    n_grams = [sentence[i:i+n] for i, _ in enumerate(sentence) if i <= len(sentence) - n]

    return n_grams


    #for some date, return Pr(sequence)
def seq_prob(sequence, n, date, i_path):

    pattern = '[^A-Za-z0-9 ]+'

    if n > len(sequence.split()):
        n = len(sequence.split())

    count1 = {}
    count2 = {}

    sequence = sequence.lower()
    seq_ngrams = n_grams(sequence, n)

    for n_gram in seq_ngrams:

        count1[str(n_gram)] = 0
        count2[str(n_gram[:-1])] = 0

    for file in os.listdir(i_path):
        if date in file:

            with open(i_path + '/' + file, 'rb') as f:

                sentences = pickle.load(f)
                sentences = [re.sub(pattern, '', sentence).lower() for sentence in sentences]

                for sentence in sentences:

                    multiplier = n - 1
                    sentence = multiplier * ' <s> ' + sentence + multiplier * ' </s> '
                    sent_ngrams = n_grams(sentence, n)

                    for n_gram in seq_ngrams:

                        if n_gram in sent_ngrams:

                            k = sent_ngrams.count(n_gram)
                            count1[str(n_gram)] += k

                        if ' '.join(n_gram[:-1]) in sentence:

                            k = sentence.count(' '.join(n_gram[:-1]))
                            count2[str(n_gram[:-1])] += k

    print('\n')
    print(count1)
    print(count2)
    print('\n')

    if 0 in count2.values():

        seq_prob = 0

    else:

        c_probs = [count1[str(n_gram)] / count2[str(n_gram[:-1])] for n_gram in seq_ngrams]
        seq_prob = reduce(lambda x, y: x*y, c_probs)

    return seq_prob

dates = sorted(list(set([file.split('_')[0] for file in os.listdir(i_path)])))
seq_prob_d = {}
sequence = 'austerity measures'

for date in dates:

    print('Analysing ' + date + '...')
    seq_prob_d[date] = seq_prob(sequence, 2, date, i_path)


print(seq_prob_d)