Python 口是心非
我得到一个关键错误:在一个大的文本文件上执行单词包时。这在几年前就已经奏效了,但我把它挖了回来,并用pycharm而不是emacs和Python2.7在Python3.7中重新编写了它。希望运行一个旧的NLP示例,最终与新的技术进行比较Python 口是心非,python,python-3.x,nlp,key,Python,Python 3.x,Nlp,Key,我得到一个关键错误:在一个大的文本文件上执行单词包时。这在几年前就已经奏效了,但我把它挖了回来,并用pycharm而不是emacs和Python2.7在Python3.7中重新编写了它。希望运行一个旧的NLP示例,最终与新的技术进行比较 KeyError: 'learning' line 12, in get_bag_of_words bag_of_words[word] += course_bag_of_words[word] 我怎么找到这个?真的不知道每天大部分时间都在读帖子该怎么
KeyError: 'learning'
line 12, in get_bag_of_words
bag_of_words[word] += course_bag_of_words[word]
我怎么找到这个?真的不知道每天大部分时间都在读帖子该怎么办
如果它是键或条目,我应该执行删除或弹出之类的操作吗
或者是单词功能袋有问题
def get_bag_of_words(titles_lines):
bag_of_words = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
for word in course_bag_of_words:
if word not in course_bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
我的github上的完整代码
全速运行.py
from myfuncs import *
# import myfuncs
# get keywords, inverted index and titles
f = open('s2-titles.txt', encoding = "utf8")
titles_lines = f.readlines()
f.close()
bag_of_words = get_bag_of_words(titles_lines)
keywords = get_keywords(titles_lines, bag_of_words)
inverted_index = get_inverted_index(keywords)
titles = get_titles(titles_lines)
# run search query
query = input('Input your search query: ')
while query != '':
query_terms = query.split()
sorted_results = get_search_results(query_terms,
keywords,
inverted_index)
print('==> search results for query:', query)
for result in sorted_results:
print(result, titles[result])
query = input('Input your search query [hit return to finish]: ')
# get unit vectors
f = open('s2-categories.tsv', encoding = "utf8")
categories_lines = f.readlines()
f.close()
unit_vectors = get_dot_product(keywords, categories_lines)
# run recommendation algorithm
seed_courseid = input('Input your seed courseid: ')
while seed_courseid != '':
sorted_results = get_recommendation_results(seed_courseid,
keywords,
inverted_index,
unit_vectors)
print('==> recommendation results:')
for result in sorted_results:
print(result, titles[result])
print(get_dot_product(seed_courseid, result, unit_vectors))
seed_courseid = input('Input seed courseid [hit return to finish]:')
完全myfuncs.py
#!/usr/bin/env python
# coding: utf-8
def get_bag_of_words(titles_lines):
bag_of_words = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
for word in course_bag_of_words:
if word not in course_bag_of_words:
bag_of_words[word] = course_bag_of_words[word]
else:
bag_of_words[word] += course_bag_of_words[word]
return bag_of_words
def get_course_bag_of_words(line):
course_bag_of_words = {}
# split by weirdcombo to prevent weird splits
courseid, title, description = line.split('XXXYYYZZZ')
title = title.lower()
description = description.lower()
wordlist = title.split() + description.split()
if len(wordlist) >= 10:
for word in wordlist:
if word not in course_bag_of_words:
course_bag_of_words[word] = 1
else:
course_bag_of_words[word] += 1
return courseid, course_bag_of_words
def get_sorted_results(d):
kv_list = d.items()
vk_list = []
for kv in kv_list:
k, v = kv
vk = v, k
vk_list.append(vk)
vk_list.sort()
vk_list.reverse()
k_list = []
for vk in vk_list[:10]:
v, k = vk
k_list.append(k)
return k_list
def get_keywords(titles_lines, bag_of_words):
n = sum(bag_of_words.values())
keywords = {}
for line in titles_lines[1:]:
courseid, course_bag_of_words = get_course_bag_of_words(line)
term_importance = {}
for word in course_bag_of_words:
tf_course = (float(course_bag_of_words[word]) /
sum(course_bag_of_words.values())
)
tf_overall = float(bag_of_words[word]) / n
term_importance[word] = tf_course / tf_overall
keywords[courseid] = get_sorted_results(term_importance)
return keywords
def get_inverted_index(keywords):
inverted_index = {}
for courseid in keywords:
for keyword in keywords[courseid]:
if keyword not in inverted_index:
inverted_index[keyword] = []
inverted_index[keyword].append(courseid)
return inverted_index
def get_search_results(query_terms, keywords, inverted_index):
search_results = {}
for term in query_terms:
if term in inverted_index:
for courseid in inverted_index[term]:
if courseid not in search_results:
search_results[courseid] = 0.0
search_results[courseid] += (
1 / float(keywords[courseid].index(term) + 1) *
1 / float(query_terms(term) + 1)
)
sorted_results = get_sorted_results(search_results)
return sorted_results
def get_titles(titles_lines):
titles = {}
for line in titles_lines[1:]:
courseid, title, description = line.split('XXXYYYZZZ')
titles[courseid] = title[:60] # take first 60 characters
return titles
def get_unit_vectors(keywords, categories_lines):
norm = 1.884
cat = {}
subcat = {}
for line in categories_lines[1:]:
courseid, category, subcategory = line.split('\t')
cat[courseid] = category.strip()
subcat[courseid] = subcategory.strip()
unit_vectors = {}
for courseid in keywords:
u = {}
if courseid in cat:
u[cat[courseid]] = 1 / norm
u[subcat[courseid]] = 1 / norm
for keyword in keywords[courseid]:
u[keyword] = (
1 /
float(keywords[courseid].index(keyword) + 1) /
norm
)
unit_vectors[courseid] = u
return unit_vectors
def get_dot_product(courseid1, courseid2, unit_vectors):
u1 = unit_vectors[courseid1]
u2 = unit_vectors[courseid2]
dot_product = 0.0
for dimension in u1:
if dimension in u2:
dot_product += u1[dimension] * u2[dimension]
return dot_product
def get_recommendation_results(seed_courseid,
keywords,
inverted_index,
unit_vectors):
courseids = []
for keyword in keywords[seed_courseid]:
for courseid in inverted_index[keyword]:
if courseid not in courseids and courseid != seed_courseid:
courseids.append(courseid)
dot_products = {}
for courseid in courseids:
dot_products[courseids] = get_dot_product(seed_courseid,
courseid,
unit_vectors)
sorted_results = get_sorted_results(dot_products)
return sorted_results
我想可能有一个小错误:
def get_bag_of_单词(标题行):
一袋字={}
对于标题中的行[1:]:
courseid,courseu-bag-of-of-words=获取courseu-bag-of-of-words(行)
对于课程中的单词\u包\u单词:
#应该在一袋单词中登记
如果单词不在单词包中:
单词中的单词[单词]=当然单词中的单词[单词]
其他:
单词袋[单词]+=当然单词袋[单词]
返回一袋单词
这应该是导致键错误的原因
没有检查您的其他功能。谢谢,修复了它。还有一些错误get_dot_product(),但至少它运行并解决了KeyError。再次感谢。