Python NLP document clustering

Tags: python, nlp, cluster-analysis, tf-idf

I am using the HDBSCAN algorithm to create clusters from the documents I have. To build the vector matrix from the words I currently use tf-idf, and I would like to switch to GloVe or Word2vec (since tf-idf is based on a bag-of-words model, it cannot capture semantics).

Which method should I use - GloVe, Word2vec, or something else better suited to text clustering? And how would I implement it?

Any help would be appreciated.
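One approach I am considering is to average per-word embeddings into a single dense vector per document and cluster those instead of the sparse tf-idf matrix. Below is a minimal sketch of that idea with gensim's Word2Vec (the helper name doc_vector and the training parameters are placeholders, not verified code; synopses refers to the list built by my preprocessing code further down):

import numpy as np
import hdbscan
from gensim.models import Word2Vec

def doc_vector(tokens, model):
    # average in-vocabulary token vectors; fall back to zeros when nothing matches
    vecs = [model.wv[t] for t in tokens if t in model.wv]
    return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)

tokenized_docs = [doc.lower().split() for doc in synopses]
w2v_model = Word2Vec(sentences=tokenized_docs, vector_size=100,
                     window=5, min_count=2, workers=4)
doc_vectors = np.vstack([doc_vector(toks, w2v_model) for toks in tokenized_docs])

clusterer = hdbscan.HDBSCAN(min_cluster_size=5, metric='euclidean')
labels = clusterer.fit_predict(doc_vectors)

My current tf-idf based code is below: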

import csv
import re
import string
import sys

import nltk
import pandas as pd
import hdbscan
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

nltk.download('stopwords')
nltk.download('punkt')  # tokenizer models used by sent_tokenize/word_tokenize below

title = []
synopses = []
filename = "twitter-test-dataset.csv"
num_clusters = 10
pkl_file = "doc_cluster.pkl"
generate_pkl = False

# pre-process data
with open(filename, 'r') as csvfile:
    # creating a csv reader object
    csvreader = csv.reader(csvfile)

    # extracting field names through first row
    fields = next(csvreader)

    # extracting each data row one by one
    duplicates = 0
    for row in csvreader:
        # removes the characters specified
        line = re.sub(r'[.,"!]+', '', row[2], flags=re.MULTILINE)
        line = re.sub(r'^RT[\s]+', '', line, flags=re.MULTILINE)  # removes RT
        line = re.sub(r'https?:\/\/.*[\r\n]*', '',
                    line, flags=re.MULTILINE)  # remove link
        line = re.sub(r'[:]+', '', line, flags=re.MULTILINE)
        line = re.sub(
            r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)", " ", line, flags=re.MULTILINE)
        line = ''.join(ch for ch in line
                       if ch in string.printable)  # keep printable ASCII characters only
        if line not in synopses:
            synopses.append(line)
            title.append(row[2])
        else:
            duplicates += 1

print("Removed " + str(duplicates) + " rows")


stopwords = nltk.corpus.stopwords.words('english')
stemmer = SnowballStemmer("english")


def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(
        text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems


def tokenize_only(text):
    # first tokenize by sentence, then by word to ensure that punctuation is caught as its own token
    tokens = [word.lower() for sent in nltk.sent_tokenize(text)
            for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    return filtered_tokens


totalvocab_stemmed = []
totalvocab_tokenized = []

for i in synopses:
    # for each item in 'synopses', tokenize/stem
    allwords_stemmed = tokenize_and_stem(i)
    # extend the 'totalvocab_stemmed' list
    totalvocab_stemmed.extend(allwords_stemmed)

    allwords_tokenized = tokenize_only(i)
    totalvocab_tokenized.extend(allwords_tokenized)

vocab_frame = pd.DataFrame(
    {'words': totalvocab_tokenized}, index=totalvocab_stemmed)

# print "there are " + str(vocab_frame.shape[0]) + " items in vocab_frame"


# define vectorizer parameters
tfidf_vectorizer = TfidfVectorizer(max_df=0.8, max_features=200000,
                                min_df=0.0, stop_words='english',
                                use_idf=True, tokenizer=tokenize_and_stem, ngram_range=(1, 3))

#CREATE TFIDF MATRIX
tfidf_matrix = tfidf_vectorizer.fit_transform(synopses)
terms = tfidf_vectorizer.get_feature_names_out()  # get_feature_names() was removed in recent scikit-learn


c = hdbscan.HDBSCAN(min_cluster_size=5)
#PASS TFIDF_MATRIX TO HDBSCAN
c.fit(tfidf_matrix)
print(c.labels_)  # one label per document; HDBSCAN assigns -1 to noise points
sys.exit()
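Since the tweets form a small corpus, pretrained Twitter GloVe vectors might work better than training Word2vec from scratch. A minimal sketch along the same lines, using gensim's downloader (the model name "glove-twitter-25" and the helper embed are assumptions for illustration; it reuses tokenize_only and synopses from the code above, and the dense doc_vectors would replace tfidf_matrix as the input to HDBSCAN):

import numpy as np
import hdbscan
import gensim.downloader as api

glove = api.load("glove-twitter-25")  # pretrained 25-dimensional Twitter GloVe vectors

def embed(tokens):
    # average the pretrained vectors of known tokens; zeros when none are in vocabulary
    vecs = [glove[t] for t in tokens if t in glove]
    return np.mean(vecs, axis=0) if vecs else np.zeros(glove.vector_size)

doc_vectors = np.vstack([embed(tokenize_only(doc)) for doc in synopses])

clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
labels = clusterer.fit_predict(doc_vectors)
print(labels)

Both sketches keep HDBSCAN unchanged and only swap the document representation.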