Python: stemming and lemmatizing words with frequencies (filtering and analysis)


OK. I'm trying to add a word tag, but I'm not sure this is right. (Sorry, I'm a beginner.)


I want to build a DB table holding the frequency of the stemmed/lemmatized words together with their tagged POS tags (verb, noun, ADV, and so on), as shown below.

How do I fix the error? On the MySQL database the layout would be # | word | POS tag | frequency. I'm also looking for a way to drop words that aren't in the dictionary (artistessex, asifyou), since at the moment I only filter words by len.
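
For reference, a table matching that # | word | POS tag | frequency layout could be created like this. This is only a sketch: the name Table1 and the columns keyword, pos, freq mirror the INSERT further down, while the column types and the connection details are placeholders, not part of the original:

import MySQLdb as mdb

# Placeholder connection details; substitute your own.
db = mdb.connect(host='localhost', user='user', passwd='secret', db='words')
cursor = db.cursor()

# Column types are assumptions; the original only names the columns.
cursor.execute("""
    CREATE TABLE IF NOT EXISTS Table1 (
        id      INT AUTO_INCREMENT PRIMARY KEY,   -- the "#" column
        keyword VARCHAR(64) NOT NULL,             -- stemmed/lemmatized word
        pos     VARCHAR(8),                       -- Penn Treebank POS tag
        freq    INT                               -- frequency count
    )
""")
db.commit()
db.close()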

##
import re
import requests
import MySQLdb as mdb
from xml.etree import ElementTree
from collections import Counter
from nltk import pos_tag
from nltk.corpus import wordnet as wn
from nltk.stem import PorterStemmer, WordNetLemmatizer
##

def is_noun(tag):
    return tag in ['NN', 'NNS', 'NNP', 'NNPS']


def is_verb(tag):
    return tag in ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']


def is_adverb(tag):
    return tag in ['RB', 'RBR', 'RBS']


def is_adjective(tag):
    return tag in ['JJ', 'JJR', 'JJS']


def penn_to_wn(tag):
    """Map a Penn Treebank tag to the matching WordNet POS constant."""
    if is_adjective(tag):
        return wn.ADJ
    elif is_noun(tag):
        return wn.NOUN
    elif is_adverb(tag):
        return wn.ADV
    elif is_verb(tag):
        return wn.VERB
    return None


stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()


## XML PARSING
def process(url, xpath):
    """
    Downloads a feed url and extracts the results with a variable path
    :param url: string
    :param xpath: string
    :return: list
    """
    contents = requests.get(url)
    root = ElementTree.fromstring(contents.content)
    # Keep the text as str: encoding to bytes here would make the
    # ",".join() below produce "b'...'" junk under Python 3.
    return [element.text if element.text is not None else ''
            for element in root.findall(xpath)]


def main(n=10):
    # A list of feeds to process and their xpath
    feeds = [
        {'url': 'http://www.nyartbeat.com/list/event_type_print_painting.en.xml', 'xpath': './/Description'},
        {'url': 'http://feeds.feedburner.com/FriezeMagazineUniversal?format=xml', 'xpath': './/description'}
    ]

    # A place to hold all feed results
    results = []

    # Loop all the feeds and append their results together
    for feed in feeds:
        results = results + process(feed['url'], feed['xpath'])

    # Join all results into a big string
    contents = ",".join(results)

    # Collapse runs of whitespace into single spaces
    contents = re.sub(r'\s+', ' ', contents)

    # Remove everything that is not a letter or a space
    contents = re.sub('[^A-Za-z ]+', '', contents)

    # Create a list of lower-case words that are at least 8 characters
    words = [w.lower() for w in contents.split() if len(w) >= 8]

    # Count the raw words
    word_count = Counter(words)

    # POS-tag, stem and lemmatise. stem() and lemmatize() take a single
    # word, not a list, so each distinct word is processed on its own.
    # (Tagging words in isolation is crude; tagging whole sentences with
    # pos_tag(word_tokenize(sentence)) would be more accurate.)
    word_refine = Counter()
    pos_of = {}
    for word, count in word_count.items():
        tag = pos_tag([word])[0][1]
        wn_tag = penn_to_wn(tag)
        stem = stemmer.stem(word)
        lemma = (lemmatiser.lemmatize(stem, wn_tag) if wn_tag
                 else lemmatiser.lemmatize(stem))
        word_refine[lemma] += count
        pos_of[lemma] = tag  # keep the last tag seen for this lemma

    # Clean the content a little
    filter_words = ['artists']
    for word in filter_words:
        if word in word_refine:
            del word_refine[word]

    # And the survey says...
    print("The Top {0} words".format(n))
    for word, count in word_refine.most_common(n):
        print("{0} ({1}): {2}".format(word, pos_of[word], count))

    # Add to DB (connection details are placeholders; use your own)
    db = mdb.connect(host='localhost', user='user', passwd='secret', db='words')
    cursor = db.cursor()
    sql = """INSERT INTO Table1 (keyword, pos, freq) VALUES (%s, %s, %s)"""
    for word, count in word_refine.most_common(n):
        cursor.execute(sql, (word, pos_of[word], count))
    db.commit()
    db.close()


if __name__ == "__main__":
    main()
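
On the second question, dropping strings that aren't real dictionary words (artistessex, asifyou): the len() filter only checks length, so one common approach is to keep a token only if WordNet has at least one synset for it. A minimal sketch, with the caveat that it also discards legitimate words WordNet happens not to know:

from nltk.corpus import wordnet as wn

def in_dictionary(word):
    # wordnet.synsets() returns an empty list for strings it does not
    # recognise, e.g. run-together junk like "asifyou".
    return len(wn.synsets(word)) > 0

# Applied after the length filter in main():
# words = [w for w in words if in_dictionary(w)]
print(in_dictionary('painting'))   # True
print(in_dictionary('asifyou'))    # False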

Can you post a screenshot of what you mean in that link? I clicked it, but it's not clear what you're referring to.

I'm looking at TextBlob or NLTK... not sure which one serves my goal better.

Not sure what you're asking. You say "How do I fix the error?", but what exactly is the error? Is there any error message?

I've been looking for a way to add further word analysis to store in my database, on top of the code you helped with. Beyond the current code I don't know how to get more analysis into the DB; the logic could go inside the for word, count in word_count.most_common(n): loop.

I'd suggest a relational model, so you create two new tables, tag and tag_word. Use a third-party API to fetch the tags for a given word; say it returns ['ADJ', 'NOUN', 'VERB'] for the word House, then you insert three records into tag_word, each pairing the id of the matching tag record with the id of the word record.
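
A minimal sketch of that relational layout, assuming MySQLdb. The comment above only names the tables tag and tag_word; the word table, all column definitions, and the connection details are assumptions:

import MySQLdb as mdb

# Placeholder connection details; substitute your own.
db = mdb.connect(host='localhost', user='user', passwd='secret', db='words')
cursor = db.cursor()

cursor.execute("""CREATE TABLE IF NOT EXISTS word (
                      id INT AUTO_INCREMENT PRIMARY KEY,
                      keyword VARCHAR(64) UNIQUE,
                      freq INT)""")
cursor.execute("""CREATE TABLE IF NOT EXISTS tag (
                      id INT AUTO_INCREMENT PRIMARY KEY,
                      name VARCHAR(16) UNIQUE)""")
# Junction table: one row per (word, tag) pair, holding the two ids.
cursor.execute("""CREATE TABLE IF NOT EXISTS tag_word (
                      word_id INT,
                      tag_id  INT)""")

def store(word, freq, tags):
    """Insert one word plus one tag_word row per tag it carries."""
    cursor.execute("INSERT INTO word (keyword, freq) VALUES (%s, %s)",
                   (word, freq))
    word_id = cursor.lastrowid
    for name in tags:
        cursor.execute("INSERT IGNORE INTO tag (name) VALUES (%s)", (name,))
        cursor.execute("SELECT id FROM tag WHERE name = %s", (name,))
        tag_id = cursor.fetchone()[0]
        cursor.execute("INSERT INTO tag_word (word_id, tag_id) VALUES (%s, %s)",
                       (word_id, tag_id))
    db.commit()

# The example from the comment: three tags for "House" mean three
# tag_word records. The frequency value here is arbitrary.
store('House', 1, ['ADJ', 'NOUN', 'VERB'])

Normalising the tags this way means each POS label is stored once, and a word can carry any number of tags without changing the schema.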