
时间:2014-02-10 15:09:44

标签: python nltk


import collections
import os.path
import glob
import nltk

# Empty set created at module level; nothing in the visible part of this
# script reads or writes it (presumably meant to collect the corpus
# vocabulary -- TODO confirm).
wdict = set()

# Glob pattern for the corpus directory: matches every entry whose name
# contains a dot.
path = "C://Python27//Corpus Files//*.*"

#this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    """Tokenize *doc* and return the stemmed content words as a list.

    The text is split with NLTK's ``WordPunctTokenizer``. A token is kept
    only when its lowercase form is not an English stopword, it is longer
    than three characters, and it is purely alphabetic. Each kept token is
    lowercased and reduced with the Porter stemmer.
    """
    english_stopwords = set(nltk.corpus.stopwords.words('english'))
    porter = nltk.PorterStemmer()
    stems = []
    for raw in nltk.WordPunctTokenizer().tokenize(doc):
        lowered = raw.lower()
        if lowered in english_stopwords:
            continue
        # Length and alphabetic checks are applied to the raw token.
        if len(raw) <= 3 or not raw.isalpha():
            continue
        stems.append(porter.stem(lowered))
    return stems

# Read every corpus file matched by `path` and reduce it to its cleaned,
# stemmed word list.
for text in glob.glob(path):
    # The context manager closes the file as soon as its contents are read,
    # so handles are not leaked while iterating over a large corpus.
    with open(text) as f:
        data = f.read()
    words = cleanDoc(data)

1 个答案:

答案 0 :(得分:0)

您可以使用nltk.probability中的FreqDist对象来统计这些词的出现次数。之后,您可以通过类似dict的键值接口和方法(例如freq.items()或freq['word'])来访问其中的数据,甚至可以使用matplotlib绘制结果。

import collections
import os.path
import glob
import nltk
from nltk.probability import FreqDist

# Filled by the loop below: keyed by corpus file path, each value is meant
# to be a dict mapping a term to its frequency in that file.
term_frequency = {}

# Glob pattern for the corpus directory: matches every entry whose name
# contains a dot.
path = "C://Python27//Corpus Files//*.*"

#this function cleans up a doc (removes stopwords etc)
def cleanDoc(doc):
    """Return the Porter stems of the meaningful words found in *doc*.

    *doc* is tokenized with NLTK's ``WordPunctTokenizer``. Tokens that are
    English stopwords (compared in lowercase), have three or fewer
    characters, or contain non-alphabetic characters are dropped. The
    survivors are lowercased and stemmed, and returned as a list in
    document order.
    """
    stop_words = set(nltk.corpus.stopwords.words('english'))
    stem = nltk.PorterStemmer().stem

    def is_content_word(token):
        # Same three conditions, in the same order, as the filtering rule
        # described in the docstring.
        return (
            token.lower() not in stop_words
            and len(token) > 3
            and token.isalpha()
        )

    candidates = filter(is_content_word, nltk.WordPunctTokenizer().tokenize(doc))
    return [stem(token.lower()) for token in candidates]

# Build the relative term frequencies of every corpus file matched by `path`.
#
# term_frequency is a dict whose structure is like:
# {
#     'path_to_file':
#         {'term': 0.134, 'another_term': 0.15},
#     'another_file':
#         {'term2': 0.12, 'foo': 0.15}
# }
for text in glob.glob(path):
    # Close each file as soon as it has been read instead of leaking the
    # handle for the rest of the run.
    with open(text) as f:
        data = f.read()
    words = cleanDoc(data)
    numbers_of_words = len(words)
    # Count the cleaned words of this file (the original referenced an
    # undefined name here).
    freq = FreqDist(words)
    # Start a fresh per-file dict up front; indexing term_frequency[text]
    # before assigning it would raise KeyError.
    file_frequencies = {}
    for term in freq.keys():
        # float() forces true division under Python 2 as well, where int / int
        # would truncate every frequency to 0. When `words` is empty, `freq`
        # has no keys, so this division is never reached with a zero divisor.
        file_frequencies[term] = float(freq[term]) / numbers_of_words
    term_frequency[text] = file_frequencies
