
时间:2014-12-10 16:22:05

标签: python machine-learning nlp


编辑:我还找到了一篇介绍如何上手使用它的教程。

可以通过 pip install multi-rake 进行安装。

from multi_rake import Rake

# Sample abstract used as the input document for keyword extraction.
# Adjacent string literals are concatenated, so every piece except the last
# ends with a space to keep the words separated.
text_en = (
    'Compatibility of systems of linear constraints over the set of '
    'natural numbers. Criteria of compatibility of a system of linear '
    'Diophantine equations, strict inequations, and nonstrict inequations '
    'are considered. Upper bounds for components of a minimal set of '
    'solutions and algorithms of construction of minimal generating sets '
    'of solutions for all types of systems are given. These criteria and '
    'the corresponding algorithms for constructing a minimal supporting '
    'set of solutions can be used in solving all the considered types of '
    'systems and systems of mixed types.'
)

# Extractor created with its default settings.
rake = Rake()

# Result of running the extractor on the abstract; the commented listing
# that follows this snippet shows it as (phrase, score) pairs.
keywords = rake.apply(text_en)


#  ('minimal generating sets', 8.666666666666666),
#  ('linear diophantine equations', 8.5),
#  ('minimal supporting set', 7.666666666666666),
#  ('minimal set', 4.666666666666666),
#  ('linear constraints', 4.5),
#  ('natural numbers', 4.0),
#  ('strict inequations', 4.0),
#  ('nonstrict inequations', 4.0),
#  ('upper bounds', 4.0),
#  ('mixed types', 3.666666666666667)

## loading some dependencies
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer

## our dataset
from sklearn.datasets import fetch_20newsgroups
## shuffled training split of 20 Newsgroups, restricted to a single category
newsgroups_train = fetch_20newsgroups(
    subset="train",
    shuffle=True,
    categories=["alt.atheism"],
)

## English Snowball stemmer shared by the preprocessing helpers
stemmer = SnowballStemmer("english")

## maps a stemmed token back to an original token it came from;
## this dictionary will come in handy later on when printing results
stemmed_to_original = dict()

## Basic Preprocessings Functions ##
def lemmatize_stemming(text):
    """Lemmatize *text* as a verb, then stem the lemma.

    The lemma comes from a fresh ``WordNetLemmatizer`` (``pos='v'``) and is
    then passed through the module-level ``stemmer``.
    """
    verb_lemma = WordNetLemmatizer().lemmatize(text, pos='v')
    return stemmer.stem(verb_lemma)

def preprocess(text):
    """Tokenize *text* and return its filtered, stemmed tokens.

    Tokens are produced by ``gensim.utils.simple_preprocess``. A token is
    dropped when it is in gensim's ``STOPWORDS`` or has three characters or
    fewer; every remaining token is passed through ``lemmatize_stemming``.

    Side effect: ``stemmed_to_original`` is updated so each stem maps to the
    most recently seen original token that produced it.

    Returns:
        list: the stemmed tokens, in the order they occur in *text*.
    """
    # The original returned `result` without ever creating or filling it.
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token in gensim.parsing.preprocessing.STOPWORDS or len(token) <= 3:
            continue
        stemmed_token = lemmatize_stemming(token)
        stemmed_to_original[stemmed_token] = token
        result.append(stemmed_token)
    return result

## run every document of the corpus through preprocess()
news_data = [preprocess(doc) for doc in newsgroups_train.data]

## notice, min_df and max_df parameters are really important in getting the
## most important keywords out of your corpus.
## The documents are passed through an identity tokenizer with lowercasing
## disabled, so the vectorizer uses whatever preprocess() returned as-is.
vectorizer = TfidfVectorizer(
    # Recent scikit-learn releases validate stop_words as 'english', a list
    # or None, so the gensim stopword collection is converted to a list.
    stop_words=list(gensim.parsing.preprocessing.STOPWORDS),
    min_df=20,
    max_df=0.72,
    tokenizer=lambda x: x,
    lowercase=False,
)
## only the fitted vocabulary / idf weights are needed, not the tf-idf matrix
vectorizer.fit(news_data)

## get idf values of all the tokens used by the vectorizer and sort them in
## ascending order. Once stopwords and very frequent / very rare words have
## been filtered out by the vectorizer parameters above, this ordering is
## used here as the keyword ranking.

## get_feature_names() was removed in scikit-learn 1.2; prefer the
## replacement and fall back only on releases that predate it
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

## list of (token, idf weight) pairs sorted by ascending idf weight
word_to_idf = sorted(zip(feature_names, vectorizer.idf_), key=lambda pair: pair[1])


## print stem ---> an original token ----> idf weight for the first five pairs
for k, v in word_to_idf[:5]:
    print('{} ---> {} ----> {}'.format(k, stemmed_to_original[k], v))



post ---> posting ----> 1.4392949726265691
articl ---> article ----> 1.4754236967150747
host ---> host ----> 1.7035965964342865
nntp ---> nntp ----> 1.7248288165400607
think ---> think ----> 1.8287597393882924
peopl ---> people ----> 1.887600239411226
know ---> know ----> 1.994083719813676
univers ---> universe ----> 1.994083719813676
atheist ---> atheists ----> 2.011081296182247
like ---> like ----> 2.016811970891232
thing ---> things ----> 2.094462905121298
time ---> time ----> 2.199133527685187
mean ---> means ----> 2.2271073797275927
believ ---> believe ----> 2.2705924916673315

from gensim.summarization import keywords

# NOTE: the gensim.summarization module was removed in gensim 4.0, so the
# `keywords` function used here requires a gensim 3.x installation.

# Sample abstract used as the input document. Adjacent string literals are
# concatenated, so every piece except the last ends with a space (the first
# piece previously lacked it, fusing two words into "ofnatural").
text_en = (
    'Compatibility of systems of linear constraints over the set of '
    'natural numbers. Criteria of compatibility of a system of linear '
    'Diophantine equations, strict inequations, and nonstrict inequations '
    'are considered. Upper bounds for components of a minimal set of '
    'solutions and algorithms of construction of minimal generating sets '
    'of solutions for all types of systems are given. These criteria and '
    'the corresponding algorithms for constructing a minimal supporting '
    'set of solutions can be used in solving all the considered types of '
    'systems and systems of mixed types.'
)

# Request 10 keywords together with their scores, with lemmatization enabled.
print(keywords(text_en, words=10, scores=True, lemmatize=True))


[('numbers', 0.31009020729627595),
('types', 0.2612797117033426),
('upper', 0.26127971170334247),
('considered', 0.2539581373644024),
('minimal', 0.25089449987505835),
('sets', 0.2508944998750583),
('inequations', 0.25051980840329924),
('linear', 0.2505198084032991),
('strict', 0.23778663563992564),
('diophantine', 0.23778663563992555)]

from sklearn.feature_extraction.text import TfidfVectorizer
# Tiny four-document corpus used to demonstrate the vectorizer.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Vectorizer with default settings, fitted on the corpus; X is the matrix
# returned by fit_transform.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# NOTE(review): the word list shown after this snippet looks like the fitted
# vocabulary; it should be obtainable via vectorizer.get_feature_names_out()
# on scikit-learn >= 1.0 — confirm.




['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

在上面的输出中,之所以出现“is”和“the”这样的停用词,是因为语料库很小。使用大型语料库时,您可以按优先级顺序获得最重要的关键字。更多说明请参阅 TfidfVectorizer 的文档。