基于分类的分块 - NLTK Cookbook - Evaluate()不起作用

时间:2014-08-06 16:35:50

标签: python nltk



NotImplementedError                       Traceback (most recent call last)
<ipython-input-64-201b22386c9f> in <module>()
      1 chunker = ClassifierChunker(train_chunks)
----> 2 score = chunker.evaluate(test_chunks)
      3 score.accuracy()

//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in evaluate(self, gold)
     47         chunkscore = ChunkScore()
     48         for correct in gold:
---> 49             chunkscore.score(correct, self.parse(correct.leaves()))
     50         return chunkscore

//anaconda/lib/python2.7/site-packages/nltk/chunk/api.pyc in parse(self, tokens)
     32         :rtype: Tree
     33         """
---> 34         raise NotImplementedError()
     36     def evaluate(self, gold):


#from chunkers import TagChunker
from nltk.corpus import treebank_chunk
# Split the treebank_chunk corpus: the first 3000 chunked sentences are used
# for training, everything after them for evaluation.
train_chunks = treebank_chunk.chunked_sents()[:3000]
test_chunks = treebank_chunk.chunked_sents()[3000:]

import nltk.chunk
from nltk.tag import ClassifierBasedTagger

def chunk_trees2train_chunks(chunk_sents):
    """Turn chunk trees into training data for a chunk tagger.

    Every tree is flattened with ``nltk.chunk.tree2conlltags`` into
    (word, pos, iob) triples, and each triple is regrouped into a
    ``((word, pos), iob)`` pair.

    :param chunk_sents: iterable of chunk trees.
    :return: a list holding, per tree, a list of ``((word, pos), iob)`` pairs.
    """
    train_chunks = []
    for tree in chunk_sents:
        triples = nltk.chunk.tree2conlltags(tree)
        train_chunks.append([((word, pos), iob) for (word, pos, iob) in triples])
    return train_chunks

def prev_next_pos_iob(tokens, index, history):
    """Build the feature dict for the token at ``index``.

    :param tokens: sequence of (word, pos) pairs for the whole sentence.
    :param index: position of the token to describe.
    :param history: IOB tags already assigned to the tokens before ``index``.
    :return: dict with the current word/pos, the previous word/pos/iob and
        the next word/pos. Sentence boundaries are marked with the
        ``'<START>'`` and ``'<END>'`` placeholders.
    """
    word, pos = tokens[index]

    if index == 0:
        # No left neighbour: use the start-of-sentence placeholder.
        prevword, prevpos, previob = ('<START>',)*3
    else:
        prevword, prevpos = tokens[index-1]
        previob = history[index-1]

    if index == len(tokens) - 1:
        # No right neighbour: use the end-of-sentence placeholder.
        nextword, nextpos = ('<END>',)*2
    else:
        nextword, nextpos = tokens[index+1]

    feats = {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob
    }

    return feats

class ClassifierChunker(nltk.chunk.ChunkParserI):
    """Chunk parser that assigns IOB chunk tags with a ClassifierBasedTagger."""

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
                 **kwargs):
        """Train the underlying tagger.

        :param train_sents: chunk trees; converted to training data with
            ``chunk_trees2train_chunks``.
        :param feature_detector: feature function handed to
            ``ClassifierBasedTagger``. When falsy, ``self.feature_detector``
            is used instead; this class does not define that attribute
            itself, so a subclass has to provide it.
        :param kwargs: forwarded unchanged to ``ClassifierBasedTagger``.
        """
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                        feature_detector=feature_detector, **kwargs)

    def parse(self, tagged_sent):
        """Chunk a list of (word, pos) tokens.

        :return: the tree built by ``nltk.chunk.conlltags2tree`` from the
            tagger output, or ``None`` when ``tagged_sent`` is empty.
        """
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        # The tagger yields ((word, pos), iob); conlltags2tree wants flat triples.
        return nltk.chunk.conlltags2tree([(w,t,c) for ((w,t),c) in chunks])

#the following is copy/pasted from chunkers.py 

import nltk.tag
from nltk.chunk import ChunkParserI
from nltk.chunk.util import conlltags2tree, tree2conlltags
from nltk.tag import UnigramTagger, BigramTagger, ClassifierBasedTagger
#from .transforms import node_label

## tree conversion ##

def chunk_trees2train_chunks(chunk_sents):
    '''Turn chunk trees into training data for a chunk tagger.

    Every tree is flattened with ``tree2conlltags`` into (word, pos, iob)
    triples, and each triple is regrouped into a ``((word, pos), iob)`` pair.
    The result is a list with one such list of pairs per input tree.
    '''
    train_chunks = []
    for tree in chunk_sents:
        triples = tree2conlltags(tree)
        train_chunks.append([((word, pos), iob) for (word, pos, iob) in triples])
    return train_chunks

def conll_tag_chunks(chunk_sents):
    '''Convert each chunked sentence to list of (tag, chunk_tag) tuples,
    so the final result is a list of lists of (tag, chunk_tag) tuples.

    The words themselves are dropped; only the part-of-speech tag and the
    chunk tag of every token are kept.

    >>> from nltk.tree import Tree
    >>> t = Tree('S', [Tree('NP', [('the', 'DT'), ('book', 'NN')])])
    >>> conll_tag_chunks([t])
    [[('DT', 'B-NP'), ('NN', 'I-NP')]]
    '''
    tagged_sents = [tree2conlltags(tree) for tree in chunk_sents]
    return [[(t, c) for (w, t, c) in sent] for sent in tagged_sents]

def ieertree2conlltags(tree, tag=nltk.tag.pos_tag):
    '''Convert an entity tree into (word, pos_tag, iob_tag) triples.

    :param tree: tree whose ``pos()`` yields (word, label) pairs, where the
        label is that of the word's parent subtree.
    :param tag: callable that takes the sequence of words and returns
        (word, tag) pairs; defaults to ``nltk.tag.pos_tag``.
    :return: ``zip`` of (word, tag, iob) for every word in the tree.
    '''
    # tree.pos() flattens the tree and produces [(word, label)] where label is
    # from the word's parent tree label. words in a chunk therefore get the
    # chunk tag, while words outside a chunk get the same tag as the tree's
    # top label
    words, ents = zip(*tree.pos())
    # The node_label helper is not importable in this file (its import is
    # commented out), so read the top label directly: Tree.label() on NLTK 3,
    # falling back to the older Tree.node attribute. It is loop-invariant.
    try:
        top_label = tree.label()
    except AttributeError:
        top_label = tree.node
    iobs = []
    prev = None
    # construct iob tags from entity names
    for ent in ents:
        # any entity that is the same as the tree's top label is outside a chunk
        if ent == top_label:
            # every word needs a tag, otherwise iobs no longer lines up with words
            iobs.append('O')
            prev = None
        # have a previous entity that is equal so this is inside the chunk
        elif prev == ent:
            iobs.append('I-%s' % ent)
        # no previous equal entity in the sequence, so this is the beginning of
        # an entity chunk
        else:
            iobs.append('B-%s' % ent)
            prev = ent
    # get tags for each word, then construct 3-tuple for conll tags
    words, tags = zip(*tag(words))
    return zip(words, tags, iobs)

## tag chunker ##

class TagChunker(ChunkParserI):
    '''Chunker that predicts chunk tags from part-of-speech tags using a
    backoff chain of ngram taggers.'''

    def __init__(self, train_chunks, tagger_classes=[UnigramTagger, BigramTagger]):
        '''Build the tagger chain from chunked training sentences.

        The training trees are reduced to (tag, chunk_tag) sequences with
        ``conll_tag_chunks``. Each class in ``tagger_classes`` is then
        instantiated in order, with the previously built tagger as its backoff.
        '''
        train_sents = conll_tag_chunks(train_chunks)
        self.tagger = None

        for tagger_cls in tagger_classes:
            # The tagger built so far becomes the fallback of the next one.
            self.tagger = tagger_cls(train_sents, backoff=self.tagger)

    def parse(self, tagged_sent):
        '''Chunk a list of (word, tag) tokens and return the resulting tree.

        Returns ``None`` when ``tagged_sent`` is empty.
        '''
        if not tagged_sent:
            return None
        words, pos_tags = zip(*tagged_sent)
        # The tagger sees only the part-of-speech tags and yields (tag, chunk_tag).
        tagged = self.tagger.tag(pos_tags)
        conll = []
        for word, (pos, iob) in zip(words, tagged):
            conll.append((word, pos, iob))
        return conlltags2tree(conll)

## classifier chunker ##

def prev_next_pos_iob(tokens, index, history):
    '''Build the feature dict for the token at ``index``.

    ``tokens`` is the sequence of (word, pos) pairs for the sentence and
    ``history`` holds the IOB tags already assigned to the tokens before
    ``index``. The returned dict describes the current word/pos, the previous
    word/pos/iob and the next word/pos; sentence boundaries are marked with
    the ``'<START>'`` and ``'<END>'`` placeholders.
    '''
    word, pos = tokens[index]

    if index == 0:
        # No left neighbour: use the start-of-sentence placeholder.
        prevword, prevpos, previob = ('<START>',)*3
    else:
        prevword, prevpos = tokens[index-1]
        previob = history[index-1]

    if index == len(tokens) - 1:
        # No right neighbour: use the end-of-sentence placeholder.
        nextword, nextpos = ('<END>',)*2
    else:
        nextword, nextpos = tokens[index+1]

    feats = {
        'word': word,
        'pos': pos,
        'nextword': nextword,
        'nextpos': nextpos,
        'prevword': prevword,
        'prevpos': prevpos,
        'previob': previob
    }

    return feats

class ClassifierChunker(ChunkParserI):
    '''Chunk parser that assigns IOB chunk tags with a ClassifierBasedTagger.'''

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob, **kwargs):
        '''Train the underlying tagger on chunk trees.

        ``train_sents`` is converted with ``chunk_trees2train_chunks``. A falsy
        ``feature_detector`` is replaced by ``self.feature_detector``. Any
        extra keyword arguments go straight to ``ClassifierBasedTagger``.
        '''
        detector = feature_detector or self.feature_detector
        self.tagger = ClassifierBasedTagger(
            train=chunk_trees2train_chunks(train_sents),
            feature_detector=detector, **kwargs)

    def parse(self, tagged_sent):
        '''Chunk a list of (word, pos) tokens; returns ``None`` for empty input.'''
        if not tagged_sent:
            return None
        conll = []
        # The tagger yields ((word, pos), iob); flatten each into a triple.
        for (word, pos), iob in self.tagger.tag(tagged_sent):
            conll.append((word, pos, iob))
        return conlltags2tree(conll)

## pattern ##

class PatternChunker(ChunkParserI):
    '''Chunk parser that delegates the chunking to ``pattern.en.parse``.'''

    def parse(self, tagged_sent):
        '''Chunk the words of ``tagged_sent`` with pattern and return a tree.

        Only the words of the (word, tag) pairs are used; they are joined with
        single spaces and handed to ``pattern.en.parse``. The first sentence
        of the result is converted with ``conlltags2tree``. Returns ``None``
        when pattern yields no sentences.
        '''
        # Imported lazily so that a missing pattern install does not break
        # loading of this module.
        from pattern.en import parse as pattern_parse
        text = ' '.join(word for word, _ in tagged_sent)
        # tokenize=False keeps the number of tagged tokens returned equal to
        # the number of input tokens.
        parsed = pattern_parse(text, tokenize=False).split()
        if not parsed:
            return None
        first_sent = parsed[0]
        # Each item has four fields; the fourth is not needed for the tree.
        return conlltags2tree([(w, t, c) for w, t, c, _ in first_sent])

1 个答案:

答案 0 :(得分:2)


class ChunkParserI(ParserI):
    """
    A processing interface for identifying non-overlapping groups in
    unrestricted text.  Typically, chunk parsers are used to find base
    syntactic constituents, such as base noun phrases.  Unlike
    ``ParserI``, ``ChunkParserI`` guarantees that the ``parse()`` method
    will always generate a parse.
    """
    def parse(self, tokens):
        """
        Return the best chunk structure for the given tokens
        and return a tree.

        :param tokens: The list of (word, tag) tokens to be chunked.
        :type tokens: list(tuple)
        :rtype: Tree
        """
        # Interface method: concrete chunkers must override it.
        raise NotImplementedError()


class ClassifierChunker(nltk.chunk.ChunkParserI):
    """Chunk parser that assigns IOB chunk tags with a ClassifierBasedTagger."""

    def __init__(self, train_sents, feature_detector=prev_next_pos_iob,
                 **kwargs):
        """Train the underlying tagger.

        :param train_sents: chunk trees; converted to training data with
            ``chunk_trees2train_chunks``.
        :param feature_detector: feature function handed to
            ``ClassifierBasedTagger``. When falsy, ``self.feature_detector``
            is used instead; this class does not define that attribute
            itself, so a subclass has to provide it.
        :param kwargs: forwarded unchanged to ``ClassifierBasedTagger``.
        """
        if not feature_detector:
            feature_detector = self.feature_detector

        train_chunks = chunk_trees2train_chunks(train_sents)
        self.tagger = ClassifierBasedTagger(train=train_chunks,
                        feature_detector=feature_detector, **kwargs)

    def parse(self, tagged_sent):  # indent inside the class
        """Chunk a list of (word, pos) tokens.

        Defined inside the class so that it overrides
        ``ChunkParserI.parse``, which raises ``NotImplementedError``.

        :return: the tree built by ``nltk.chunk.conlltags2tree`` from the
            tagger output, or ``None`` when ``tagged_sent`` is empty.
        """
        if not tagged_sent: return None
        chunks = self.tagger.tag(tagged_sent)
        # The tagger yields ((word, pos), iob); conlltags2tree wants flat triples.
        return nltk.chunk.conlltags2tree([(w,t,c) for ((w,t),c) in chunks])



return nltk.chunk.util.conlltags2tree([(w,t,c) for ((w,t),c) in chunks])