查找文件中最常用的单词

时间:2017-02-13 18:41:44

标签: python-3.x nlp

我有一个文件,我想找到其中最常用的10个单词。我去掉了停用词和标点符号,然后把结果放入一个列表中。每行包含一个波斯语句子、一个制表符(tab),然后是一个英文单词。问题是,下面的代码为每一行返回一个单词。例如,如果行数是12,则返回12个单词。我认为是缩进有问题。我该如何解决?

.
.
.
def train ():
    """Print word counts for the first tab-separated field of every line.

    NOTE(review): ``uniques`` and ``counts`` are re-created for each line, so
    the counts are per line rather than for the whole input, and the sort and
    print statements sit inside the ``for unique`` loop instead of running once
    after all counting is done. This is the behaviour the question asks about.
    """
    RemStopWords (file1, file2)  # the function for removing stop words and punctuation at the start of the code
    # `witoutStops` is not defined in this excerpt; presumably it is filled in
    # by RemStopWords above -- TODO confirm.
    for line in witoutStops:
        # Fields are tab-separated; only the first field is tokenised below.
        line = line.strip().split("\t")
        words = line[0].split()
        uniques = []  # distinct words of the current line, in first-seen order
        q = []  # only referenced by the commented-out lines at the end
        for word in words:
            if word not in uniques:
                uniques.append(word)
        counts = []  # (count, word) pairs for the current line only
        for unique in uniques:
            # Count how many times `unique` occurs in the current line.
            count = 0              
            for word in words:     
                if word == unique:   
                    count += 1         
            counts.append((count, unique))
            # Sorted and reversed again on every pass of the `for unique` loop,
            # giving descending order by (count, word).
            counts.sort()        
            counts.reverse()
            # Walks up to ten leading entries; once the loop ends, `count` and
            # `word` hold the last entry that was visited.
            for i in range(min(10, len(counts))):
                count, word = counts[i]
            # Runs once per unique word and prints only that last entry.
            print('%s %d' % (word, count))
            #q.append(word)
            #print (q)

3 个答案:

答案 0 :(得分:2)

您可以使用 collections.Counter:把所有单词交给 Counter 计数,再调用 most_common(10),即可得到出现次数最多的 10 个单词及其出现次数。

|0%

答案 1 :(得分:0)

编辑:路易斯安那州哈塞克的答案是一种更简单、更优雅的方式,并且输出相同,所以你一定要去看看!

有一种更简单的方法:)

import operator  # we will use this later for sorting dictionaries

def train():
    """Return the ten most frequent words of the stop-word-free text.

    Returns:
        A list of at most ten ``(word, count)`` tuples, ordered from the most
        frequent word to the least frequent one. The list is empty when the
        text contains no words.
    """
    # assuming this returns the string of the text
    textWithoutStops = RemStopWords(file1, file2)

    # dictionary where words are keys and the number of times they appear are values
    wordCount = {}
    # split() without an argument splits on any run of whitespace (spaces,
    # tabs, newlines) and never yields empty strings. split(' ') would glue
    # together words separated by a newline and count '' as a word.
    for word in textWithoutStops.split():
        wordCount[word] = wordCount.get(word, 0) + 1

    # sort from less to more frequent, then reverse so the most frequent come first
    sortedWordCount = sorted(wordCount.items(), key=operator.itemgetter(1))[::-1]

    # slicing already copes with lists shorter than ten entries
    return sortedWordCount[:10]

例如,如果文件包含您的问题(统计时使用的是英文原文,因此下面的输出是英文单词):

  

我有一个文件,我想找到最常用的10个单词。我省略了停用词和标点符号,然后将结果放入列表中。每行包含一个波斯语句子、一个制表符,然后是一个英文单词。问题是,下面的代码返回每行的一个单词。例如,如果行数为12,则返回12个单词。我觉得缩进有问题。我该如何解决?

输出将是:

[('the', 5), ('I', 4), ('a', 4), ('and', 4), ('in', 2), ('of', 2), ('then', 2), ('returns', 2), ('words', 2), ('fix', 1)]

注意:要打开文本文件并将其所有内容转换为字符串,您可以(并且可能已经执行)以下操作:

# Name the encoding explicitly: the platform default is not necessarily UTF-8
# and the file holds non-ASCII (Persian) text. UTF-8 is an assumption here;
# change it if the file is stored in a different encoding.
with open(file, 'r', encoding='utf-8') as f:
    text = f.read()

希望这会对你有帮助!

答案 2 :(得分:0)

基于 C++ 的解决方案,使用优先级队列、映射(map)和字典树(trie)。下面是使用优先级队列、映射和字典树的类似 C++ 代码。为简单起见,单词从一个字符串向量(vector<string>)中读取,但可以很容易地改成从文件中读取单词。

使用C ++查找文件或流中的前K个常见单词 这是priority_queue的可行解决方案,供您参考。

#include <iostream>
#include <queue>
#include <string>
#include <unordered_map>
#include <vector>
using namespace std;

// Capacity of the min-heap, i.e. how many of the most frequent words are kept.
#define K_TH 3


class TrieNode;

/// Entry stored in the min-heap: a word, how often it has been counted, and a
/// back-pointer to the trie node that represents the same word.
typedef struct HeapNode
{
    std::string word;
    int frequency;
    TrieNode *trieNode;  // not owned; null until a caller assigns it

    // Initialisers are listed in declaration order, and trieNode is nulled
    // explicitly so that it can never be read while indeterminate.
    HeapNode(): word(""), frequency(0), trieNode(nullptr) {}

}HeapNode;


/// Node of a character trie. Every node owns its child nodes; the node that
/// ends a word additionally stores that word, how often it was inserted and a
/// non-owning pointer to the heap entry tracking it (null when there is none).
class TrieNode
{
    private:
        int frequency = 0;
        bool m_isLeaf = false;
        std::string word = "";
        std::unordered_map<char, TrieNode*> children;  // owned by this node
        HeapNode *heapNode = nullptr;                  // not owned

    public:
        TrieNode() {}

        /// Builds a node that starts out with one empty child stored under `c`.
        TrieNode(char c)
        {
            children[c] = new TrieNode();
            this->m_isLeaf = false;
        }

        /// Deletes every child, and through them the whole subtree.
        ~TrieNode()
        {
            for (auto &entry : children)
                delete entry.second;
        }

        // A copy would share the child pointers and delete them twice.
        TrieNode(const TrieNode&) = delete;
        TrieNode& operator=(const TrieNode&) = delete;

        /// Stores the word that ends at this node.
        void setWord(const std::string &word)
        {
            this->word = word;
        }

        /// @return the word stored by setWord(), or "" if none was set.
        std::string getWord() const
        {
            return this->word;
        }

        /// @return true once setLeaf(true) has marked this node as a word end.
        bool isLeaf(void) const
        {
            return this->m_isLeaf;
        }

        void setLeaf(bool leaf)
        {
            this->m_isLeaf = leaf;
        }

        /// @return the child stored under `c`, or null when there is none.
        TrieNode* getChild(char c) const
        {
            // find() rather than operator[]: a failed lookup must not add an
            // empty entry to the map.
            const auto it = children.find(c);
            return it != children.end() ? it->second : nullptr;
        }

        /// Makes sure a child exists under `c`. An existing child is kept, so
        /// its subtree is neither lost nor leaked.
        void insert(char c)
        {
            TrieNode *&slot = children[c];
            if (slot == nullptr)
                slot = new TrieNode();
        }

        /// @return the counter last stored with setFrequency() (0 initially).
        int getFrequency() const
        {
            return this->frequency;
        }

        void setFrequency(int frequency)
        {
            this->frequency = frequency;
        }

        /// Links this node to a heap entry; pass null to clear the link.
        void setHeapNode(HeapNode *heapNode)
        {
            this->heapNode = heapNode;
        }

        /// @return the linked heap entry, or null when there is none.
        HeapNode* getHeapNode() const
        {
            return heapNode;
        }

        /// Comparator form: true when `a` has a strictly higher frequency
        /// than `b`. Signature left untouched for existing users.
        bool operator()(HeapNode* &a, HeapNode* &b)
        {
            return (a->frequency > b->frequency);
        }
};

class Trie
{
    private:
        TrieNode *root = NULL;

    public:
        Trie()
        {
            if (!root)
            {
                this->root = new TrieNode();
            }
        }
        TrieNode* insert(string word)
        {
            if (!root)
                root = new TrieNode();
            TrieNode* current = root;
            int length = word.length();
            //insert "abc"
            for(int i = 0; i < length; ++i)
            {
                if (current->getChild(word.at(i)) == NULL)
                {
                    current->insert(word.at(i));
                }
                current = current->getChild(word.at(i));
            }
            current->setLeaf(true);
            current->setWord(word);
            current->setFrequency(current->getFrequency() + 1);
            return current;
        }
};



struct cmp
{
    bool operator()(HeapNode* &a, HeapNode* &b)
    {
        return (a->frequency > b->frequency);
    }
};
typedef priority_queue<HeapNode*, vector<HeapNode*>, cmp > MinHeap;


void insertUtils(Trie *root, MinHeap &pq, string word )
{
    if (!root)
        return;

    TrieNode* current = root->insert(word);
    HeapNode *heapNode = current->getHeapNode();
    if(heapNode)// if word already present in heap 
    {
        heapNode->frequency += 1;
    }else if (pq.empty() || pq.size() < K_TH)
    {// if word not present in heap and heap is not full;
        heapNode = new HeapNode();
        heapNode->word = word;
        heapNode->frequency = 1;
        heapNode->trieNode = current;
        current->setHeapNode(heapNode);
        pq.push(heapNode);
    }else if (pq.top()->frequency < current->getFrequency())
    {   // if word is not present and heap is full;
        HeapNode *temp = pq.top();
        //remove first element and add current word
        pq.pop();
        delete temp;
        heapNode = new HeapNode();
        current->setHeapNode(heapNode);
        pq.push(heapNode);
    }
}


void printKMostFrequentWords(vector<std::string> input)
{

    Trie *root = new Trie();
    MinHeap minHeap;
    for (vector<string>::iterator it = input.begin(); it != input.end(); ++it)
    {
        insertUtils(root, minHeap, *it);
    }

    while(!minHeap.empty())
    {
        HeapNode *heapNode = minHeap.top();
        cout << heapNode->word << ":" << heapNode->frequency << endl;
        minHeap.pop();
    }


}

/// Demo entry point: runs printKMostFrequentWords on a fixed list of nine
/// words in which "abc" occurs three times and "def" twice.
int main()
{
    const std::vector<std::string> sampleWords = {
        "abc", "def", "ghi",
        "jkl", "abc", "def",
        "mno", "xyz", "abc"
    };

    printKMostFrequentWords(sampleWords);
    return 0;
}