HashMaps, HashSets, FileReader and FileWriter?

Date: 2016-04-08 04:14:00

Tags: java hashmap filereader hashset

This code is supposed to find the TF-IDF of the words in 40 text files in a folder named docs, and I keep getting a NullPointerException every time I run it. I believe it comes from the computeTermFrequencies method. I want it to print the top 5 TF-IDF words for each file.

Any help would be greatly appreciated! Thanks!
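
For reference, the score the program is meant to compute for each word is tf * ln(N / df), where tf is how many times the word appears in one file, df is the value read for that word from freqs.txt, and N = 40 is the number of documents. A minimal sketch of that single calculation, using made-up sample counts (tf = 3, df = 8):

    // Minimal sketch of the TF-IDF score computed in computeTFIDF below.
    // The counts here are hypothetical sample values, not taken from the data.
    public class TfIdfExample {
      public static void main(String[] args) {
        double nDocs = 40.0; // total number of documents (the 40 text files)
        int tf = 3;          // times the word appears in one file (made up)
        int df = 8;          // value read for the word from freqs.txt (made up)
        double score = tf * Math.log(nDocs / df); // natural log, as in the code
        System.out.println(score);                // prints roughly 4.828
      }
    }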

    import java.util.*;
    import java.io.*;
    public class KeywordExtractor {
      public static void main(String[] args) {
        String dir = args[0]; // name of directory with input files
        HashMap<String, Integer> dfs;
        dfs = readDocumentFrequencies("freqs.txt");

        for(int i = 1; i <= 40; i++){
          String name = dir + "/" + i + ".txt";

          HashMap<String,Integer> tfs = computeTermFrequencies(name);
          HashMap<String,Double> tfidf = computeTFIDF(tfs,dfs,40);

          System.out.println(i + ".txt");
          printTopKeywords(tfidf,5);
          System.out.println();
        }  
      }
      // Reads the named file and returns a map from each word to the number of
      // times it appears in that file.
      public static HashMap<String, Integer> computeTermFrequencies(String filename) {
        HashMap<String, Integer> hm2 = new HashMap<String, Integer>();

        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
          String line;
          while ((line = br.readLine()) != null) {
            // normalize each line (lower-case, keep only letters, apostrophes
            // and spaces) before splitting it into words
            line = normalize(line);
            String[] words = line.split(" ");
            for (int i = 0; i < words.length; i++) {
              String word = words[i];
              if (word.isEmpty()) {
                continue; // skip empty tokens produced by repeated spaces
              }
              if (hm2.containsKey(word)) {
                hm2.put(word, hm2.get(word) + 1);
              } else {
                hm2.put(word, 1);
              }
            } // end for over words
          } // end while over lines
        } catch (IOException e) {
          e.printStackTrace();
        }

        return hm2;
      }
      // Reads the document-frequency file created in another class (one
      // "word count" pair per line) and returns the counts as a HashMap.
      public static HashMap<String, Integer> readDocumentFrequencies(String filename) {
        HashMap<String, Integer> hm = new HashMap<String, Integer>();
        try (BufferedReader br = new BufferedReader(new FileReader(filename))) {
          // loop through the file and put each word and its frequency in the map
          for (String line = br.readLine(); line != null; line = br.readLine()) {
            String[] a = line.split(" ");
            String word = a[0];
            int number = Integer.parseInt(a[1]);
            hm.put(word, number);
          } // end for
        } catch (IOException e) {
          e.printStackTrace();
        }
        return hm;
      }

      public static HashMap<String, Double> computeTFIDF(HashMap<String, Integer> tfs, HashMap<String, Integer> dfs,
                                                         double nDocs) {
        HashMap<String, Double> hm3 = new HashMap<String, Double>();

        for (String key : tfs.keySet()) {
          Integer df = dfs.get(key);
          // dfs.get(key) is null for a word with no entry in the document-
          // frequency file; unboxing that null Integer throws a
          // NullPointerException, so such words are skipped here.
          if (df == null) {
            continue;
          }
          double idf = Math.log(nDocs / df);
          double tf = tfs.get(key);
          hm3.put(key, tf * idf);
        }
        return hm3;
      }

      /**
       * This method prints the top K keywords by TF-IDF in descending order.
       */
      public static void printTopKeywords(HashMap<String, Double> tfidfs, int k) {
        ValueComparator vc =  new ValueComparator(tfidfs);
        TreeMap<String, Double> sortedMap = new TreeMap<String, Double>(vc);
        sortedMap.putAll(tfidfs);

        int i = 0;
        for(Map.Entry<String, Double> entry: sortedMap.entrySet()){
          String key = entry.getKey();
          Double value = entry.getValue();

          System.out.println(key + " " + value);
          i++;
          if (i >= k) {
            break;
          }
        }
      } 
      public static String normalize(String word) {
        return word.replaceAll("[^a-zA-Z ']", "").toLowerCase();
      }
    }

    /*
     * This class makes printTopKeywords work. Do not modify.
     */
    class ValueComparator implements Comparator<String> {

        Map<String, Double> map;

        public ValueComparator(Map<String, Double> base) {
          this.map = base;
        }

        public int compare(String a, String b) {
          if (map.get(a) >= map.get(b)) {
            return -1;
          } else {
            return 1;
          } // returning 0 would merge keys 
        }
      }
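
One likely source of the exception, rather than computeTermFrequencies itself, is the call to dfs.get(key) in computeTFIDF: HashMap.get returns null for a key that is not in the map, and unboxing that null Integer inside an arithmetic expression throws a NullPointerException. The standalone snippet below reproduces that behaviour with a hypothetical map entry, which is why computeTFIDF above guards against a missing document frequency:

    import java.util.HashMap;

    // Demonstrates the auto-unboxing pitfall: get() on a missing key returns
    // null, and using that null Integer as a number throws a NullPointerException.
    public class UnboxingNPEDemo {
      public static void main(String[] args) {
        HashMap<String, Integer> dfs = new HashMap<String, Integer>();
        dfs.put("hello", 12);               // hypothetical entry, made up

        Integer boxed = dfs.get("missing"); // null: the key is not in the map
        System.out.println(boxed);          // prints "null", no exception yet

        double idf = Math.log(40.0 / dfs.get("missing")); // NullPointerException here
        System.out.println(idf);            // never reached
      }
    }

A word can end up missing from dfs when the text files are tokenized differently from the way freqs.txt was built, for example when lines are not normalized before splitting; normalizing each line in computeTermFrequencies keeps the two sets of words consistent.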

0 Answers:

No answers yet.