长时间按字符读取一个巨大的文本文件字符

时间:2014-08-07 19:07:30

标签: java

我正在使用FileInputStream逐个字符地阅读Java中的9 KB文本文件,并且花了将近一分钟的时间来阅读。这种表现是否良好,或者可以通过使用其他Stream BufferdReader并在内存中一次读取整个数据来优化这一点。

// This method is used to read the Brown Corpus
public void readBrownCorpus(String corpusPath) throws IOException {
    FileInputStream inputStream = null;
    try {
        inputStream = new FileInputStream(corpusPath);
        int letter = 0; // denote current read letter
        String previousTag = "^";
        StringBuilder wordWithTag = new StringBuilder(); // denote the string which
        while((letter = inputStream.read()) != -1) {
            if(((char) letter) != ' ')
                wordWithTag.append((char) letter);
            else {
                String word[] = wordWithTag.substring(0).split("_");
                if(word != null && word.length != 2)
                    throw new Exception("Error in the Format of Corpus");
                // If new tag found, insert this in both transitionTable and emissionTable
                if(transitionTable.get(word[1]) == null) {
                    insertTagInTransitionTable(previousTag, word[1]);
                }

                updateTranstionTable(previousTag, word[1]);
                updateEmissionTable(word[0], word[1]);

                // update the previous Tag
                if(word[1].equals("."))
                    previousTag = "^";
                else
                    previousTag = word[1];
                wordWithTag.setLength(0); //empty the wordWithTag for new word
                System.out.println(transitionTable.size());
            }
        }
    } catch(IOException ioException) {
        ioException.printStackTrace();
    } catch(Exception exception) {
        exception.printStackTrace();
    }
    finally {
        inputStream.close();
    }
}

/*
 * To change this template, choose Tools | Templates
 * and open the template in the editor.
 */

package demo;
import java.util.*;
import java.io.*;

/**
 *
 * @author Jatin Khurana
 */
public class Main {

    public HashMap<String,Row> transitionTable;  // Transition Table
    public HashMap<String,Row> emissionTable; // Emission Table

    // Constructor
    public Main()
    {
        transitionTable=new HashMap<String,Row>();
        emissionTable=new HashMap<String,Row>();
        prepareInitialTransitionTable();
        //prepareInitialEmissionTable();
    }

    // This method prepare the initial transition Table
    private void prepareInitialTransitionTable()
    {
        Row row1=new Row();
        row1.tagCount.put("^", 0.0f);
        row1.tagCount.put(".", 0f);
                Row row2=new Row();
        row2.tagCount.put("^", 0f);
        row2.tagCount.put(".", 0f);
        transitionTable.put("^", row1);
        transitionTable.put(".", row2);
    }

    /**
     * @param args the command line arguments
     */
    public static void main(String[] args) throws IOException{

        Main m=new Main();

        BufferedReader inputStream=null;
        try
        {
            inputStream=new BufferedReader(new FileReader("d://postagger//corpus//brown.txt"));
            String corpusData = inputStream.readLine();
            String previousTag="^";
                        String wordWithTag[] = corpusData.split(" ");
            for(int i=0;i<wordWithTag.length;i++)
                        {
                                        String word[]=wordWithTag[i].split("_");
                    if(word!=null && word.length!=2)
                        throw new Exception("Error in the Format of Corpus");
                    // If new tag found,insert this in both transitionTable and emissionTable
                    if(m.transitionTable.get(word[1])==null)
                    {
                        m.insertTagInTransitionTable(previousTag,word[1]);
                    }

                    m.updateTranstionTable(previousTag,word[1]);
                    m.updateEmissionTable(word[0],word[1]);

                    // update the previous Tag
                                        if(word[1].equals("."))
                                        {
                                            previousTag="^";
                                        }
                                        else
                                        {
                                            previousTag=word[1];
                                        }
                    System.out.println(m.transitionTable.size());
                }
            }
        catch(IOException ioException)
        {
            ioException.printStackTrace();
        }
        catch(Exception exception)
        {
            exception.printStackTrace();
        }
        finally
        {
            inputStream.close();
        }
    }

    private void insertTagInTransitionTable(String previousTag,String newTag) throws CloneNotSupportedException
    {
            Row row = (Row)transitionTable.get(previousTag);
            row.tagCount.put(newTag,0f);
                        Row newRow=new Row();
            transitionTable.put(newTag, newRow);
    }

    // This method is used to update the transitionTable
    private void updateTranstionTable(String previousTag,String currentTag)
    {
        Row row = transitionTable.get(previousTag);
                if(row.tagCount.get(currentTag)==null)
                {
                    row.tagCount.put(currentTag, 1f);
                }
                else
                {
                    row.tagCount.put(currentTag, row.tagCount.get(currentTag)+1);
                }

    }

    // This method is used to update the emission table
    private void updateEmissionTable(String word,String tag)
    {
                Row row = emissionTable.get(word);
                if(row==null)
                {
                    Row newRow=new Row();
                    newRow.tagCount.put(tag, 1f);
                    emissionTable.put(word, newRow);
                }
                else
                {
                    if(row.tagCount.get(tag)==null)
                    {
                        row.tagCount.put(tag, 1f);
                    }
                    else
                    {
                        row.tagCount.put(tag,row.tagCount.get(tag)+1);
                    }
                }
    }

}

我的导师说我必须在3到5秒内完成同样的操作。这是怎么做到的?

1 个答案:

答案 0 :(得分:0)

使用BufferedInputStream包装FileInputStream以进行简单的快速修复。然后考虑使用readLine()