Question

我尝试使用PDFBox 2.0进行文本提取。我想获得有关特定字符的字体大小和页面上该字符的位置矩形的信息。我已经使用PDFTextStripper在PDFBox 1.6中实现了这个：

    PDFParser parser = new PDFParser(is);
    try{
        parser.parse();
    }catch(IOException e){

    }
    COSDocument cosDoc = parser.getDocument();
    PDDocument pdd = new PDDocument(cosDoc);
    final StringBuffer extractedText = new StringBuffer();
    PDFTextStripper textStripper = new PDFTextStripper(){
        @Override
        protected void processTextPosition(TextPosition text) {
            extractedText.append(text.getCharacter());
            logger.debug("text position: "+text.toString());
        }
    };
    textStripper.setSuppressDuplicateOverlappingText(false);
    for(int pageNum = 0;pageNum<pdd.getNumberOfPages();pageNum++){
        PDPage page = (PDPage) pdd.getDocumentCatalog().getAllPages().get(pageNum);
        textStripper.processStream(page, page.findResources(), page.getContents().getStream());
    }
    pdd.close();

但是在2.0版本的PDFBox中，processStream方法已被删除。如何使用PDFBox 2.0实现相同的目标？

我尝试过以下方法：

        PDDocument pdd = PDDocument.load(inputStream);
        PDFTextStripper textStripper = new PDFTextStripper(){
            @Override
            protected void processTextPosition(TextPosition text){
                int pos = PDFdocument.length();
                String textadded = text.getUnicode();
                Range range = new Range(pos,pos+textadded.length());
                int pagenr = this.getCurrentPageNo();
                Rectangle2D rect = new Rectangle2D.Float(text.getX(),text.getY(),text.getWidth(),text.getHeight());
            }
        };
        textStripper.setSuppressDuplicateOverlappingText(false);
        for(int pageNum = 0;pageNum<pdd.getNumberOfPages();pageNum++){
            PDPage page = (PDPage) pdd.getDocumentCatalog().getPages().get(pageNum);
            textStripper.processPage(page);
        }
        pdd.close();

不会调用processTextPosition(TextPosition text)方法。任何建议都会非常受欢迎。

Answer 1

@tilmanhausherr建议的DrawPrintTextLocations example为我的问题提供了解决方案。

使用以下代码启动解析器（inputStream是来自PDF文件的URL的输入流）：

    PDDocument pdd = null;
    try {
        pdd = PDDocument.load(inputStream);
        PDFParserTextStripper stripper = new PDFParserTextStripper(PDFdocument,pdd);
        stripper.setSortByPosition(true);
        for (int i=0;i<pdd.getNumberOfPages();i++){
            stripper.stripPage(i);
        }
    } catch (IOException e) {
        // throw error
    } finally {
        if (pdd!=null) {
            try {
                pdd.close();
            } catch (IOException e) {

            }
        }
    }

此代码使用PDFTextStripper的自定义子类：

class PDFParserTextStripper extends PDFTextStripper {

    public PDFParserTextStripper() throws IOException {
        super();
    }


    public void stripPage(int pageNr) throws IOException {
        this.setStartPage(pageNr+1);
        this.setEndPage(pageNr+1);
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        writeText(document,dummy); // This call starts the parsing process and calls writeString repeatedly.
    }



    @Override
    protected void writeString(String string,List<TextPosition> textPositions) throws IOException {
        for (TextPosition text : textPositions) {
            System.out.println("String[" + text.getXDirAdj()+","+text.getYDirAdj()+" fs="+text.getFontSizeInPt()+" xscale="+text.getXScale()+" height="+text.getHeightDir()+" space="+text.getWidthOfSpace()+" width="+text.getWidthDirAdj()+" ] "+text.getUnicode());
        }
    }

}

Answer 2

这是一个使用@tilmanhausherr建议的实现：

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.List;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.text.PDFTextStripper;
import org.apache.pdfbox.text.TextPosition;

class PDFParserTextStripper extends PDFTextStripper 
{
    public PDFParserTextStripper(PDDocument pdd) throws IOException 
    {  
        super();
        document = pdd;
    }

    public void stripPage(int pageNr) throws IOException 
    {
        this.setStartPage(pageNr+1);
        this.setEndPage(pageNr+1);
        Writer dummy = new OutputStreamWriter(new ByteArrayOutputStream());
        writeText(document,dummy); // This call starts the parsing process and calls writeString repeatedly.
    }

    @Override
    protected void writeString(String string,List<TextPosition> textPositions) throws IOException 
    {
        for (TextPosition text : textPositions) {
            System.out.println("String[" + text.getXDirAdj()+","+text.getYDirAdj()+" fs="+text.getFontSizeInPt()+" xscale="+text.getXScale()+" height="+text.getHeightDir()+" space="+text.getWidthOfSpace()+" width="+text.getWidthDirAdj()+" ] "+text.getUnicode());
        }
    }

    public static void extractText(InputStream inputStream)
    {
        PDDocument pdd = null;

        try 
        {
            pdd = PDDocument.load(inputStream);
            PDFParserTextStripper stripper = new PDFParserTextStripper(pdd);
            stripper.setSortByPosition(true);
            for (int i=0; i<pdd.getNumberOfPages(); i++)
            {
                stripper.stripPage(i);
            }
        } 
        catch (IOException e) 
        {
            // throw error
        } 
        finally 
        {
            if (pdd != null) 
            {
                try 
                {
                    pdd.close();
                } 
                catch (IOException e) 
                {

                }
            }
        }
    }

    public static void main(String[] args) throws IOException
    {
        File f = new File("C:\\PathToYourPDF\\pdfFile.pdf");
        FileInputStream fis = null;

        try 
        {
            fis = new FileInputStream(f);
            extractText(fis);
        } 
        catch(IOException e) 
        {
            e.printStackTrace();
        } 
        finally 
        {
            try 
            {
                if(fis != null)
                    fis.close();
            } 
            catch(IOException ex)
            {
                ex.printStackTrace();
            }
        }
    }
}

使用PDFBox 2.0从PDF中提取文本

2 个答案: