
时间:2011-07-30 08:52:02

标签: c# .net itextsharp


以下是我用来提取文本的代码段：

// Open the PDF, fetch the raw content bytes of page 1 and display the text
// that ExtractTextFromPDFBytes recovers from them.
PdfReader reader = new PdfReader("F:\\EBooks\\AspectsOfAjax.pdf");
try
{
    textBox1.Text = ExtractTextFromPDFBytes(reader.GetPageContent(1));
}
finally
{
    // Always release the reader, even when extraction throws.
    reader.Close();
}

/// <summary>
/// Scans raw PDF page content bytes and collects the characters that appear
/// inside parentheses within BT ... ET text objects.
/// </summary>
/// <param name="input">Raw page content bytes; may be null or empty.</param>
/// <returns>
/// The collected text, or an empty string when <paramref name="input"/> is
/// null or empty, or when the scan fails with an index error.
/// </returns>
private string ExtractTextFromPDFBytes(byte[] input)
{
    if (input == null || input.Length == 0)
    {
        return "";
    }

    // Token sets are loop-invariant, so build them once instead of per byte.
    string[] moveTokens = new string[] { "TD", "Td" };
    string[] newLineTokens = new string[] { "'", "T*", "\"" };
    string[] showTokens = new string[] { "Tj" };
    string[] endTextTokens = new string[] { "ET" };
    string[] beginTextTokens = new string[] { "BT" };

    try
    {
        // StringBuilder avoids re-copying the whole result on every append.
        System.Text.StringBuilder resultString = new System.Text.StringBuilder();

        // Flag showing if we are currently inside a text object.
        bool inTextObject = false;

        // Flag showing if the next character is literal,
        // e.g. '\\' to get a '\' character or '\(' to get '('.
        bool nextLiteral = false;

        // () bracket nesting level. Text appears inside ().
        int bracketDepth = 0;

        // Sliding window of the most recent characters, used to recognise
        // operators after they have been read.
        char[] previousCharacters = new char[_numberOfCharsToKeep];
        for (int j = 0; j < _numberOfCharsToKeep; j++)
        {
            previousCharacters[j] = ' ';
        }

        for (int i = 0; i < input.Length; i++)
        {
            char c = (char)input[i];

            if (inTextObject)
            {
                // Position the text: at most one separator per operator.
                if (bracketDepth == 0)
                {
                    if (CheckToken(moveTokens, previousCharacters))
                    {
                        resultString.Append("\n\r");
                    }
                    else if (CheckToken(newLineTokens, previousCharacters))
                    {
                        resultString.Append("\n");
                    }
                    else if (CheckToken(showTokens, previousCharacters))
                    {
                        resultString.Append(" ");
                    }
                }

                if (bracketDepth == 0 && CheckToken(endTextTokens, previousCharacters))
                {
                    // End of a text object.
                    inTextObject = false;
                    resultString.Append(" ");
                }
                else if (c == '(' && bracketDepth == 0 && !nextLiteral)
                {
                    // Start outputting text.
                    bracketDepth = 1;
                }
                else if (c == ')' && bracketDepth == 1 && !nextLiteral)
                {
                    // Stop outputting text.
                    bracketDepth = 0;
                }
                else if (bracketDepth == 1)
                {
                    // Just a normal text character.
                    if (c == '\\' && !nextLiteral)
                    {
                        // Backslash escapes the next character; do not emit it.
                        nextLiteral = true;
                    }
                    else
                    {
                        // Emit only printable ASCII and the 128..254 range.
                        if ((c >= ' ' && c <= '~') || (c >= 128 && c < 255))
                        {
                            resultString.Append(c);
                        }

                        nextLiteral = false;
                    }
                }
            }

            // Shift the window left by one and store the current character.
            for (int j = 0; j < _numberOfCharsToKeep - 1; j++)
            {
                previousCharacters[j] = previousCharacters[j + 1];
            }
            previousCharacters[_numberOfCharsToKeep - 1] = c;

            // Start of a text object.
            if (!inTextObject && CheckToken(beginTextTokens, previousCharacters))
            {
                inTextObject = true;
            }
        }

        return resultString.ToString();
    }
    catch (System.IndexOutOfRangeException)
    {
        // Best-effort fallback: the window/token look-ups index by fixed
        // offsets and can run out of range; report "no text" instead of crashing.
        return "";
    }
}

/// <summary>
/// Tests whether the look-back window ends with one of <paramref name="tokens"/>
/// followed by a delimiter and preceded by a delimiter.
/// </summary>
/// <param name="tokens">Candidate tokens; null or empty entries are skipped.</param>
/// <param name="recent">
/// Window of the most recent characters; its last used slot is index
/// <c>_numberOfCharsToKeep - 1</c>.
/// </param>
/// <returns>True when any token matches; otherwise false.</returns>
private bool CheckToken(string[] tokens, char[] recent)
{
    int last = _numberOfCharsToKeep - 1;

    foreach (string token in tokens)
    {
        if (string.IsNullOrEmpty(token))
        {
            continue;
        }

        // The token sits directly before the trailing delimiter at 'last';
        // for a two-character token this is indices last-2 and last-1.
        int start = last - token.Length;

        // Need one more slot in front of the token for the leading delimiter.
        if (start < 1)
        {
            continue;
        }

        if (!IsTokenDelimiter(recent[last]) || !IsTokenDelimiter(recent[start - 1]))
        {
            continue;
        }

        // Compare every character, so one-character tokens such as ' and "
        // are matched instead of reading past the end of the token.
        bool matches = true;
        for (int k = 0; k < token.Length; k++)
        {
            if (recent[start + k] != token[k])
            {
                matches = false;
                break;
            }
        }

        if (matches)
        {
            return true;
        }
    }

    return false;
}

// True for the characters accepted as token boundaries: space, CR or LF.
private static bool IsTokenDelimiter(char value)
{
    return value == ' ' || value == 0x0d || value == 0x0a;
}

2 个答案:

答案 0 :(得分:37)

让我试着给你指一个不同的方向。iTextSharp 自带一个非常简洁好用的文本提取系统，可以处理一些基本的标记（token）。不幸的是，它不处理颜色信息，不过根据 @Mark Storer 的说法，自己实现起来可能并不太难。


我已经开始着手实现颜色信息。有关详细信息，请参阅我的博客文章。（抱歉格式不好，我现在要去吃饭了。）


下面的代码结合了几个问题和答案，其中包括这个用于获取字体高度的回答（尽管结果并不完全准确），以及另一个回答（我怎么也找不到它了），后者展示了如何检测伪粗体（faux bold）。




<强> Screenshot of sample PDF


<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Hello </span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">w</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:37.87201">o</span>
<span style="font-family:NJNSWD+Papyrus-Regular-Bold;font-size:11.61407">rl</span>
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">d </span>
<br />
<span style="font-family:NJNSWD+Papyrus-Regular;font-size:11.61407">Test </span>


using System;
using System.Collections.Generic;
using System.Text;
using System.Windows.Forms;
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;

namespace WindowsFormsApplication2
{
    /// <summary>
    /// Demo form that, when loaded, converts page 1 of "Document.pdf" on the
    /// desktop into HTML spans carrying font name and font size.
    /// </summary>
    public partial class Form1 : Form
    {
        public Form1()
        {
            // NOTE(review): assumes the designer-generated half of this partial
            // class defines InitializeComponent and wires Form1_Load - confirm.
            InitializeComponent();
        }

        private void Form1_Load(object sender, EventArgs e)
        {
            PdfReader reader = new PdfReader(System.IO.Path.Combine(Environment.GetFolderPath(Environment.SpecialFolder.Desktop), "Document.pdf"));
            try
            {
                TextWithFontExtractionStategy S = new TextWithFontExtractionStategy();

                // F holds the HTML markup produced by the strategy for page 1.
                string F = iTextSharp.text.pdf.parser.PdfTextExtractor.GetTextFromPage(reader, 1, S);
            }
            finally
            {
                // Always release the reader, even when extraction throws.
                reader.Close();
            }
        }

        /// <summary>
        /// Extraction strategy that emits each run of text as an HTML span
        /// whose style records the font name and the measured font height.
        /// A new span is started whenever the baseline, font or size changes.
        /// </summary>
        public class TextWithFontExtractionStategy : iTextSharp.text.pdf.parser.ITextExtractionStrategy
        {
            //HTML buffer
            private StringBuilder result = new StringBuilder();

            //Store last used properties
            private Vector lastBaseLine;
            private string lastFont;
            private float lastFontSize;

            // Integer values compared against TextRenderInfo.GetTextRenderMode().
            private enum TextRenderMode
            {
                FillText = 0,
                StrokeText = 1,
                FillThenStrokeText = 2,
                Invisible = 3,
                FillTextAndAddToPathForClipping = 4,
                StrokeTextAndAddToPathForClipping = 5,
                FillThenStrokeTextAndAddToPathForClipping = 6,
                AddTextToPathForClipping = 7
            }

            /// <summary>
            /// Appends the text of <paramref name="renderInfo"/> to the HTML
            /// buffer, opening a new span first if the style or line changed.
            /// </summary>
            public void RenderText(iTextSharp.text.pdf.parser.TextRenderInfo renderInfo)
            {
                string curFont = renderInfo.GetFont().PostscriptFontName;

                //Check if faux bold is used (fill-then-stroke rendering)
                if (renderInfo.GetTextRenderMode() == (int)TextRenderMode.FillThenStrokeText)
                {
                    curFont += "-Bold";
                }

                //This code assumes that if the baseline changes then we're on a newline
                Vector curBaseline = renderInfo.GetBaseline().GetStartPoint();
                Vector topRight = renderInfo.GetAscentLine().GetEndPoint();
                iTextSharp.text.Rectangle rect = new iTextSharp.text.Rectangle(curBaseline[Vector.I1], curBaseline[Vector.I2], topRight[Vector.I1], topRight[Vector.I2]);

                // The height of the baseline-to-ascent box is used as the font size.
                float curFontSize = rect.Height;

                //See if something has changed, either the baseline, the font or the font size
                if ((this.lastBaseLine == null) || (curBaseline[Vector.I2] != lastBaseLine[Vector.I2]) || (curFontSize != lastFontSize) || (curFont != lastFont))
                {
                    //If we've put down at least one span tag, close it
                    if (this.lastBaseLine != null)
                    {
                        this.result.AppendLine("</span>");
                    }

                    //If the baseline has changed then insert a line break
                    if ((this.lastBaseLine != null) && curBaseline[Vector.I2] != lastBaseLine[Vector.I2])
                    {
                        this.result.AppendLine("<br />");
                    }

                    //Create an HTML tag with appropriate styles. Invariant culture
                    //keeps '.' as the decimal separator so the CSS value stays valid.
                    this.result.AppendFormat(System.Globalization.CultureInfo.InvariantCulture, "<span style=\"font-family:{0};font-size:{1}\">", curFont, curFontSize);
                }

                //Append the current text
                this.result.Append(renderInfo.GetText());

                //Set currently used properties
                this.lastBaseLine = curBaseline;
                this.lastFontSize = curFontSize;
                this.lastFont = curFont;
            }

            /// <summary>
            /// Returns the accumulated HTML with the last open span closed.
            /// The buffer itself is not modified, so repeated calls return the
            /// same markup.
            /// </summary>
            public string GetResultantText()
            {
                //If we wrote anything then we'll always have a missing closing tag so close it here
                if (result.Length > 0)
                {
                    return result.ToString() + "</span>";
                }

                return result.ToString();
            }

            //Not needed
            public void BeginTextBlock() { }
            public void EndTextBlock() { }
            public void RenderImage(ImageRenderInfo renderInfo) { }
        }
    }
}

答案 1 :(得分:0)


import com.itextpdf.text.Rectangle;
import com.itextpdf.text.pdf.parser.ImageRenderInfo;
import com.itextpdf.text.pdf.parser.TextExtractionStrategy;
import com.itextpdf.text.pdf.parser.TextRenderInfo;
import com.itextpdf.text.pdf.parser.Vector;

public class TextWithFontExtractionStategy implements TextExtractionStrategy {
//HTML buffer
private StringBuilder result = new StringBuilder();

//Store last used properties
private Vector lastBaseLine;
private String lastFont;
private float lastFontSize;

private enum TextRenderMode

    private int value;

    TextRenderMode(int value) {
        this.value = value;

    public int getValue() {
        return value;

    public void renderText(TextRenderInfo renderInfo)
        String curFont = renderInfo.getFont().getPostscriptFontName();
        //Check if faux bold is used
        if ((renderInfo.getTextRenderMode() == TextRenderMode.FillThenStrokeText.getValue()))
            curFont += "-Bold";

        //This code assumes that if the baseline changes then we're on a newline
        Vector curBaseline = renderInfo.getBaseline().getStartPoint();
        Vector topRight = renderInfo.getAscentLine().getEndPoint();
        Rectangle rect = new Rectangle(curBaseline.get(Vector.I1), curBaseline.get(Vector.I2), topRight.get(Vector.I1), topRight.get(Vector.I2));
        float curFontSize = rect.getHeight();

        //See if something has changed, either the baseline, the font or the font size
        if ((this.lastBaseLine == null) || (curBaseline.get(Vector.I2) != lastBaseLine.get(Vector.I2)) || (curFontSize != lastFontSize) || (curFont != lastFont))
            //if we've put down at least one span tag close it
            if ((this.lastBaseLine != null))
            //If the baseline has changed then insert a line break
            if ((this.lastBaseLine != null) && curBaseline.get(Vector.I2) != lastBaseLine.get(Vector.I2))
                this.result.append("<br />").append("\n");
            //Create an HTML tag with appropriate styles
            this.result.append(String.format("<span style=\"font-family:{%s};font-size:{%s}\">", curFont, curFontSize));

        //Append the current text
        this.result.append(renderInfo.getText() + " ");

        //Set currently used properties
        this.lastBaseLine = curBaseline;
        this.lastFontSize = curFontSize;
        this.lastFont = curFont;

    public String getResultantText()
        //If we wrote anything then we'll always have a missing closing tag so close it here
        if (result.length() > 0)
        return result.toString();

    //Not needed
    public void beginTextBlock() { }
    public void endTextBlock() { }
    public void renderImage(ImageRenderInfo renderInfo) { }
