使用 &lt;span&gt; 包装 &lt;p&gt; 标签中的句子,但保留其他标签

时间:2013-05-15 05:23:29

标签: javascript jquery regex parsing html-parsing

为了让您了解我的需求:我一直在使用以下代码来解析 &lt;p&gt; 标签内的内容,并将每个句子包装在 &lt;span&gt; 标签中,以便我可以与页面上的句子进行交互。

// Wrap every sentence of each paragraph in <span class="sentence">.
// The paragraph is read through .text() and the result is written back
// through .html().
$('p').each(function () {
    var $paragraph = $(this),
        sentencePattern = /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        wrapped = $paragraph
            .text()
            .replace(sentencePattern, '<span class="sentence">$1</span>$3');

    $paragraph.html(wrapped);
});

但是,以下一行说明了我的问题:

<p>This is a <a href="#">link</a> and it is removed with the above code! Here is another sentence.</p>

嵌套的标签(例如 &lt;a&gt;、&lt;img&gt; 等)如果位于我正在搜索的 &lt;p&gt; 标签内,就会被我使用的代码删除。我需要保持这些标签的完整性,使 &lt;p&gt; 标签内的内容保持不变。

我需要:

<p><span class="sentence">This is a <a href="#">link</a> and it is removed with the above code!</span><span class="sentence">Here is another sentence.</span></p>

在阅读了这篇关于使用正则表达式解析 HTML 的热门讨论(this barn-burner)之后,我得出结论:我需要结合使用某种 HTML 解析器来遍历 &lt;p&gt; 标签内的子标签,然后使用正则表达式来查找句子。我认为上面列出的正则表达式应该适用于我的大部分用途,如果这有帮助的话。

所以:我该怎么做?

2 个答案:

答案 0 :(得分:0)

要把自然语言可靠地切分(tokenise)成句子本身就非常困难,这还没有算上把 HTML 也加进来所带来的额外复杂性。有一些应用程序等尝试处理自然语言处理(Natural Language Processing),例如 Stanford Tokenizer,它运行在 Java 上(而不是 JavaScript)。

正如人们不断提到的那样,正则表达式不是这个问题的解决方案,语言不规则,所以不要指望只使用正则表达式解决方案。

SO 上有一个相关问题:Basic NLP in CoffeeScript or JavaScript — Punkt tokenizaton, simple trained Bayes models — where to start?我认为它相当简洁地总结了 JavaScript 方面的情况。

无论如何,至少给你一些可以上手尝试的东西,我为你写了一点代码。在标记/语言变得稍微复杂或有所不同之前,它的效果还算不错,但最终离目标仍相去甚远。不过,对于你的需求来说它也许已经足够了,我不确定。

CSS

/* Generic text helpers. The script shown alongside this stylesheet does not
   reference .emphasis, .bold or .unidentified as far as is visible here. */
.emphasis {
    font-style: italic;
}
.bold {
    font-weight: bold;
}
/* Element carrying both helper classes. */
.emphasis.bold {
    font-style: italic;
    font-weight: bold;
}
.unidentified {
    background-color: pink;
}
/* The script's pushSpan() appends (color % 2) to "sentence", so consecutive
   sentences alternate between .sentence0 and .sentence1. */
.sentence0 {
    background-color: yellow;
}
.sentence1 {
    background-color: green;
}
/* Class used by the script's anythingElse() fallback for leftover markup. */
.sentence2 {
    background-color: red;
}
/* Class used by the script's leadingWhitespaces(); "pre" keeps the wrapped
   whitespace from collapsing so it stays visible. */
.whitespace {
    white-space: pre;
    background-color: blue;
}

JavaScript

/*jslint maxerr: 50, indent: 4, browser: true */

(function () {
    "use strict";

    // Shared patterns and state used by the sentence-guessing functions.
    //   rxOpen       - first thing that looks like an opening tag
    //   rxClose      - first closing tag
    //   rxWhitespace - the whole run of leading whitespace
    //   rxSupStart   - a <sup ...> tag at the very start of the string
    //   rxSupEnd     - the closing </sup> tag
    //   color        - running sentence counter used to alternate colours
    var rxOpen = new RegExp("<[^\\/].+?>"),
        rxClose = new RegExp("<\\/.+?>"),
        // Greedy on purpose: a lazy "^\\s+?" matches exactly one character,
        // which wraps every whitespace character in its own span and spends
        // one loop iteration per character.
        rxWhitespace = new RegExp("^\\s+"),
        rxSupStart = new RegExp("^<sup\\b[^>]*>"),
        rxSupEnd = new RegExp("<\\/sup>"),
        sentenceEnd = [],
        color = 0,
        rxIndex;

    // A sentence end is one non-digit character followed by a run of . ! ?
    sentenceEnd.push(new RegExp("[^\\d][\\.!\\?]+"));
    // ...with an even number of double quotes after it (not inside quotes)...
    sentenceEnd.push(new RegExp("(?=([^\\\"]*\\\"[^\\\"]*\\\")*[^\\\"]*?$)"));
    // ...and not followed by a closing ), ], } or | that lacks its opener
    // (not inside a bracketed or piped group).
    sentenceEnd.push(new RegExp("(?![^\\(]*?\\))"));
    sentenceEnd.push(new RegExp("(?![^\\[]*?\\])"));
    sentenceEnd.push(new RegExp("(?![^\\{]*?\\})"));
    sentenceEnd.push(new RegExp("(?![^\\|]*?\\|)"));
    //sentenceEnd.push(new RegExp("(?![^\\\\]*?\\\\)"));
    //sentenceEnd.push(new RegExp("(?![^\\/.]*\\/)")); // all could be a problem, but this one is problematic

    // Concatenate the fragments above into the single pattern used to
    // locate the end of the next sentence.
    rxIndex = new RegExp(sentenceEnd.reduce(function (previousValue, currentValue) {
        return previousValue + currentValue.source;
    }, ""));

    /**
     * Locate the end of the first sentence in `html`.
     *
     * @param {string} html Text (possibly containing markup) to scan.
     * @returns {number} Index of the last character of the first rxIndex
     *     match (the final punctuation mark), or -1 when nothing matches.
     */
    function indexSentenceEnd(html) {
        // rxIndex has no "g" flag, so exec() always scans from the start.
        var found = rxIndex.exec(html);

        if (found === null) {
            return -1;
        }

        return found.index + found[0].length - 1;
    }

    /**
     * Append `string`, wrapped in a <span>, to `array`.
     *
     * Only when `className` is exactly "sentence" is it suffixed with 0 or 1
     * (alternating via the shared `color` counter) and followed by the
     * optional extra class; the counter is then advanced. For any other
     * class name `classNameOpt` is ignored and the counter is untouched.
     *
     * @param {string[]} array Output list of HTML fragments (mutated).
     * @param {string} className Base class for the span.
     * @param {string} string Content placed inside the span.
     * @param {string} [classNameOpt] Extra class, used for "sentence" only.
     */
    function pushSpan(array, className, string, classNameOpt) {
        var classes = className;

        if (className === "sentence") {
            classes = className + (color % 2);
            if (classNameOpt) {
                classes = classes + " " + classNameOpt;
            }

            color += 1;
        }

        array.push('<span class="' + classes + '">' + string + '</span>');
    }

    /**
     * If `html` starts with a <sup>...</sup> element, move that element
     * inside the previously emitted span so a trailing superscript stays
     * attached to the sentence before it.
     *
     * The input is returned unchanged (and `array` is left untouched) when
     * `html` does not start with <sup>, when the <sup> is never closed, or
     * when no span has been emitted yet. The caller's other handlers then
     * deal with the markup.
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Spans emitted so far (last entry may be mutated).
     * @returns {string} `html` with the leading <sup> element consumed, or
     *     `html` itself when nothing was moved.
     */
    function addSupToPrevious(html, array) {
        var spanCloseLength = "</span>".length,
            supCloseLength = "</sup>".length,
            end,
            last;

        if (html.search(rxSupStart) === -1) {
            return html;
        }

        end = html.search(rxSupEnd);
        // No closing tag, or no previous span to attach to: do not consume.
        if (end === -1 || array.length === 0) {
            return html;
        }

        end += supCloseLength;
        last = array.pop();
        // Every entry ends with "</span>"; splice the <sup> in just before it.
        array.push(last.slice(0, -spanCloseLength) + html.slice(0, end) + last.slice(-spanCloseLength));

        return html.slice(end);
    }

    /**
     * Peel whatever rxWhitespace matches off the front of `html` and emit it
     * as a "whitespace" span.
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Output list of spans (mutated on a match).
     * @returns {string} `html` without the matched prefix, or `html`
     *     unchanged when it does not start with whitespace.
     */
    function leadingWhitespaces(html, array) {
        var found = html.match(rxWhitespace),
            consumed;

        if (found === null) {
            return html;
        }

        consumed = found[0].length;
        pushSpan(array, "whitespace", html.slice(0, consumed));

        return html.slice(consumed);
    }

    /**
     * Emit all of `html` as a single sentence when it contains no sentence
     * end at all, or when its first sentence end is its very last character.
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Output list of spans (mutated on a match).
     * @returns {string} "" when the text was consumed, otherwise `html`.
     */
    function paragraphIsSentence(html, array) {
        var index = indexSentenceEnd(html);

        // indexSentenceEnd() returns the index OF the final punctuation
        // character, so "ends at the end of the text" is length - 1.
        if (index === -1 || index === html.length - 1) {
            pushSpan(array, "sentence", html, "paragraphIsSentence");
            return "";
        }

        return html;
    }

    /**
     * When `html` contains nothing that rxOpen recognises as an opening tag,
     * emit its first sentence (up to and including the sentence end).
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Output list of spans (mutated on a match).
     * @returns {string} Text after the emitted sentence, or `html` unchanged
     *     when markup is present.
     */
    function paragraphNoMarkup(html, array) {
        var cut;

        if (html.search(rxOpen) !== -1) {
            return html;
        }

        cut = indexSentenceEnd(html);
        if (cut === -1) {
            cut = html.length;
        }

        // Step past the punctuation character so it stays in the sentence.
        cut += 1;
        pushSpan(array, "sentence", html.slice(0, cut), "paragraphNoMarkup");

        return html.slice(cut);
    }

    /**
     * When `html` contains markup and the first sentence end lies before the
     * first opening tag or after the first closing tag, emit the text up to
     * and including that sentence end.
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Output list of spans (mutated on a match).
     * @returns {string} Text after the emitted sentence, or `html` unchanged
     *     when there is no markup or the sentence end falls between the
     *     first opening and first closing tag.
     */
    function sentenceUncontained(html, array) {
        var openAt = html.search(rxOpen),
            closeAt,
            endAt;

        if (openAt === -1) {
            return html;
        }

        endAt = indexSentenceEnd(html);
        if (endAt === -1) {
            endAt = html.length;
        }

        closeAt = html.search(rxClose);
        if (endAt >= openAt && endAt <= closeAt) {
            // Sentence end sits between the tags: not this handler's case.
            return html;
        }

        endAt += 1;
        pushSpan(array, "sentence", html.slice(0, endAt), "sentenceUncontained");

        return html.slice(endAt);
    }

    /**
     * When the first sentence end lies strictly between the first opening
     * tag and the first closing tag, emit everything up to and including
     * that closing tag as one sentence, so the element is not cut in half.
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Output list of spans (mutated on a match).
     * @returns {string} Text after the closing tag, or `html` unchanged when
     *     the condition does not hold.
     */
    function sentenceContained(html, array) {
        var openAt = html.search(rxOpen),
            closeAt,
            endAt,
            afterClose;

        if (openAt === -1) {
            return html;
        }

        endAt = indexSentenceEnd(html);
        if (endAt === -1) {
            endAt = html.length;
        }

        closeAt = html.search(rxClose);
        if (endAt <= openAt || endAt >= closeAt) {
            return html;
        }

        // closeAt is a real match here, so match() cannot return null.
        afterClose = closeAt + html.match(rxClose)[0].length;
        pushSpan(array, "sentence", html.slice(0, afterClose), "sentenceContained");

        return html.slice(afterClose);
    }

    /**
     * Last-resort handler: emit all remaining `html` as one span and consume
     * it entirely.
     *
     * @param {string} html Remaining paragraph HTML.
     * @param {string[]} array Output list of spans (mutated).
     * @returns {string} Always the empty string.
     */
    function anythingElse(html, array) {
        var nothingLeft = "";

        pushSpan(array, "sentence2", html, "anythingElse");

        return nothingLeft;
    }

    /**
     * Split the content of every <p> element in the document into guessed
     * sentences and rewrite each paragraph's innerHTML with the wrapped
     * result.
     *
     * On each pass the handlers are tried in order until one of them
     * consumes part of the remaining HTML (detected by a change in length).
     * A pass budget guards against a handler that never makes progress; if
     * the budget runs out, the unprocessed remainder is appended as-is so
     * that no paragraph content is lost.
     */
    function guessSenetences() {
        var paragraphs = document.getElementsByTagName("p"),
            // Order matters: most specific handlers first, catch-all last.
            handlers = [
                addSupToPrevious,
                leadingWhitespaces,
                paragraphIsSentence,
                paragraphNoMarkup,
                sentenceUncontained,
                sentenceContained,
                anythingElse
            ];

        Array.prototype.forEach.call(paragraphs, function (paragraph) {
            var html = paragraph.innerHTML,
                array = [],
                safety = 100,
                length,
                i;

            while (html.length && safety) {
                length = html.length;
                // Stop at the first handler that consumed something.
                for (i = 0; i < handlers.length && html.length === length; i += 1) {
                    html = handlers[i](html, array);
                }

                safety -= 1;
            }

            // Budget exhausted with input left over: keep it, unwrapped.
            if (html.length) {
                array.push(html);
            }

            paragraph.innerHTML = array.join("");
        });
    }

    guessSenetences();
}());

jsfiddle

答案 1 :(得分:-1)

如果要保持标签不变,则需要使用 .html() 而不是 .text()。请检查下面的代码,如果不起作用请告诉我。DEMO

// Wrap every sentence of each paragraph in <span class="sentence">.
// The paragraph is read and written back through .html().
$('p').each(function () {
    var $paragraph = $(this),
        sentencePattern = /(((?![.!?]['"]?\s).)*[.!?]['"]?)(\s|$)/g,
        wrapped = $paragraph
            .html()
            .replace(sentencePattern, '<span class="sentence">$1</span>$3');

    $paragraph.html(wrapped);
});