Question

我有一些像这样的转义HTML：

&lt;img border=&#039;0&#039; /&gt;

我想匹配并替换完整的转义序列（例如&#039;），但不匹配转义序列的一部分（例如39），因为39实际上并不存在于未转义的字符串中。基本上，每个转义序列都应该被视为一个不可分割的标记（token）。

这是一个JS正则表达式。有没有办法排除&和;之间的匹配，同时仍然接受包含这两个字符的序列？

期望的结果：

在&lt;img border=&#039;0&#039; /&gt;中搜索lt：不匹配。
在&lt;img border=&#039;0&#039; /&gt;中搜索39：不匹配。
在&lt;img border=&#039;0&#039; /&gt;中搜索&#039;：匹配。
在&lt;img border=&#039;0&#039; /&gt;中搜索border=&#039;：匹配。

当前代码：

> var str = '&lt;img border=&#039;0&#039; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#039;0&#039; /&gt;'  // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#0|39|;0&#0|39|; /&gt;'  // not ok

注意：我不能先反转义（unescape）、匹配之后再重新转义。匹配必须直接在转义后的字符串上进行。

Answer 1

OP希望JavaScript正则表达式匹配并替换转义HTML中的字符串，同时将转义序列（例如<，'或）视为单个字符，而不是unescape替换过程中的HTML字符串。

这意味着替换

（每一行依次为：搜索字符串、替换字符串、目标字符串以及期望结果。）

1. "lt" "[lt]" "&lt; lt"会导致"&lt; [lt]"（避免在实体内匹配）
2. "&lt;" "[&lt;]" "&lt; lt"会导致"[&lt;] lt"（匹配实体）
3. "&l" "[&l]" "&lt; &lt"会导致"&lt; [&l]t"（不匹配部分实体）
4. "t;" "[t;]" "&lt; lt;"会导致"&lt; l[t;]"（不匹配部分实体）
5. "&lt; l" "[&lt; l]" "&lt; lt"会导致"[&lt; l]t"（匹配包含实体）
6. "lt; &l" "[lt; &l]" "&lt; &lt"会导致"&lt; &lt"（不匹配部分实体）
7. "t; &lt;" "[t; &lt;]" "lt; &lt;"会导致"l[t; &lt;]"（匹配包含实体）
8. "t; &lt" "[t; &lt]" "lt; &lt;"会导致"lt; &lt;"（不匹配部分实体）

使用以下正则表达式捕获转义序列（例如&lt;或&#039;），

/&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi

我们可以使用以下函数作为处理上述大多数情况的起点（＃1，＃2，＃4，＃5和＃7）：

/**
 * Replaces occurrences of `searchFor` in the escaped HTML `str` with
 * `replacement`, while complete entities are consumed as a unit and
 * written back unchanged.
 *
 * @param {string} searchFor   Literal text to look for (escaped by `prepare`).
 * @param {string} replacement Text inserted verbatim for each hit.
 * @param {string} str         The escaped HTML to search.
 * @returns {string} The rewritten string.
 */
function searchAndReplace(searchFor, replacement, str) {
  // Second alternative: a complete named, hex or decimal entity. It is only
  // tried where the search text does not match at the same position.
  var entityAlternative = "|(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)";
  var matcher = new RegExp(prepare(searchFor) + entityAlternative, "gi");

  // A captured entity is returned as-is; anything else was a search hit.
  function keepEntityOrReplace(whole, entity) {
    if (entity) {
      return entity;
    }
    return replacement;
  }

  return str.replace(matcher, keepEntityOrReplace);
}

/**
 * Turns a literal search string into regex source by putting a backslash in
 * front of every character that is neither a word character nor whitespace.
 */
function prepare(str) {
  var nonWordNonSpace = /[^\w\s]/g;
  // "$&" re-inserts the matched character after the backslash [1]
  var escaped = str.replace(nonWordNonSpace, "\\$&");
  return escaped;
}

// [1] from http://eloquentjavascript.net/09_regexp.html#h_Rhu25fogrG

其余案例（＃3，＃6，＃8）涉及搜索字符串末尾的潜在部分转义序列。

对此的解决方案是在最后检查searchFor字符串中潜在的部分转义序列，并附加相应的否定前瞻(?!)以防止匹配有效的转义序列。完整的解决方案（通过一组约40个测试用例）如下所示，并且应该比.exec()方法更快，更简单：

＆＃13;

/**
 * Replaces each occurrence of `searchFor` in the escaped HTML `str` with
 * `replacement`; complete entities that are not themselves a hit are
 * consumed whole and left untouched.
 *
 * @param {string} searchFor   Literal search text; `prepare` escapes it and
 *                             may append a lookahead.
 * @param {string} replacement Text inserted verbatim for each hit.
 * @param {string} str         The escaped HTML to search.
 * @returns {string} The rewritten string.
 */
function searchAndReplace(searchFor, replacement, str) {
  // Search text first, then "any complete entity" as the capturing fallback.
  var source = prepare(searchFor) + "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)";
  var tokens = new RegExp(source, "gi");

  return str.replace(tokens, function(token, entity) {
    // `entity` is only set when the fallback alternative matched.
    return entity ? entity : replacement;
  });
}

/**
 * Builds the regex source for a literal search string.
 *
 * Every character that is neither a word character nor whitespace is
 * backslash-escaped. If the search string ends in something that could be
 * the beginning of an entity ("&", "&name", "&#", "&#123", "&#x", "&#x7f"),
 * a negative lookahead is appended so the match is rejected when the text
 * that follows would complete that entity.
 *
 * @param {string} str Literal text to search for.
 * @returns {string} Regex source: escaped text plus optional lookahead.
 */
function prepare(str) {
  var add = "";
  if (/&$/.test(str)) {
    // Ends with a bare "&": reject if any kind of entity body follows.
    add = "(?!#x[a-z\\d]+;|#\\d+;|[a-z]+;)";
  } else if (/&[a-z]+$/i.test(str)) {
    // Ends inside a named entity.
    add = "(?![a-z]*;)";
  } else if (/&#$/.test(str)) {
    // Ends right after "&#": a hex or decimal body may follow.
    add = "(?!x[a-f\\d]+;|\\d+;)";
  } else if (/&#\d+$/.test(str)) {
    // Ends inside a decimal reference such as "&#03". Without this branch
    // the search would match the front of "&#039;".
    add = "(?!\\d*;)";
  } else if (/&#x$/i.test(str)) {
    // Ends right after "&#x". The check is case-insensitive because the
    // final regex is built with the "i" flag.
    add = "(?![a-f\\d]+;)";
  } else if (/&#x[a-f\d]+$/i.test(str)) {
    // Ends inside a hex reference.
    add = "(?![a-f\\d]*;)";
  }
  return str.replace(/[^\w\s]/g, "\\$&") + add;
}

// test function

/**
 * Runs searchAndReplace on one case and logs "<searchFor>: Passed", or
 * "<searchFor>: Failed: <expected>,<actual>" when the result differs.
 */
function test(searchFor, replacement, str, expected) {
  var actual = searchAndReplace(searchFor, replacement, str);
  var verdict;
  if (actual === expected) {
    verdict = "Passed";
  } else {
    // Concatenating the array renders it as "expected,actual".
    verdict = "Failed: " + [expected, actual];
  }
  console.log(searchFor + ": " + verdict);
}

// test cases

// Each call is test(searchFor, replacement, str, expected).

// The four cases from the question, run against the question's sample string.
test("lt", "[lt]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img border=&#039;0&#039; /&gt;");
test("39", "[39]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img border=&#039;0&#039; /&gt;");
test("&#039;", "[&#039;]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img border=[&#039;]0[&#039;] /&gt;");
test("border=&#039;", "[border=&#039;]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img [border=&#039;]0&#039; /&gt;");
// Search strings that end in "&" or "&#": the expected output equals the input.
test("39&", "[39&]", "39&lt;img border=39&#039;&gt&gt&&#039 t; 0&#039;&39; /&gt;", "39&lt;img border=39&#039;&gt&gt&&#039 t; 0&#039;&39; /&gt;")
test("0&#", "[0&#]", "39&lt;img border=39&#039;&gt&gt&&#039 t; 0&#039;&39; /&gt;", "39&lt;img border=39&#039;&gt&gt&&#039 t; 0&#039;&39; /&gt;")
// Named entity: the target mixes one complete "&lt;" with unterminated "&lt" / "&l" fragments.
test("lt", "[]", "&lt&lt;t;t&l", "&[]&lt;t;t&l");
test("&lt;", "[]", "&lt&lt;t;t&l", "&lt[]t;t&l");
test("&l", "[]", "&lt&lt;t;t&l", "[]t&lt;t;t[]");
test("t;", "[]", "&lt&lt;t;t&l", "&lt&lt;[]t&l");
test("t&", "[]", "&lt&lt;t;t&l", "&lt&lt;t;[]l");
test("&lt;t", "[]", "&lt&lt;t;t&l", "&lt[];t&l");
test("t&lt;", "[]", "&lt&lt;t;t&l", "&l[]t;t&l");
test("t;t", "[]", "&lt&lt;t;t&l", "&lt&lt;[]&l");
test("t&l", "[]", "&lt&lt;t;t&l", "&lt&lt;t;[]");
// Decimal reference: the same layout with "&#039;".
test("39", "[]", "&#039&#039;9;9&#", "&#0[]&#039;9;9&#");
test("&#039;", "[]", "&#039&#039;9;9&#", "&#039[]9;9&#");
test("&", "[]", "&#039&#039;9;9&#", "[]#039&#039;9;9[]#");
test("&#", "[]", "&#039&#039;9;9&#", "[]039&#039;9;9[]");
test("9;", "[]", "&#039&#039;9;9&#", "&#039&#039;[]9&#");
test("9&", "[]", "&#039&#039;9;9&#", "&#039&#039;9;[]#");
test("&#039;9", "[]", "&#039&#039;9;9&#", "&#039[];9&#");
test("9&#039;", "[]", "&#039&#039;9;9&#", "&#03[]9;9&#");
test("9;9", "[]", "&#039&#039;9;9&#", "&#039&#039;[]&#");
test("9&#", "[]", "&#039&#039;9;9&#", "&#039&#039;9;[]");
// Hexadecimal reference: the same layout with "&#x7f;".
test("x7", "[]", "&#x7f&#x7f;f;f&#x", "&#[]f&#x7f;f;f&#x");
test("&#x7f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f[]f;f&#x");
test("&", "[]", "&#x7f&#x7f;f;f&#x", "[]#x7f&#x7f;f;f[]#x");
test("&#", "[]", "&#x7f&#x7f;f;f&#x", "[]x7f&#x7f;f;f[]x");
test("&#x", "[]", "&#x7f&#x7f;f;f&#x", "[]7f&#x7f;f;f[]");
test("&#x7", "[]", "&#x7f&#x7f;f;f&#x", "[]f&#x7f;f;f&#x");
test("f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;[]f&#x");
test("f&", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]#x");
test("&#x7f;f", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f[];f&#x");
test("f&#x7f;", "[]", "&#x7f&#x7f;f;f&#x", "&#x7[]f;f&#x");
test("f;f", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;[]&#x");
test("f&#", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]x");
test("f&#x", "[]", "&#x7f&#x7f;f;f&#x", "&#x7f&#x7f;f;[]");
// A long search string that spans a complete entity and ends in the partial "&l".
test("t; &lt; lt &l", "[]", "&lt; &lt; lt &lt;lt; &lt; lt &lt", "&lt; &lt; lt &lt;l[]t");

＆＃13;

Answer 2

此处的一个选项是在执行实际替换之前，在转义字符序列中出现的任何位置临时替换正在搜索的字符串。 “虚拟”字符串将需要不太可能出现在HTML中的任何位置。在执行了实际替换之后，可以进行进一步的替换以将“虚拟”字符串更改回正在搜索的字符串。

以下是此方法的演示，它会生成所要求的结果。它用到了两个实用技巧：一个是不借助正则表达式完成全局替换（split/join），另一个是把任意字符串转换成可在正则表达式中按字面使用的形式（对所有特殊字符进行适当转义）。

// Demo: run replaceInHtml on the escaped sample string, once for each of
// the four search strings; each call logs its own report.
var html = "&lt;img border=&#039;0&#039; /&gt;"
replaceInHtml(html, 'lt', 'replacement');
replaceInHtml(html, '39', 'replacement');
replaceInHtml(html, '&#039;', 'replacement');
replaceInHtml(html, 'border=&#039;', 'replacement');

/**
 * Replaces `str` with `replacement` in the escaped HTML `html`, skipping
 * occurrences that sit inside an `&...;` sequence, and logs a report.
 *
 * Works in three passes: (1) an occurrence of `str` found between "&" and
 * ";" is swapped for a placeholder, (2) every remaining occurrence is
 * replaced, (3) the placeholders are turned back into `str`.
 *
 * @param {string} html        Escaped HTML to search.
 * @param {string} str         Literal text to look for.
 * @param {string} replacement Text to insert for each hit.
 * @returns {string} The rewritten string. Earlier the result was only
 *                   logged and the function returned nothing.
 */
function replaceInHtml(html, str, replacement) {
  // A unique string that is unlikely to appear in the HTML
  var dummyStr = '!*&$^£"^';

  var strInRegex = escapeRegExp(str);
  // "&", optional entity characters, the search text, optional entity
  // characters, then the closing ";".
  var dummyRegex = new RegExp('(&[#a-zA-Z0-9]*)'
      + strInRegex + '([#a-zA-Z0-9]*;)', 'g');

  var replaced = html.replace(dummyRegex, '$1' + dummyStr + '$2');
  // split/join performs a global literal replacement without a regex.
  replaced = replaced.split(str).join(replacement);
  replaced = replaced.split(dummyStr).join(str);
  console.log('Source:  ' + html
          + '\nReplace: ' + str
          + '\nWith:    ' + replacement
          + '\nGives:   ' + replaced);
  return replaced;
}

/**
 * Returns `str` with a backslash in front of each regex metacharacter
 * listed in the character class below, so the text can be embedded in a
 * RegExp source and matched literally.
 */
function escapeRegExp(str) {
  var metaChars = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
  // "$&" stands for the character that was matched.
  var escaped = str.replace(metaChars, "\\$&");
  return escaped;
}

Answer 3

我首先匹配&和;之间的所有内容：

＆＃13;

// First step: find the search text only where it sits between "&" and ";".
let str = "39&lt;img border=39&#039;0&#039;&39; /&gt;39";
let search = '39';
// "&", lazily any characters other than "&" and ";", the search text,
// lazily more such characters, then ";".
let regexp = new RegExp('&[^&;]*?(' + search + ')[^&;]*?;', 'g'); // /&[^&;]*?(SEARCH)[^&;]*?;/g
// With the "g" flag, match() returns the full matched substrings, or null.
let match = str.match(regexp);

console.log(match);

＆＃13;

然后反过来，我想匹配所有“不”在这两个字符之间的内容：

＆＃13;

/**
 * Builds the global, multiline regex that finds `searchStr` outside of
 * "&...;" sequences.
 *
 * Shape: /(?:^&[^&;]*?)?(SEARCH)(?!(?:[^&;]*?;))|(?:(?:^|;)(?:[^&;]*?)(SEARCH))/gm
 *  - group 1: the search text, optionally preceded by a line-initial "&..."
 *    run, and not followed by characters leading up to a ";";
 *  - group 2: the search text reached from a line start or a ";" through
 *    characters other than "&" and ";".
 *
 * The search text is matched literally: regex metacharacters in it are
 * escaped, so "50%." cannot match "50%x" and "(" no longer throws.
 *
 * @param {string} searchStr Literal text to search for.
 * @returns {RegExp} A fresh regex (it is stateful because of the "g" flag).
 */
const prepareRegexp = searchStr => {
  const literal = String(searchStr).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  return new RegExp('(?:^&[^&;]*?)?(' + literal + ')(?!(?:[^&;]*?;))|(?:(?:^|;)(?:[^&;]*?)(' + literal + '))', 'gm');
};

/**
 * Collects every occurrence of `searchStr` that the regex from
 * prepareRegexp() accepts in `str`.
 *
 * @param {string} str       Escaped HTML to search.
 * @param {string} searchStr Text to look for.
 * @returns {string[]|null} The found strings, or null when there are none.
 */
let find = (str, searchStr) => {
  const regexp = prepareRegexp(searchStr);
  const allFoundItems = [];
  let foundItemsArray;

  while ((foundItemsArray = regexp.exec(str)) !== null) {
    // foundItemsArray is laid out as:
    //   [0] - full match (may include text in front of the search string)
    //   [1] - capture of the first alternative
    //   [2] - capture of the second alternative
    // Exactly one of the two captures is set. Pick it directly instead of
    // comparing [0] with searchStr: the first alternative can also consume
    // a leading "&..." prefix, in which case [0] !== searchStr while [2] is
    // undefined, and undefined used to be pushed.
    // To get the position of a hit use regexp.lastIndex minus its length.
    const hit = foundItemsArray[1] !== undefined
      ? foundItemsArray[1]
      : foundItemsArray[2];
    allFoundItems.push(hit);
  }

  return allFoundItems.length ? allFoundItems : null;
};

//Function 'find' refactored to avoid loop:
/**
 * Loop-free variant of find: String.prototype.replace drives the global
 * regex and the callback records the captured search string of each match.
 * The string produced by replace() is discarded.
 *
 * The regex is compiled once; the previous version built it twice and left
 * the first instance unused.
 *
 * @param {string} str       Escaped HTML to search.
 * @param {string} searchStr Text to look for.
 * @returns {string[]} The found strings (empty array when there are none).
 */
find = (str, searchStr) => {
  const allFoundItems = [];

  str.replace(prepareRegexp(searchStr), (match, p1, p2) => {
    // p1 belongs to the first alternative, p2 to the second.
    allFoundItems.push(p1 ? p1 : p2);
  });

  return allFoundItems;
};

//And function to replace the searched string:
/**
 * Replaces every occurrence of `searchStr` that the regex from
 * prepareRegexp() accepts in `str` with `replaceWith`.
 *
 * A full match can start with text that is not the search string (a
 * line-initial "&..." run in the first alternative, or ";" plus following
 * characters in the second). In both alternatives the captured search
 * string is the tail of the match, so only that tail is swapped:
 *  - the leading text is kept (the first alternative used to drop it);
 *  - the captured occurrence is replaced, not the first occurrence of
 *    `searchStr` inside the match;
 *  - `replaceWith` is inserted verbatim in both cases ("$&" etc. are not
 *    interpreted).
 *
 * @param {string} str         Escaped HTML to search.
 * @param {string} searchStr   Text to look for.
 * @param {string} replaceWith Text to insert for each hit.
 * @returns {string} The rewritten string.
 */
const replace = (str, searchStr, replaceWith) =>
  str.replace(prepareRegexp(searchStr), (match, p1, p2) => {
    const found = p1 !== undefined ? p1 : p2;
    return match.slice(0, match.length - found.length) + replaceWith;
  });

// Demo input: "39" appears as plain text, inside "&#039;", and inside the
// malformed "&39;"; "50%;" adds a semicolon that does not end an entity.
let str = "39&lt;img border=39&#039;0&#039;&39; width: 50%; /&gt;39";
//console.log('Searching "39":', find(str, '39'));
console.log('Searching "&#039;":', find(str, '&#039;'));
//Search &lt;img border=&#039;0&#039; width: 50%; /&gt; for 50:
console.log('Searching "50":', find(str, '50'));

// Same input, this time replacing instead of collecting.
console.log('Replacing "39" with "|39|":', replace(str, '39', '|39|'));
console.log('Replacing "50" with "|50|":', replace(str, '50', '|50|'));

//Now test the string against given examples:
// (the sample assigned here stops after the second &#039; and has no " /&gt;")
str = '&lt;img border=&#039;0&#039;';
//Search &lt;img border=&#039;0&#039; /&gt; for lt: No match.
console.log('Searching "lt":', find(str, 'lt'));
//Search &lt;img border=&#039;0&#039; /&gt; for 39: No match.
console.log('Searching "39":', find(str, '39'));
//Search &lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
console.log('Searching "&#039;":', find(str, '&#039;'));
console.log('Replacing "&#039;" with "|&#039;|":', replace(str, '&#039;', '|&#039;|'));
//Search &lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.
console.log('Searching "border=&#039;":', find(str, 'border=&#039;'));
console.log('Replacing "border=&#039;" with "|border=&#039;|":', replace(str, 'border=&#039;', '|border=&#039;|'));

＆＃13;

/* Removes the height cap on elements with class "as-console-wrapper";
   !important makes this declaration win over competing max-height rules. */
.as-console-wrapper {
  max-height: 100% !important;
}

＆＃13;

正则表达式的分解说明： https://regex101.com/r/UCNnu1/2

//编辑：

然而，如果搜索字符串后跟;，则不匹配搜索字符串，因此为了捕获此类字符串，我们需要扩展我们的正则表达式以匹配另一组字符并使用regexp.exec只捕捉有趣的部分。扩展的正则表达式是：

https://regex101.com/r/UCNnu1/3

我更新了代码以使用regexp进行替换。

Answer 4

RegEx应该用于检查~~但它不能涵盖所有可能的实体~~，并且不是这项工作的最佳工具。虽然以下方法适用于所有HTML实体。

我正在尝试匹配和替换完整的转义序列，例如'但不是部分转义序列，例如39，因为39实际上不在非转义字符串中。

基本上，您希望用不受限制的形式替换HTML实体。这就是下面的功能，你不需要RegEx。

我将使用来自this answer

的Web_Designer的unescapeHTML功能

/**
 * Decodes the HTML entities in `html` by assigning it as the innerHTML of a
 * detached <textarea> element and reading the element's textContent back.
 *
 * The scratch element is created on first use and cached on the function
 * object. It used to live in a global variable named `escape`, which
 * replaced the built-in global escape() function.
 *
 * @param {string} html Text that may contain HTML entities.
 * @returns {string} The decoded text.
 */
function unescapeHTML(html) {
    if (!unescapeHTML.scratch) {
        unescapeHTML.scratch = document.createElement('textarea');
    }
    unescapeHTML.scratch.innerHTML = html;
    return unescapeHTML.scratch.textContent;
}

首先创建一个新的<textarea>元素。在函数内部，作为参数传递的字符串然后被指定为此textarea的innerHTML，然后返回它的textContent。这是用于unescape HTML实体的技巧。

我们可以重用它来确定字符串是否是有效的HTML实体。如果函数能够取消它，那么它是有效的HTML实体，否则它不是。这是你想要确定的。

/**
 * Decodes the HTML entities in `html` by assigning it as the innerHTML of a
 * detached <textarea> element and reading the element's textContent back.
 *
 * The scratch element is created on first use and cached on the function
 * object. It used to live in a global variable named `escape`, which
 * replaced the built-in global escape() function.
 *
 * @param {string} html Text that may contain HTML entities.
 * @returns {string} The decoded text.
 */
function unescapeHTML(html) {
  if (!unescapeHTML.scratch) {
    unescapeHTML.scratch = document.createElement('textarea');
  }
  unescapeHTML.scratch.innerHTML = html;
  return unescapeHTML.scratch.textContent;
}

// The question's sample string (declared here but not used by the checks below).
var str = '&lt;img border=&#039;0&#039; /&gt;';

// Each line logs true when unescapeHTML() returns something different from
// its input, i.e. when the argument contained something that was decoded.
console.log(unescapeHTML('lt') !== 'lt');
console.log(unescapeHTML('39') !== '39');
console.log(unescapeHTML('&#039;') !== '&#039;');
console.log(unescapeHTML('border=&#039;') !== 'border=&#039;');

Answer 5

有没有办法排除&和;之间的匹配，同时仍然接受包含这两个字符的序列？基本上，每个转义序列应该被视为一个令牌。

为了将实体视为单独的标记，我们可以构建一个在任何目标子字符串之前捕获实体的正则表达式，然后使用回调函数将未修改的捕获实体返回给字符串。

示例，当"39"不在实体内时替换str.replace( /(&[a-z]+;|&#[0-9a-f]+;)|39/gi, function(m, entity){ return entity || replacement; } );：

&#039;

我正在尝试匹配并替换完整的转义序列，例如39但不是部分转义序列，例如'

更换实体时，例如function searchAndReplace(str, searchFor, replacement){ return /^&([a-z]+|#[\da-f]+);/i.test(searchFor) ? // if searchFor equals or starts with an entity str.split(searchFor).join(replacement) : // else str.replace( new RegExp( '(&[a-z]+;|&#[0-9a-f]+;)|' + searchFor.replace(/[^\w\s]/g, "\\$&"), //escape metachars 'gi' ), function(m, entity){ return entity || replacement; } ); } // test cases console.log('Search for "border": \n' + searchAndReplace( '<img border='0' />', 'border', '{{border}}' ) + '\nmatch'); //matches console.log('Search for "0": \n' + searchAndReplace( '<img border='0' />', '0', '' ) + '\nmatch'); //matches outside entities console.log('Search for "'": \n' + searchAndReplace( '<img border='0' />', ''', '{{'}}' ) + '\nmatch'); //matches console.log('Search for "39": \n' + searchAndReplace( '<img border='0' />', '39', '{{39}}' ) + '\nno match'); //does not match console.log('Search for "lt": \n' + searchAndReplace( '<img border='0' />', 'lt', '{{lt}}' ) + '\nno match'); //does not match console.log('Search for "<": \n' + searchAndReplace( '<img border='0' />', '<', '{{<}}' ) + '\nmatch'); //matches console.log('Search for "border='": \n' + searchAndReplace( '<img border='0' />', 'border='', '{{border='}}' ) + '\nmatch'); //matches console.log('Search for "<img": \n' + searchAndReplace( '<img border='0' />', '<img', '{{<img}}' ) + '\nmatch'); //matches，需要采用不同的方法。以下工作演示处理此问题，并从提供的搜索字符串动态构建正则表达式，处理所有OP测试用例：

<reference path="./../node_modules/@types/mongodb/index.d.ts" />

Answer 6

我认为您指的是非捕获群体：http://www.regular-expressions.info/brackets.html，在几个堆叠溢出帖子（Why regular expression's "non-capturing" group is not working和Regular expression, "just group, don't capture", doesn't seem to work）中轻易解决。

也就是说，非捕获组不会获得自己的组编号（例如 /a(?:[XZ])([a-c])/g 会匹配 "aZb"，但 \1 等于 "b"，而不是 "Z"）。

Answer 7

这是你想要做的吗？

＆＃13;

// Sample: an escaped tag fragment with an attribute whose quotes are "&#039;".
var str = "&lt;img border-color=&#039;0&#039;"
console.log(str)
// The pattern matches "&#", lazily any characters, then ";", optionally
// preceded by an attribute name (letters and "-") followed by "=".
console.log(str.match(/((?:[a-z-]+=)?&#.+?;)/gi))
// Wrap every such match in "|" characters.
console.log(str.replace(/((?:[a-z-]+=)?&#.+?;)/gi, "|$1|"))

＆＃13;

Answer 8

我认为如果我们能够使用回顾是可能的。鉴于正则表达式的味道是JavaScript，在这里，我认为我们不能。这非常接近： [^&;]*(string)[^&;]*(?!9;|t;|;)

Answer 9

最终版本候选人

4/29

此版本应在搜索字符串的末尾处理部分实体其中partial有预实体字符，如xxx&yyy或a&#00等。

这是@TomasLangkaas发现的最后一个案例鉴于涵盖了所有其他案例，这是最终候选人为@athancahill或任何其他感兴趣的人。

（参见评论和以前的版本）

模型已从 String.Replace（）更改为而（match = Rx.exec（））

在此解释，但请参阅JS代码以实现它仍然使用搜索字符串作为第一个交替
以实体为第二。

    (?=
         # This is the optional entity captured at
         # the same position where the search string starts.
         # If this entity matches, it means the search string
         # matches. Either one may be a partial of the other.

         # (1) The container for pre-entity / entity
         (                             
              # (2) Pre-entity characters 
              ( sLongest )                

              # Entity   
              (?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*);
         )?                            
    )

    # (3) The search string ( consumes )
    ( sToFind )                        
 | 

    # (4) Or, the entity last  ( consumes ) 
    ( (?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*); )

请注意，您不能将实体语法分解为正则表达式的一部分它必须完全匹配，作为一个独特的项目（沿着这条路走了一百次，无法做到。）。

请注意，这是一个单程，纯正则表达式解决方案，速度非常快如果你拿出所有的评论，它实际上只有几行代码您可以修改实体子表达式并使用您想要的任何内容代码结构不需要改变。

//=========================================================
// http://jsfiddle.net/b4b28a38/95/
//=========================================================

// ------------------------
// These are only used when pre-entity partials are detected.
//
// RxEntPartial: an unfinished entity at the very end of a string
//               ("&", "&name", "&#", "&#12", "&#x", "&#x1f", "%", "%name").
// RxEntFull:    a complete entity ("&name;", "&#12;", "&#x1f;", "%name;").
//
// The name part is [a-z\d_:.-]. Inside a string literal the digit class has
// to be written "\\d"; a single backslash is dropped by the string parser,
// which turned the class into [a-zd_:.-] and stopped names containing
// digits (e.g. "&frac12;") from being recognised.
// Both regexes carry the "g" flag, so callers must set lastIndex before use.
var RxEntPartial = new RegExp( '(?:&(?:[a-z_:][a-z\\d_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]*)?))?|%(?:[a-z_:][a-z\\d_:.-]*)?)$', 'ig' );
var RxEntFull = new RegExp( '(?:&(?:[a-z_:][a-z\\d_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-z\\d_:.-]*);', 'ig' );
// ------------------------

/**
 * Builds the single-pass regex used by GetReplace.
 *
 * Layout of the result (flags "ig"):
 *
 *     (?= ( ( .{0,N}? ) ENTITY )? )   groups 1 and 2, look-ahead only
 *     ( term1 | term2 | ... )         group 3, a search hit
 *   |
 *     ( ENTITY )                      group 4, a complete entity
 *
 * N is the length of the longest search term minus one.
 *
 * @param {string[]} FindAry Literal search terms. The array is only read;
 *        the previous version overwrote its elements with their escaped form.
 * @returns {RegExp} A new regex object.
 */
function MakeRegex( FindAry ) {
   // Entity sub-expression. The digit class must be spelled "\\d" inside a
   // string literal, otherwise names containing digits ("&frac12;") are not
   // recognised as entities.
     var sEntity = '(?:&(?:[a-z_:][a-z\\d_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-z\\d_:.-]*);';
   // Find the longest term and escape metachars into a fresh array
     var longest = 0;
     var escaped = [];
     for (var i = 0; i < FindAry.length; i++ )
     {
         if ( FindAry[i].length > longest )
            longest = FindAry[i].length;
         // Backslash every non-word character except whitespace
         escaped.push( FindAry[i].replace(/(?!\s)\W/g, "\\$&") );
     }
   // Make 'longest' sub-expression
     longest -= 1;
     var sLongest = '';
     if ( longest > 0 )
         sLongest = '.{0,' + longest.toString() + '}?';
   // Join array using alternations
     var sToFind = escaped.join('|');
   // Return new regex object
     var rx =  new RegExp( '(?=((' + sLongest + ')' + sEntity + ')?)(' + sToFind + ')|(' + sEntity + ')',
   'ig');
   //console.log( rx);
   return rx;
}


/**
 * Walks `str` with the regex produced by MakeRegex and returns a copy in
 * which every accepted search hit is wrapped in "[" and "]".
 *
 * Complete entities (group 4) are copied through unchanged. A search hit
 * (group 3) is accepted or skipped depending on what the look-ahead
 * captured at the same position (groups 1 and 2). A skipped hit is left in
 * place by advancing Rx.lastIndex without touching sResult/ndxLast, so the
 * skipped text is copied verbatim by a later append.
 *
 * Side effects: resets and advances Rx.lastIndex, and sets lastIndex on the
 * shared RxEntPartial / RxEntFull regexes.
 *
 * @param {string} str Text to process.
 * @param {RegExp} Rx  Global regex with the four-group layout shown below.
 * @returns {string} The rebuilt string.
 */
function GetReplace( str, Rx )
{
   var sResult = '';    // New modified string to return
   var _M;              // Match object
   var ndxLast = 0;     // Previous Rx.lastIndex value when valid match
                        // ( where next match starts )

   // Rx is global and therefore stateful: always start from the beginning.
   Rx.lastIndex = 0;
   
   while ( _M = Rx.exec( str ) )
   {
       // Alternation 1: (1) = container (optional), p2 = pre-entity, entity, p3 = search string
       // Alternation 2: p4 = entity
       // Form:      
       //     (?=
       //          (                    # (1) start container
       //            ( pre-entity )            # (2)
       //            entity
       //          )?                       # (1) end
       //     )
       //     ( search )                 # (3)
       //  |  
       //     ( entity )                 # (4)
       
       if ( _M[4] )
       {
          // Entity, continue unchanged.
          sResult += str.substr( ndxLast , _M.index - ndxLast ) + _M[4];
          ndxLast = Rx.lastIndex;
          continue;
       }
       // Check if entity container captured inside zero length assertion matched 
       if ( _M[1] )
       {
           // Get some lengths 
      
           var L1 = _M[1].length;   // pre-entity characters + entity
           var L2 = _M[2].length;   // pre-entity characters only
           var L3 = _M[3].length;   // the search hit

           if ( L1 == L3 )
           {
              // Ok - This means it matched a trailing full entity
              // Intended, modify the search string
              sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
              ndxLast = Rx.lastIndex;
              continue;
           }

           // Pre entity check  ( pre-entity ) 
           if ( L2 > 0 )  
           {
               // This is a rare case and should not slow anything down.
               // End entity condition to check

               var sMatched = _M[3];
               var mpartial;
               RxEntPartial.lastIndex = 0;

               // Verify the match had a partial entity at the end
               if ( mpartial = RxEntPartial.exec( sMatched ) )
               {
                   // Check partial entity is not at the  beginning                   
                   if ( mpartial.index > 0 )
                   {
                       // Location in target string to check
                       // for a full entity.
                       var loc = _M.index + mpartial.index;

                       // Assure there is no full entity
                       RxEntFull.lastIndex = loc;
                       var mfull;
                       if ( mfull = RxEntFull.exec( str ) )
                       {
                           if ( mfull.index == loc )
                           {
                               // Not valid, move past it
                               RxEntFull.lastIndex = 0;
                               Rx.lastIndex += (L1 - L3);
                               continue;
                           }
                       }
                  }
               }
               // Ok - This definitely passes.
               // Intended, modify the search string
               sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
               ndxLast = Rx.lastIndex;
               continue;
           }

           // Normal checks
           // -------------------------------------------------------

           // If the length of the search >= the entity length
           // then the search includes an entity at the beginning
       

           if ( L3 >= L1 )
           {
              // Intended, modify the search string
              sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
              ndxLast = Rx.lastIndex;
              continue;
           }

          // Uh oh, the search is a partial entity (from the beginning).
          // Since we see that it is part of an entity, we have to go past it.
          // The match position reflects the partial entity.
          // Adjust (advance) the match position by the difference
          // to go past the entity.

          Rx.lastIndex += ( L1 - L3 );
          continue;
       }

       // Here, the search string is pure, just modify it
       sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
       ndxLast = Rx.lastIndex;
   }
   // Append whatever follows the last accepted match.
   sResult += str.substr( ndxLast , str.length - ndxLast );
   return sResult;
}

// Demo: log the target, build one regex for eight search terms, run
// GetReplace once, then log the result.
var TargetStr = "39&lt;img border=39&#039;&gt&gt&&#039 t; xx&gt x&gt; 0&# r&#039;&39; cad&#092; r&#FFd0&#22 /&gtttttt;  39; end";
console.log( 'Target:\r\n' + TargetStr );

// Always put the longest first of/if alphabetically sorted (ie. aaa|aa|a, etc ..)

var rx = MakeRegex( ['39', '&lt;img', 'cad&#092;', '&gt', 't;', 'x&', '0&#', 'r&#'] );

var NewString = GetReplace( TargetStr, rx );

console.log('Find any of:  39, &lt;img, cad&#092;, &gt, t;, x&, 0&#, r&#' );
console.log( NewString );

输出

 Target:
 39&lt;img border=39&#039;&gt&gt&&#039 t; xx&gt x&gt; 0&# r&#039;&39; cad&#092; r&#FFd0&#22 /&gtttttt;  39; end

 Find any of:  39, &lt;img, cad&#092;, &gt, t;, x&, 0&#, r&#

 [39][&lt;img] border=[39]&#039;[&gt][&gt]&&#0[39] [t;] x[x&]gt x&gt; [0&#] r&#039;&[39]; [cad&#092;] [r&#]FFd[0&#]22 /&gtttttt;  [39]; end

Answer 10

更新2017-04-28
添加39&和0&#测试用例 - 无需更改代码 - 哇，这很幸运：）

重要的是要注意，我故意不允许＆符号中的＆符号存在，除了作为转义序列的开始，无论它是否是有效的转义序列，即im允许{{1即使在技术上无效，它也是一个转义序列。这样可以很容易地说，如果我们被要求找到一个带有＆符号的字符串，这个字符串不是完整转义序列的一部分（例如下面的&39; AmpExcape`来执行此操作，而不是终止并返回未更改的字符串，这是一个方便，因为这个正则表达式的其他用例（即外部JavaScript）不允许我条件分支语句（或匹配的函数回调）。为了我的目的，这来到定义我工作反对转义的壁橱HTML。

更新2017-04-26
添加0&#') then this should not be matched, and is an invalid search string. The use of测试用例并通过转义查找字符串来编辑该案例的答案/代码

更新2017-04-25

两遍正则表达式解决方案：

&g

我一直在寻找一种仅使用正则表达式的方法，虽然@sln和@ tomas-langkaas的解决方案很有用（好吧，令人兴奋），我仍然想要一种我可以使用的方法在JS及以后。

我有一些适用于我的测试用例的东西，但它被迫对文本进行多次传递。最初我有三次传球，但我认为我有两次传球。显然不如其他答案有效。

我试图匹配：function do_replace(test_str, find_str, replace_str) { let escaped_find_str = AmpExcape(find_str); escaped_find_str = RegExcape(escaped_find_str); let escaped_replace_str = RegExcape(replace_str); let first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)','gi'); let first_pass = test_str.replace(first_regex,'&&$1'+replace_str+'&&$2'); let second_regex = new RegExp('&&(?:'+escaped_replace_str+'&&|(' + escaped_find_str + ')?('+escaped_replace_str + ')?&&)','gi'); let second_pass = first_pass.replace(second_regex,'$2'); return second_pass; }作为那些答案，并将订单翻转为实体，因为@sln在最新更新中做了。但我使用<target string> | <entity>作为新的＆＃39;逃脱序列。策略是：

最初我们转义查找和替换字符串以供在regex中使用。对于&&，如果<target string>不是complte实体的一部分，那么我们将其替换为双&，以防止它被匹配，即我考虑到在转义HTML中搜索部分实体永远不可能
第一遍用
替换任何匹配（实体或目标字符串）
```
&&
```
在目标字符串的匹配项上，&&<target group value><replacement string>&&<entity group value>将为空，我们将返回<entity group value>转义符，目标字符串，替换字符串和最终{{1}逃跑
对于实体的匹配，&&现在将为空，因此我们最终返回&&后跟实体值。
第二次传递可以查找所有<target group value>并替换为&&<replacement str>&&
同样在第二遍中我们可以查找&&<target string><replacement string>&&并知道它应该替换为空白。
这次我们没有为实体比赛做任何事情，因为我们在第1阶段没有触及它们

带有测试用例的完整代码（来自@ tomas-langkaas维护的RexExcape的属性）：

<replacement string>

和输出

&&<replacement string>&&

更新2017-04-24

我放弃了下面的方法有几个原因，但最重要的是，它只会在没有符号或分号的文本中找到第一次出现的查找字符串。例如，如果匹配是// helper for building regexes from strings // http://stackoverflow.com/a/3561711/6738706 function AmpExcape(str) { return str.replace(/&(\w*;|#[0-9a-fA-F]*;)|(&)/g, '&$2$1'); } function RegExcape(str) { return str.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); }; function do_replace(test_str, find_str, replace_str) { let escaped_find_str = AmpExcape(find_str); escaped_find_str = RegExcape(escaped_find_str); let escaped_replace_str = RegExcape(replace_str); let first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)','gi'); let first_pass = test_str.replace(first_regex,'&&$1'+replace_str+'&&$2'); let second_regex = new RegExp('&&(?:'+escaped_replace_str+'&&|(' + escaped_find_str + ')?('+escaped_replace_str + ')?&&)','gi'); let second_pass = first_pass.replace(second_regex,'$2'); return second_pass; } let str = '39<img style='width: 39;' bor9;wder=39'0'&39; />39;'; let test_list = ['39','39;','9;','9;w','39&','0&#']; run_test(str,test_list); str = '<img border='0' /$gt;'; test_list = ['lt','39',''','border='']; run_test(str,test_list); str = 'test string ring ring'; test_list = ['ring']; run_test(str,test_list); str = '39<img style='width: 39;' border='0'&39; />39;'; test_list = ['border','0',''','39','lt','<','border='','<img','&g','t;']; run_test(str,test_list); function run_test(base_str, find_list) { let orig_str = 'original'; let max_len = find_list.concat(orig_str).reduce(function(a,b) { return a > b.length ? 
a : b.length; },0); console.log(); console.log(pad(orig_str,max_len) + ': ' + str); find_list.map(function(gstr) { console.log( pad( gstr, max_len) + ': ' + do_replace(str, gstr, '|' + gstr + '|')); }); } function pad(str,len) { while ( str.length < len) { str = str + ' ' }; return str; }，则字符串original: 39<img style='width: 39;' bor9;wder=39'0'&39; />39; 39 : |39|<img style='width: |39|;' bor9;wder=|39|'0'&39; />|39|; 39; : 39<img style='width: |39;|' bor9;wder=39'0'&39; />|39;| 9; : 39<img style='width: 3|9;|' bor|9;|wder=39'0'&39; />3|9;| 9;w : 39<img style='width: 39;' bor|9;w|der=39'0'&39; />39; 39& : 39<img style='width: 39;' bor9;wder=39'0'&39; />39; 0&# : 39<img style='width: 39;' bor9;wder=39'0'&39; />39; original : <img border='0' /$gt; lt : <img border='0' /$gt; 39 : <img border='0' /$gt; ' : <img border=|'|0|'| /$gt; border=': <img |border='|0' /$gt; original: test string ring ring ring : test st|ring| |ring| |ring| original : 39<img style='width: 39;' border='0'&39; />39; border : 39<img style='width: 39;' |border|='0'&39; />39; 0 : 39<img style='width: 39;' border='|0|'&39; />39; ' : 39<img style=|'|width: 39;|'| border=|'|0|'|&39; />39; 39 : |39|<img style='width: |39|;' border='0'&39; />|39|; lt : 39<img style='width: 39;' border='0'&39; />39; < : 39|<|img style='width: 39;' border='0'&39; />39; border=': 39<img style='width: 39;' |border='|0'&39; />39; <img : 39|<img| style='width: 39;' border='0'&39; />39; &g : 39<img style='width: 39;' border='0'&39; />39; t; : 39<img style='width: 39;' border='0'&39; />39;将仅匹配为test string ring ring - 这对于查找和替换似乎毫无用处 - 我更新答案以便它可以用于匹配{ {1}}至少第一次，因为我之前错过了线路开始作为允许的终端字符，但我不认为这是所有可能的文本的有效解决方案。

原始回答（有修改）

如果你关心文本中出现的分号而不是相应的test st|ring| ring ring的终结字符，就像内联样式可能会说ring那样，那么你需要一些有点复杂的东西：

ring

下面提供的代码将输出原始测试用例：

它还会显示文本和搜索字词中出现的针对分号的特定输出。

style="width: 39;"

注意这就是方法崩溃的例子：

'((?:^|(?!(?:[^&;]+' + str + ')))(?=(?:(?:&|;|^)[^&;]*))(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?)(' + str + ')'

没有任何背后的东西可以让记忆变得更加容易。效果区分上一个匹配后的零个字符和零个字符，但在转义的序列中。因此，如果不匹配字符串"original": <img border='0' /$gt; "lt": <img border='0' /$gt; "39": <img border='0' /$gt; "'": <img border=|'|0|'| /$gt; "border='": <img |border='|0' /$gt;中的"original": 39<img style='width: 39;' bor;der=39'0'&39; />39; test string that may be followed by semi-colon : |39|<img style='width: |39|;' bor;der=|39|'0'&39; />|39|; test match with semi-colon: 39<img style='width: |39;|' bor;der=39'0'&39; />|39;| test match with semi-colon mid string 39<img style='width: 39;' |bor;der|=39'0'&39; />39; <{1}}，则无法匹配第二次出现"original": test string ring ring test st|ring| ring ring

下面是示例代码：

ring

重要的是要注意正则表达式不仅捕获您想要的文本，还捕获此前的文本块。这会影响您使用39作为替换值，因为您想要的文字现在是'

对这些术语的解释并不简单，但可以分解为：

包含要查找的字符串的否定前瞻，以防止在非终端字符上开始匹配

/**
 * Builds the global, case-insensitive regex of this approach for the search
 * text `str`, which is inserted into the pattern as-is (twice).
 *
 * Group 1 captures the text consumed in front of the hit, group 2 the hit
 * itself.
 */
function make_regex(str) {
  const source = `((?:^|(?!(?:[^&;]+${str})))(?=(?:(?:&|;|^)[^&;]*))(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?)(${str})`;
  return new RegExp(source, 'gi');
}

/**
 * Replaces every match of make_regex(find_str) in `test_str`.
 *
 * The substitution is "$1" followed by `replace_str`, so the text captured
 * in front of each hit is written back. `replace_str` is part of a
 * replacement pattern and may itself refer to groups (e.g. "|$2|").
 */
function do_replace(test_str, find_str, replace_str) {
        const matcher = make_regex(find_str);
        const substitution = '$1' + replace_str;
        return test_str.replace(matcher, substitution);
}

// Sample 1: text containing semicolons that do not terminate an entity
// ("width: 39;", "bor;der", the trailing "39;").
let str = '39&lt;img style=&#039;width: 39;&#039; bor;der=39&#039;0&#039;&39; /&gt;39;';
console.log();
console.log('"original":     ', str);
console.log('test string that may be followed by semi-colon :');
// "$2" in the replacement refers to the regex's second group (the hit).
console.log(do_replace(str, '39', '|$2|' ));
console.log('test match with semi-colon:');
console.log(do_replace(str, '39;', '|39;|' ));
console.log('test match with semi-colon mid string');
console.log(do_replace(str, 'bor;der', '|bor;der|' ))

// Sample 2: the four searches from the question (this copy of the sample
// ends in "/$gt;").
str = '&lt;img border=&#039;0&#039; /$gt;';
console.log();
console.log('"original":     ', str);
console.log('"lt":           ', do_replace(str, 'lt', '|lt|' ));
console.log('"39":           ', do_replace(str, '39', '|39|' ));
console.log('"&#039;":       ', do_replace(str, '&#039;', '|&#039;|' ));
console.log('"border=&#039;":', do_replace(str, 'border=&#039;', '|border=&#039;|' ));
// Sample 3: plain text in which the search word occurs several times.
str = 'test string ring ring';
console.log();
console.log('"original":     ', str);
console.log(do_replace(str, 'ring', '|$2|'));

强制前瞻，强制匹配在终端字符或行的开头

开始

$1

一种负向前瞻，阻止匹配从$2开始，但允许行开头或前一个分号

(?:^|(?!(?:[^&;]+' + str + ')))

然后最后匹配的字符串

效果是我们匹配包含我们想要的文本的文本的整个部分，从前一个终端值到要匹配的字符串的结尾。因此，javascript最终会匹配匹配的完整块。例如，当我们要求(?=(?:(?:&|;|^)[^&;]*))时，我们将以块&结束。所以我们定义了两个捕获组，一个用于我们不感兴趣的块的一部分，另一个用于匹配。这允许我们使用(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?重新创建字符串或border='来仅替换我们想要的部分。然后我们可以将;img border='与此策略一起使用

Answer 11

这将有助于您确定

// Matches "&#", then lazily any characters up to the next ";", i.e. every
// complete "&#...;" sequence.
var pattern = new RegExp('\&\#.*?\;', "g");
// The escaped sample string from the question. The copy that stood here had
// been unescaped to "<img border='0' />", in which the pattern finds nothing.
var str = "&lt;img border=&#039;0&#039; /&gt;";
// replace() returns a new string, so the result has to be kept.
var result = str.replace(pattern, "yourReplacementText");

正则表达式不匹配部分序列，但匹配完整序列

11 个答案: