正则表达式不匹配部分序列,但匹配完整序列

时间:2017-04-14 18:07:48

标签: javascript regex

我有一些像这样的转义HTML:

&lt;img border=&#039;0&#039; /&gt;

我尝试匹配并替换完整的转义序列,例如 &#039;,但不匹配部分转义序列,例如 39,因为 39 实际上并不出现在未转义的字符串中。基本上,每个转义序列都应该被视为一个标记(token)。

这是一个JS正则表达式。有没有办法排除 & 和 ; 之间的匹配,同时仍然接受同时包含这两个字符的序列?

期望的结果:

  • 在 &lt;img border=&#039;0&#039; /&gt; 中搜索 lt:不匹配。
  • 在 &lt;img border=&#039;0&#039; /&gt; 中搜索 39:不匹配。
  • 在 &lt;img border=&#039;0&#039; /&gt; 中搜索 &#039;:匹配。
  • 在 &lt;img border=&#039;0&#039; /&gt; 中搜索 border=&#039;:匹配。

当前代码:

> var str = '&lt;img border=&#039;0&#039; /&gt;'
> str.replace(/(border)/gi, '|$1|')
'&lt;img |border|=&#039;0&#039; /&gt;'  // ok
> str.replace(/(39)/gi, '|$1|')
'&lt;img border=&#0|39|;0&#0|39|; /&gt;'  // not ok

注意:我不能先对字符串反转义(unescape)、匹配之后再重新转义。匹配必须直接在已转义的字符串上进行。

11 个答案:

答案 0 :(得分:3)

OP希望用JavaScript正则表达式在已转义的HTML中匹配并替换字符串,同时将转义序列(例如 &lt;、&#039;)视为单个字符,并且在替换过程中不对HTML字符串进行反转义(unescape)。

这意味着:

  1. 在 "&lt; lt" 中将 "lt" 替换为 "[lt]",应得到 "&lt; [lt]"(避免在实体内部匹配)
  2. 在 "&lt; lt" 中将 "&lt;" 替换为 "[&lt;]",应得到 "[&lt;] lt"(匹配完整实体)
  3. 在 "&lt; &lt" 中将 "&l" 替换为 "[&l]",应得到 "&lt; [&l]t"(不匹配部分实体)
  4. 在 "&lt; lt;" 中将 "t;" 替换为 "[t;]",应得到 "&lt; l[t;]"(不匹配部分实体)
  5. 在 "&lt; lt" 中将 "&lt; l" 替换为 "[&lt; l]",应得到 "[&lt; l]t"(匹配包含实体的字符串)
  6. 在 "&lt; &lt" 中将 "lt; &l" 替换为 "[lt; &l]",应得到 "&lt; &lt"(不匹配部分实体)
  7. 在 "lt; &lt;" 中将 "t; &lt;" 替换为 "[t; &lt;]",应得到 "l[t; &lt;]"(匹配包含实体的字符串)
  8. 在 "lt; &lt;" 中将 "t; &lt" 替换为 "[t; &lt]",应得到 "lt; &lt;"(不匹配部分实体)

    使用以下正则表达式捕获转义序列(例如 &lt;、&#039;),

    /&[a-z]+;|&#x[a-f\d]+;|&#\d+;/gi
    

    我们可以使用以下函数作为处理上述大多数情况的起点(#1,#2,#4,#5和#7):

    /**
     * Replaces occurrences of `searchFor` in `str` with `replacement`.
     *
     * The regex tries the (escaped) search text first and a whole HTML entity
     * second; text consumed by the entity alternative is written back
     * unchanged, so the search text cannot match in the middle of it.
     *
     * @param {string} searchFor   literal text to look for
     * @param {string} replacement text inserted for every search-text match
     * @param {string} str         escaped HTML to process
     * @returns {string} the processed string
     */
    function searchAndReplace(searchFor, replacement, str) {
      // consume entities
      var source = prepare(searchFor) + "|(&[a-z]+;|&#x[a-f\\d]+;|&#\\d+;)";
      var matcher = new RegExp(source, "gi");

      return str.replace(matcher, function (whole, entity) {
        // `entity` is only set when the second alternative matched.
        if (entity) {
          return entity;
        }
        return replacement;
      });
    }
    
    /**
     * Escapes `str` for literal use inside a RegExp source: every character
     * that is neither a word character nor whitespace is prefixed with a
     * backslash. [1]
     *
     * @param {string} str text to escape
     * @returns {string} the escaped text
     */
    function prepare(str) {
      var nonWordNonSpace = /[^\w\s]/g;
      return str.replace(nonWordNonSpace, function (ch) {
        return "\\" + ch;
      });
    }
    
    // [1] from http://eloquentjavascript.net/09_regexp.html#h_Rhu25fogrG
    

    其余案例(#3,#6,#8)涉及搜索字符串末尾的潜在部分转义序列。

    对此的解决方案是在最后检查searchFor字符串中潜在的部分转义序列,并附加相应的否定前瞻(?!)以防止匹配有效的转义序列。完整的解决方案(通过一组约40个测试用例)如下所示,并且应该比.exec()方法更快,更简单:

    
    
    /**
     * Replaces occurrences of `searchFor` in `str` with `replacement`.
     *
     * The pattern is "<prepared search text>|(<entity>)": when the entity
     * alternative is the one that matched, the entity is returned as-is;
     * otherwise the match is swapped for `replacement`.
     *
     * @param {string} searchFor   literal text to look for
     * @param {string} replacement text inserted for every search-text match
     * @param {string} str         escaped HTML to process
     * @returns {string} the processed string
     */
    function searchAndReplace(searchFor, replacement, str) {
      var entityAlternative = "|(&[a-z]+;|&#x[a-f0-9]+;|&#\\d+;)";
      var matcher = new RegExp(prepare(searchFor) + entityAlternative, "gi");

      function keepEntityOrReplace(whole, entity) {
        if (entity) {
          return entity;
        }
        return replacement;
      }

      return str.replace(matcher, keepEntityOrReplace);
    }
    
    /**
     * Builds the RegExp source for a literal search string.
     *
     * All characters that are neither word characters nor whitespace are
     * backslash-escaped. If the search string ends in something that could be
     * the beginning of an HTML entity ("&", "&name", "&#", "&#123", "&#x",
     * "&#xab"), a negative lookahead is appended so the search text is not
     * matched when the target continues into a complete entity.
     *
     * @param {string} str literal search text
     * @returns {string} RegExp source (intended for use with the "i" flag)
     */
    function prepare(str) {
      // [how the search string ends, lookahead rejecting the rest of an entity]
      // The tests are mutually exclusive; the first one that matches is used.
      var guards = [
        [/&$/, "(?!#x[a-z\\d]+;|#\\d+;|[a-z]+;)"],
        [/&[a-z]+$/i, "(?![a-z]*;)"],
        [/&#$/, "(?!x[a-f\\d]+;|\\d+;)"],
        // case-insensitive, because the final regex is built with the "i" flag
        [/&#x$/i, "(?![a-f\\d]+;)"],
        [/&#x[a-f\d]+$/i, "(?![a-f\\d]*;)"],
        // partial decimal entity, e.g. "&#03" must not match inside "&#039;"
        [/&#\d+$/, "(?!\\d*;)"]
      ];

      var add = "";
      for (var i = 0; i < guards.length; i++) {
        if (guards[i][0].test(str)) {
          add = guards[i][1];
          break;
        }
      }
      return str.replace(/[^\w\s]/g, "\\$&") + add;
    }
    
    // test function
    
    /**
     * Runs searchAndReplace and logs "<searchFor>: Passed" when the result
     * equals `expected`, otherwise "<searchFor>: Failed: <expected>,<result>".
     */
    function test(searchFor, replacement, str, expected) {
      var actual = searchAndReplace(searchFor, replacement, str);
      var verdict;
      if (actual === expected) {
        verdict = "Passed";
      } else {
        // array-to-string conversion joins the two values with a comma
        verdict = "Failed: " + [expected, actual];
      }
      console.log(searchFor + ": " + verdict);
    }
    
    // test cases
    
    // The four cases from the question, run against the escaped HTML string.
    // The entity text (&lt; &#039; &gt;) is written out literally: these
    // fixtures must contain the escaped form, otherwise nothing about entity
    // handling is exercised.
    test("lt", "[lt]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img border=&#039;0&#039; /&gt;");
    test("39", "[39]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img border=&#039;0&#039; /&gt;");
    test("&#039;", "[&#039;]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img border=[&#039;]0[&#039;] /&gt;");
    test("border=&#039;", "[border=&#039;]", "&lt;img border=&#039;0&#039; /&gt;", "&lt;img [border=&#039;]0&#039; /&gt;");
    // Further cases: each call passes (searchFor, replacement, input, expected)
    // to test(), which logs Passed/Failed per case.
    // NOTE(review): several string literals below contain raw `<`, `'` and `߿`
    // characters (and one empty search string) where neighbouring cases use
    // entity text; they look like entities that were HTML-decoded when this
    // snippet was extracted. Confirm the fixtures against the original answer
    // before relying on the expected values.

    // search strings ending in "&" / "&#"
    test("39&", "[39&]", "39<img border=39'&gt&gt&&#039 t; 0'&39; />", "39<img border=39'&gt&gt&&#039 t; 0'&39; />")
    test("0&#", "[0&#]", "39<img border=39'&gt&gt&&#039 t; 0'&39; />", "39<img border=39'&gt&gt&&#039 t; 0'&39; />")
    // named-entity group
    test("lt", "[]", "&lt<t;t&l", "&[]<t;t&l");
    test("<", "[]", "&lt<t;t&l", "&lt[]t;t&l");
    test("&l", "[]", "&lt<t;t&l", "[]t<t;t[]");
    test("t;", "[]", "&lt<t;t&l", "&lt<[]t&l");
    test("t&", "[]", "&lt<t;t&l", "&lt<t;[]l");
    test("<t", "[]", "&lt<t;t&l", "&lt[];t&l");
    test("t<", "[]", "&lt<t;t&l", "&l[]t;t&l");
    test("t;t", "[]", "&lt<t;t&l", "&lt<[]&l");
    test("t&l", "[]", "&lt<t;t&l", "&lt<t;[]");
    // decimal-entity group
    test("39", "[]", "&#039'9;9&#", "&#0[]'9;9&#");
    test("'", "[]", "&#039'9;9&#", "&#039[]9;9&#");
    test("&", "[]", "&#039'9;9&#", "[]#039'9;9[]#");
    test("&#", "[]", "&#039'9;9&#", "[]039'9;9[]");
    test("9;", "[]", "&#039'9;9&#", "&#039'[]9&#");
    test("9&", "[]", "&#039'9;9&#", "&#039'9;[]#");
    test("'9", "[]", "&#039'9;9&#", "&#039[];9&#");
    test("9'", "[]", "&#039'9;9&#", "&#03[]9;9&#");
    test("9;9", "[]", "&#039'9;9&#", "&#039'[]&#");
    test("9&#", "[]", "&#039'9;9&#", "&#039'9;[]");
    // hexadecimal-entity group
    test("x7", "[]", "߿f&#x", "&#[]ff;f&#x");
    test("", "[]", "߿f&#x", "&#x7f[]f;f&#x");
    test("&", "[]", "߿f&#x", "[]#x7ff;f[]#x");
    test("&#", "[]", "߿f&#x", "[]x7ff;f[]x");
    test("&#x", "[]", "߿f&#x", "[]7ff;f[]");
    test("&#x7", "[]", "߿f&#x", "[]ff;f&#x");
    test("f;", "[]", "߿f&#x", "&#x7f[]f&#x");
    test("f&", "[]", "߿f&#x", "߿[]#x");
    test("f", "[]", "߿f&#x", "&#x7f[];f&#x");
    test("f", "[]", "߿f&#x", "&#x7[]f;f&#x");
    test("f;f", "[]", "߿f&#x", "&#x7f[]&#x");
    test("f&#", "[]", "߿f&#x", "߿[]x");
    test("f&#x", "[]", "߿f&#x", "߿[]");
    // longer search string mixing plain text and entity fragments
    test("t; < lt &l", "[]", "< < lt <lt; < lt &lt", "< < lt <l[]t");
    
    
    

答案 1 :(得分:1)

此处的一个选项是在执行实际替换之前,在转义字符序列中出现的任何位置临时替换正在搜索的字符串。 “虚拟”字符串将需要不太可能出现在HTML中的任何位置。在执行了实际替换之后,可以进行进一步的替换以将“虚拟”字符串更改回正在搜索的字符串。

以下是此方法的演示,它会生成请求的结果。当需要不使用正则表达式的全局替换和this useful technique将任何字符串转换为字符串文字以便在正则表达式中使用(任何特殊字符被适当地转义)时,它使用this useful technique

// Demo input: the escaped HTML from the question. The entities are written
// out literally so the string (and the quoting of the calls below) is valid.
var html = "&lt;img border=&#039;0&#039; /&gt;";
replaceInHtml(html, 'lt', 'replacement');
replaceInHtml(html, '39', 'replacement');
replaceInHtml(html, '&#039;', 'replacement');
replaceInHtml(html, 'border=&#039;', 'replacement');

/**
 * Replaces every occurrence of `str` in the escaped HTML `html` with
 * `replacement`, except where `str` sits inside an "&...;" token.
 *
 * Works in three steps: occurrences inside "&...;" tokens are temporarily
 * swapped for a dummy marker, the remaining occurrences are replaced, and the
 * marker is then turned back into `str`. The outcome is logged and returned.
 *
 * NOTE: the masking regex hides at most one occurrence per "&...;" token
 * (the last one, because the leading group is greedy).
 *
 * @param {string} html        escaped HTML to process
 * @param {string} str         literal text to search for
 * @param {string} replacement text to insert
 * @returns {string} the processed string
 */
function replaceInHtml(html, str, replacement) {
  // A unique string that is unlikely to appear in the HTML
  var dummyStr = '!*&$^£"^';

  var strInRegex = escapeRegExp(str);
  var dummyRegex = new RegExp('(&[#a-zA-Z0-9]*)'
      + strInRegex + '([#a-zA-Z0-9]*;)', 'g');

  // 1) mask occurrences inside "&...;", 2) replace the rest, 3) unmask
  var replaced = html.replace(dummyRegex, '$1' + dummyStr + '$2');
  replaced = replaced.split(str).join(replacement);
  replaced = replaced.split(dummyStr).join(str);
  console.log('Source:  ' + html
          + '\nReplace: ' + str
          + '\nWith:    ' + replacement
          + '\nGives:   ' + replaced);
  // Hand the result back as well, so callers can use it and not just read the log.
  return replaced;
}

/**
 * Returns `str` with every regex metacharacter
 * (- [ ] / { } ( ) * + ? . \ ^ $ |) prefixed by a backslash, so the text can
 * be embedded in a RegExp source and matched literally.
 */
function escapeRegExp(str) {
  var metaChars = /[\-\[\]\/\{\}\(\)\*\+\?\.\\\^\$\|]/g;
  return str.replace(metaChars, function (ch) {
    return "\\" + ch;
  });
}

答案 2 :(得分:1)

我首先匹配&;之间的所有内容:



// Demo: list every "&...;" token that contains the search text.
// The fixture is the *escaped* HTML, so the entities are spelled out.
let str = "39&lt;img border=39&#039;0&#039;&39; /&gt;39";
let search = '39';
let regexp = new RegExp('&[^&;]*?(' + search + ')[^&;]*?;', 'g'); // /&[^&;]*?(SEARCH)[^&;]*?;/g
let match = str.match(regexp);

console.log(match);




然后暗示,我想在这两个字符之间匹配的所有内容:



/**
 * Builds the global, multiline regex used by find/replace.
 *
 * Alternative 1 captures the search text (group 1) when it is not followed by
 * "...;" without an intervening "&" or ";"; alternative 2 captures it
 * (group 2) when it follows a line start or a ";" with no "&" or ";" between.
 *
 * The search text is escaped first, so characters such as "." or "(" are
 * matched literally instead of being interpreted as regex syntax.
 *
 * @param {string} searchStr literal text to search for
 * @returns {RegExp} regex with flags "gm"
 */
const prepareRegexp = searchStr => {
  const literal = searchStr.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // /(?:^&[^&;]*?)?(SEARCH)(?!(?:[^&;]*?;))|(?:(?:^|;)(?:[^&;]*?)(SEARCH))/gm
  return new RegExp('(?:^&[^&;]*?)?(' + literal + ')(?!(?:[^&;]*?;))|(?:(?:^|;)(?:[^&;]*?)(' + literal + '))', 'gm');
};

/**
 * Collects every occurrence of `searchStr` in `str` that the regex from
 * prepareRegexp accepts.
 *
 * @param {string} str       text to search in
 * @param {string} searchStr text to search for
 * @returns {string[]|null} the captured occurrences, or null when there are none
 */
let find = (str, searchStr) => {
  let regexp = prepareRegexp(searchStr);
  let foundItemsArray;
  let allFoundItems = [];

  while ((foundItemsArray = regexp.exec(str)) !== null) {
    //foundItemsArray returns as follows:
    // [0] - full match
    // [1] - first capturing group
    // [2] - second capturing group
    // To get indexes of found strings you have to use: regexp.lastIndex
    // and take into account that second case, matches everything between the last ; or start of a line
    // and the searched string
    //
    // Pick whichever group actually captured. Comparing [0] with searchStr is
    // not reliable: the first alternative may include its optional "&..."
    // prefix in [0], in which case [2] is undefined.
    allFoundItems.push(foundItemsArray[1] !== undefined ? foundItemsArray[1] : foundItemsArray[2]);

    // A zero-length match does not advance lastIndex; step manually so the
    // loop cannot spin forever.
    if (foundItemsArray[0] === '') {
      regexp.lastIndex++;
    }
  }
  
  return allFoundItems.length ? allFoundItems : null;
}

//Function 'find' refactored to avoid loop:
// Loop-free variant of find: String.prototype.replace is used only to visit
// each match; its return value is discarded. Returns the captured occurrences
// (an empty array when there are none).
find = (str, searchStr) => {
  // Build the regex once and reuse it (it was previously compiled twice,
  // with the first instance never used).
  const regexp = prepareRegexp(searchStr);
  const allFoundItems = [];

  str.replace(regexp, (match, p1, p2) => {
    // p1: captured by the first alternative, p2: by the second
    allFoundItems.push(p1 ? p1 : p2);
  });

  return allFoundItems;
}

//And function to replace the searched string:
/**
 * Replaces every accepted occurrence of `searchStr` in `str` with
 * `replaceWith` (inserted literally).
 *
 * In both alternatives of the regex the captured search text is the last
 * thing in the match, so whatever precedes it (the ";"/line-start prefix of
 * alternative 2, or the optional "&..." prefix of alternative 1) is kept and
 * only the captured tail is swapped.
 *
 * @param {string} str         text to process
 * @param {string} searchStr   text to search for
 * @param {string} replaceWith replacement text
 * @returns {string} the processed string
 */
const replace = (str, searchStr, replaceWith) =>
  str.replace(prepareRegexp(searchStr), (match, p1, p2) => {
    const found = p1 !== undefined ? p1 : p2;
    return match.slice(0, match.length - found.length) + replaceWith;
  });

// Demo run. All fixtures are escaped HTML, so entities are spelled out
// (&lt; &#039; &gt;) rather than written as the characters they stand for.
let str = "39&lt;img border=39&#039;0&#039;&39; width: 50%; /&gt;39";
//console.log('Searching "39":', find(str, '39'));
console.log('Searching "&#039;":', find(str, '&#039;'));
//Search &lt;img border=&#039;0&#039; width: 50%; /&gt; for 50:
console.log('Searching "50":', find(str, '50'));

console.log('Replacing "39" with "|39|":', replace(str, '39', '|39|'));
console.log('Replacing "50" with "|50|":', replace(str, '50', '|50|'));

//Now test the string against given examples:
str = '&lt;img border=&#039;0&#039;';
//Search &lt;img border=&#039;0&#039; /&gt; for lt: No match.
console.log('Searching "lt":', find(str, 'lt'));
//Search &lt;img border=&#039;0&#039; /&gt; for 39: No match.
console.log('Searching "39":', find(str, '39'));
//Search &lt;img border=&#039;0&#039; /&gt; for &#039;: Match.
console.log('Searching "&#039;":', find(str, '&#039;'));
console.log('Replacing "&#039;" with "|&#039;|":', replace(str, '&#039;', '|&#039;|'));
//Search &lt;img border=&#039;0&#039; /&gt; for border=&#039;: Match.
console.log('Searching "border=&#039;":', find(str, 'border=&#039;'));
console.log('Replacing "border=&#039;" with "|border=&#039;|":', replace(str, 'border=&#039;', '|border=&#039;|'));

.as-console-wrapper {
  max-height: 100% !important;
}




正则表达式的分解说明(breakdown): https://regex101.com/r/UCNnu1/2

//编辑:

然而,如果搜索字符串后跟;,则不匹配搜索字符串,因此为了捕获此类字符串,我们需要扩展我们的正则表达式以匹配另一组字符并使用regexp.exec只捕捉有趣的部分。 扩展的正则表达式是:

https://regex101.com/r/UCNnu1/3

我更新了代码以使用regexp进行替换。

答案 3 :(得分:1)

RegEx应该用于检查但它不能涵盖所有可能的实体,并且不是这项工作的最佳工具。虽然以下方法适用于所有HTML实体。

  

我正在尝试匹配和替换完整的转义序列,例如'但不是部分转义序列,例如39,因为39实际上不在非转义字符串中。

基本上,您希望用不受限制的形式替换HTML实体。这就是下面的功能,你不需要RegEx。

我将使用来自this answer

Web_DesignerunescapeHTML功能
// Scratch <textarea> element reused by unescapeHTML on every call.
var escape = document.createElement('textarea');

/**
 * Assigns `html` to the scratch textarea's innerHTML and returns the
 * textarea's textContent, i.e. the text after the browser has parsed the
 * markup (this is what turns entities back into characters).
 */
function unescapeHTML(html) {
    var scratch = escape;
    scratch.innerHTML = html;
    var decoded = scratch.textContent;
    return decoded;
}

首先创建一个新的<textarea>元素。在函数内部,作为参数传递的字符串然后被指定为此textarea的innerHTML,然后返回它的textContent。这是用于unescape HTML实体的技巧。

我们可以重用它来确定字符串是否是有效的HTML实体。如果函数能够取消它,那么它是有效的HTML实体,否则它不是。这是你想要确定的。

// Scratch <textarea> element reused by unescapeHTML on every call.
var escape = document.createElement('textarea');

/**
 * Assigns `html` to the scratch textarea's innerHTML and returns the
 * textarea's textContent, i.e. the text after the browser has parsed the
 * markup (this is what turns entities back into characters).
 */
function unescapeHTML(html) {
  var scratch = escape;
  scratch.innerHTML = html;
  var decoded = scratch.textContent;
  return decoded;
}

var str = '&lt;img border=&#039;0&#039; /&gt;';

// For each candidate, log whether unescaping changes it
// (true means unescapeHTML returned something different from the input).
['lt', '39', '&#039;', 'border=&#039;'].forEach(function (candidate) {
  console.log(unescapeHTML(candidate) !== candidate);
});

答案 4 :(得分:1)

  

有没有办法排除&;之间的匹配,同时仍然接受包含这两个字符的序列?基本上,每个转义序列应该被视为一个令牌。

为了将实体视为单独的标记,我们可以构建一个在任何目标子字符串之前捕获实体的正则表达式,然后使用回调函数将未修改的捕获实体返回给字符串。

示例:当 "39" 不在实体内部时对其进行替换:

    str.replace(
      /(&[a-z]+;|&#[0-9a-f]+;)|39/gi,
      function(m, entity){
        return entity || replacement;
      }
    );

  

我正在尝试匹配并替换完整的转义序列,例如 &#039;,但不匹配部分转义序列,例如 39

当要替换的是实体本身(例如 &#039;)时,需要采用不同的方法。以下可运行的演示处理了这种情况,并根据提供的搜索字符串动态构建正则表达式,能够处理OP的所有测试用例:

    function searchAndReplace(str, searchFor, replacement){
      return /^&([a-z]+|#[\da-f]+);/i.test(searchFor) ?
        // if searchFor equals or starts with an entity
        str.split(searchFor).join(replacement) :
        // else
        str.replace(
          new RegExp(
            '(&[a-z]+;|&#[0-9a-f]+;)|' +
            searchFor.replace(/[^\w\s]/g, "\\$&"), //escape metachars
            'gi'
          ),
          function(m, entity){
            return entity || replacement;
          }
        );
    }

    // test cases

    console.log('Search for "border": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      'border', '{{border}}'
    ) + '\nmatch'); //matches

    console.log('Search for "0": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      '0', '{{0}}'
    ) + '\nmatch'); //matches outside entities

    console.log('Search for "&#039;": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      '&#039;', '{{&#039;}}'
    ) + '\nmatch'); //matches

    console.log('Search for "39": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      '39', '{{39}}'
    ) + '\nno match'); //does not match

    console.log('Search for "lt": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      'lt', '{{lt}}'
    ) + '\nno match'); //does not match

    console.log('Search for "&lt;": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      '&lt;', '{{&lt;}}'
    ) + '\nmatch'); //matches

    console.log('Search for "border=&#039;": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      'border=&#039;', '{{border=&#039;}}'
    ) + '\nmatch'); //matches

    console.log('Search for "&lt;img": \n' + searchAndReplace(
      '&lt;img border=&#039;0&#039; /&gt;',
      '&lt;img', '{{&lt;img}}'
    ) + '\nmatch'); //matches

<reference path="./../node_modules/@types/mongodb/index.d.ts" />

答案 5 :(得分:0)

我认为您指的是非捕获组(non-capturing group):http://www.regular-expressions.info/brackets.html,这在几个Stack Overflow帖子中已有解答(Why regular expression's "non-capturing" group is not working;Regular expression, "just group, don't capture", doesn't seem to work)。

即,未捕获的群组不会获得他们自己的群组选择器(例如/ a(?:[XZ])([ac])/ g将匹配&#34; aZb&#34;但是\ 1等于&#34; b&#34;,而不是&#34; Z&#34;。

答案 6 :(得分:0)

这是你想要做的吗?

&#13;
&#13;
// Demo: find "&#...;" sequences, together with a directly preceding
// "name=" if there is one, and wrap each match in pipes.
var str = "&lt;img border-color=&#039;0&#039;";
var numericEntityRx = /((?:[a-z-]+=)?&#.+?;)/gi;

console.log(str);
console.log(str.match(numericEntityRx));
console.log(str.replace(numericEntityRx, "|$1|"));
&#13;
&#13;
&#13;

答案 7 :(得分:0)

我认为如果能够使用后行断言(lookbehind),这是可以做到的。但这里的正则表达式方言是JavaScript,我认为我们无法使用它。下面这个已经非常接近: [^&;]*(string)[^&;]*(?!9;|t;|;)

答案 8 :(得分:0)

最终候选版本

4/29

此版本应在搜索字符串的末尾处理部分实体 其中partial有预实体字符,如xxx&yyya&#00等。

这是@TomasLangkaas发现的最后一个案例 鉴于涵盖了所有其他案例,这是最终候选人 为@athancahill或任何其他感兴趣的人。

(参见评论和以前的版本)

模型已从 String.Replace()更改为而(match = Rx.exec())

在此解释,但请参阅JS代码以实现 它仍然使用搜索字符串作为第一个交替
以实体为第二。

    (?=
         # This is the optional entity captured at
         # the same position where the search string starts.
         # If this entity matches, it means the search string
         # matches. Either one may be a partial of the other.

         # (1) The container for pre-entity / entity
         (                             
              # (2) Pre-entity characters 
              ( sLongest )                

              # Entity   
              (?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*);
         )?                            
    )

    # (3) The search string ( consumes )
    ( sToFind )                        
 | 

    # (4) Or, the entity last  ( consumes ) 
    ( (?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*); )

请注意,您不能将实体语法分解为正则表达式的一部分 它必须完全匹配,作为一个独特的项目 (沿着这条路走了一百次,无法做到。)。

请注意,这是一个单程,纯正则表达式解决方案,速度非常快 如果你拿出所有的评论,它实际上只有几行代码 您可以修改实体子表达式并使用您想要的任何内容 代码结构不需要改变。

//=========================================================
// http://jsfiddle.net/b4b28a38/95/
//=========================================================

// ------------------------
// These are only used when pre-entity partials are detected
// (GetReplace consults them only when a match carries pre-entity text).
//
// RxEntPartial - anchored with `$`: matches a possibly incomplete entity start
//                ("&", "&name", "&#", "&#123", "&#x", "&#xab", "%", "%name")
//                at the end of the string it is run against.
// RxEntFull    - matches one complete entity: "&name;", "&#123;", "&#xab;"
//                or "%name;".
//
// Both carry the `g` flag, so exec() starts at lastIndex; callers set
// lastIndex explicitly before each use.
//
// NOTE(review): the name class is written `[a-zd_:.-]`. The lone `d` is already
// covered by `a-z`, so it looks like a `\d` whose backslash was lost inside the
// string literal — confirm whether digits in entity names should be accepted.
var RxEntPartial = new RegExp( '(?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]*)?))?|%(?:[a-z_:][a-zd_:.-]*)?)$', 'ig' );
var RxEntFull = new RegExp( '(?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*);', 'ig' );
// ------------------------

/**
 * Builds the search regex consumed by GetReplace.
 *
 * Shape of the result (flags "gi"):
 *
 *     (?=( (pre-entity) entity )?)   groups 1 and 2, inside a lookahead
 *     ( term1|term2|... )            group 3, the search strings
 *   | ( entity )                     group 4
 *
 * "pre-entity" is a lazy `.{0,N}?` where N is the length of the longest
 * search string minus one.
 *
 * The input array is NOT modified: escaped copies of the terms are collected
 * in a local array, so the same array can be passed in again without the
 * terms being escaped twice.
 *
 * @param {string[]} FindAry literal search strings
 * @returns {RegExp} the combined regex
 */
function MakeRegex( FindAry ) {
   // Entity sub-expression, used twice in the final pattern
   var sEntity = '(?:&(?:[a-z_:][a-zd_:.-]*|(?:\#(?:[0-9]+|x[0-9a-f]+)))|%[a-z_:][a-zd_:.-]*);';

   // Escape metachars (into a copy) and find the longest raw term
     var longest = 0;
     var escaped = [];
     for (var i = 0; i < FindAry.length; i++ )
     {
         if ( FindAry[i].length > longest )
            longest = FindAry[i].length;
         escaped.push( FindAry[i].replace(/(?!\s)\W/g, "\\$&") );
     }
   // Make 'longest' sub-expression
     longest -= 1;
     var sLongest = '';
     if ( longest > 0 )
         sLongest = '.{0,' + longest.toString() + '}?';
   // Join array using alternations
     var sToFind = escaped.join('|');
   // Return new regex object
     var rx =  new RegExp( '(?=((' + sLongest + ')' + sEntity + ')?)(' + sToFind + ')|(' + sEntity + ')',
   'ig');
   //console.log( rx);
   return rx;
}


// Walks `str` with `Rx` and builds a new string in which every accepted
// search-string match (group 3) is wrapped in square brackets, while text
// matched as an entity (group 4) and all text between matches is copied
// through unchanged.
//
//   str - the (escaped) text to scan
//   Rx  - regex with the 4-group layout shown in the loop below. Its lastIndex
//         is reset to 0 here and advanced manually when a match is rejected.
//         Presumably this is the regex MakeRegex builds (flags 'ig') — the
//         exec() loop relies on the `g` flag to move forward.
//
// Returns the rebuilt string.
//
// A rejected match (search text that is only part of an entity) is skipped
// with `continue` WITHOUT touching ndxLast, so the skipped text is copied
// verbatim by the next substr() call.
function GetReplace( str, Rx )
{
   var sResult = '';    // New modified string to return
   var _M;              // Match object
   var ndxLast = 0;     // Previous Rx.lastIndex value when valid match
                        // ( where next match starts )

   Rx.lastIndex = 0;
   
   while ( _M = Rx.exec( str ) )
   {
       // Alternation 1: (1) = container (optional), p2 = pre-entity, entity, p3 = search string
       // Alternation 2: p4 = entity
       // Form:      
       //     (?=
       //          (                    # (1) start container
       //            ( pre-entity )            # (2)
       //            entity
       //          )?                       # (1) end
       //     )
       //     ( search )                 # (3)
       //  |  
       //     ( entity )                 # (4)
       
       if ( _M[4] )
       {
          // Entity, continue unchanged.
          // (copy the text since the last accepted match, then the entity itself)
          sResult += str.substr( ndxLast , _M.index - ndxLast ) + _M[4];
          ndxLast = Rx.lastIndex;
          continue;
       }
       // Check if entity container captured inside zero length assertion matched 
       if ( _M[1] )
       {
           // Get some lengths 
      
           var L1 = _M[1].length;   // pre-entity + entity, as seen by the lookahead
           var L2 = _M[2].length;   // pre-entity part only
           var L3 = _M[3].length;   // the matched search string

           if ( L1 == L3 )
           {
              // Ok - This means it matched a trailing full entity
              // Intended, modify the search string
              sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
              ndxLast = Rx.lastIndex;
              continue;
           }

           // Pre entity check  ( pre-entity ) 
           if ( L2 > 0 )  
           {
               // This is a rare case and should not slow anything down.
               // End entity condition to check

               var sMatched = _M[3];
               var mpartial;
               RxEntPartial.lastIndex = 0;

               // Verify the match had a partial entity at the end
               if ( mpartial = RxEntPartial.exec( sMatched ) )
               {
                   // Check partial entity is not at the  beginning                   
                   if ( mpartial.index > 0 )
                   {
                       // Location in target string to check
                       // for a full entity.
                       var loc = _M.index + mpartial.index;

                       // Assure there is no full entity
                       // (search from `loc`; only a hit exactly at `loc` counts)
                       RxEntFull.lastIndex = loc;
                       var mfull;
                       if ( mfull = RxEntFull.exec( str ) )
                       {
                           if ( mfull.index == loc )
                           {
                               // Not valid, move past it
                               RxEntFull.lastIndex = 0;
                               Rx.lastIndex += (L1 - L3);
                               continue;
                           }
                       }
                  }
               }
               // Ok - This definitely passes.
               // Intended, modify the search string
               sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
               ndxLast = Rx.lastIndex;
               continue;
           }

           // Normal checks
           // -------------------------------------------------------

           // If the length of the search >= the entity length
           // then the search includes an entity at the beginning
       

           if ( L3 >= L1 )
           {
              // Intended, modify the search string
              sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
              ndxLast = Rx.lastIndex;
              continue;
           }

          // Uh oh, the search is a partial entity (from the beginning).
          // Since we see that it is part of an entity, we have to go past it.
          // The match position reflects the partial entity.
          // Adjust (advance) the match position by the difference
          // to go past the entity.

          Rx.lastIndex += ( L1 - L3 );
          continue;
       }

       // Here, the search string is pure, just modify it
       sResult += str.substr( ndxLast , _M.index - ndxLast ) + '[' + _M[3] + ']' ;
       ndxLast = Rx.lastIndex;
   }
   // Copy whatever follows the last accepted match
   sResult += str.substr( ndxLast , str.length - ndxLast );
   return sResult;
}

// Demo: log the target, build the regex for a list of search terms, run the
// replacement and log the outcome.
var TargetStr = "39&lt;img border=39&#039;&gt&gt&&#039 t; xx&gt x&gt; 0&# r&#039;&39; cad&#092; r&#FFd0&#22 /&gtttttt;  39; end";
console.log( 'Target:\r\n' + TargetStr );

// Always put the longest first of/if alphabetically sorted (ie. aaa|aa|a, etc ..)
var searchTerms = ['39', '&lt;img', 'cad&#092;', '&gt', 't;', 'x&', '0&#', 'r&#'];

var rx = MakeRegex( searchTerms );
var NewString = GetReplace( TargetStr, rx );

console.log('Find any of:  39, &lt;img, cad&#092;, &gt, t;, x&, 0&#, r&#' );
console.log( NewString );

输出

 Target:
 39&lt;img border=39&#039;&gt&gt&&#039 t; xx&gt x&gt; 0&# r&#039;&39; cad&#092; r&#FFd0&#22 /&gtttttt;  39; end

 Find any of:  39, &lt;img, cad&#092;, &gt, t;, x&, 0&#, r&#

 [39][&lt;img] border=[39]&#039;[&gt][&gt]&&#0[39] [t;] x[x&]gt x&gt; [0&#] r&#039;&[39]; [cad&#092;] [r&#]FFd[0&#]22 /&gtttttt;  [39]; end

答案 9 :(得分:0)

更新2017-04-28
添加39&0&#测试用例 - 无需更改代码 - 哇,这很幸运:)

重要的是要注意,我故意不允许&符号中的&符号存在,除了作为转义序列的开始,无论它是否是有效的转义序列,即im允许{{1即使在技术上无效,它也是一个转义序列。这样可以很容易地说,如果我们被要求找到一个带有&符号的字符串,这个字符串不是完整转义序列的一部分(例如下面的&39; AmpExcape`来执行此操作,而不是终止并返回未更改的字符串,这是一个方便,因为这个正则表达式的其他用例(即外部JavaScript)不允许我条件分支语句(或匹配的函数回调)。为了我的目的,这来到定义我工作反对转义的壁橱HTML。

更新2017-04-26
添加0&#') then this should not be matched, and is an invalid search string. The use of测试用例并通过转义查找字符串来编辑该案例的答案/代码

更新2017-04-25

两遍正则表达式解决方案:

&g

我一直在寻找一种仅使用正则表达式的方法,虽然@sln和@ tomas-langkaas的解决方案很有用(好吧,令人兴奋),我仍然想要一种我可以使用的方法在JS及以后。

我有一些适用于我的测试用例的东西,但它被迫对文本进行多次传递。最初我有三次传球,但我认为我有两次传球。显然不如其他答案有效。

我试图匹配:function do_replace(test_str, find_str, replace_str) { let escaped_find_str = AmpExcape(find_str); escaped_find_str = RegExcape(escaped_find_str); let escaped_replace_str = RegExcape(replace_str); let first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)','gi'); let first_pass = test_str.replace(first_regex,'&&$1'+replace_str+'&&$2'); let second_regex = new RegExp('&&(?:'+escaped_replace_str+'&&|(' + escaped_find_str + ')?('+escaped_replace_str + ')?&&)','gi'); let second_pass = first_pass.replace(second_regex,'$2'); return second_pass; } 作为那些答案,并将订单翻转为实体,因为@sln在最新更新中做了。但我使用<target string> | <entity>作为新的&#39;逃脱序列。策略是:

  • 最初我们转义查找和替换字符串以供在regex中使用。对于&&,如果<target string>不是complte实体的一部分,那么我们将其替换为双&,以防止它被匹配,即我考虑到在转义HTML中搜索部分实体永远不可能

  • 第一遍用

    替换任何匹配(实体或目标字符串)
    &&
  • 在目标字符串的匹配项上,&&<target group value><replacement string>&&<entity group value> 将为空,我们将返回<entity group value>转义符,目标字符串,替换字符串和最终{{1}逃跑

  • 对于实体的匹配,&&现在将为空,因此我们最终返回&&后跟实体值。
  • 第二次传递可以查找所有<target group value>并替换为&&<replacement str>&&
  • 同样在第二遍中我们可以查找&&<target string><replacement string>&&并知道它应该替换为空白。
  • 这次我们没有为实体比赛做任何事情,因为我们在第1阶段没有触及它们

带有测试用例的完整代码(来自@ tomas-langkaas维护的RexExcape的属性):

<replacement string>

和输出

&&<replacement string>&&

更新2017-04-24

我放弃了下面的方法有几个原因,但最重要的是,它只会在没有符号或分号的文本中找到第一次出现的查找字符串。例如,如果匹配是// helper for building regexes from strings // http://stackoverflow.com/a/3561711/6738706 function AmpExcape(str) { return str.replace(/&(\w*;|#[0-9a-fA-F]*;)|(&)/g, '&$2$1'); } function RegExcape(str) { return str.replace(/[-\/\\^$*+?.()|[\]{}]/g, '\\$&'); }; function do_replace(test_str, find_str, replace_str) { let escaped_find_str = AmpExcape(find_str); escaped_find_str = RegExcape(escaped_find_str); let escaped_replace_str = RegExcape(replace_str); let first_regex = new RegExp('(' + escaped_find_str + ')|(&\\w*;|&#[0-9a-fA-F]*;)','gi'); let first_pass = test_str.replace(first_regex,'&&$1'+replace_str+'&&$2'); let second_regex = new RegExp('&&(?:'+escaped_replace_str+'&&|(' + escaped_find_str + ')?('+escaped_replace_str + ')?&&)','gi'); let second_pass = first_pass.replace(second_regex,'$2'); return second_pass; } let str = '39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39;'; let test_list = ['39','39;','9;','9;w','39&','0&#']; run_test(str,test_list); str = '&lt;img border=&#039;0&#039; /$gt;'; test_list = ['lt','39','&#039;','border=&#039;']; run_test(str,test_list); str = 'test string ring ring'; test_list = ['ring']; run_test(str,test_list); str = '39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39;'; test_list = ['border','0','&#039;','39','lt','&lt;','border=&#039;','&lt;img','&g','t;']; run_test(str,test_list); function run_test(base_str, find_list) { let orig_str = 'original'; let max_len = find_list.concat(orig_str).reduce(function(a,b) { return a > b.length ? 
a : b.length; },0); console.log(); console.log(pad(orig_str,max_len) + ': ' + str); find_list.map(function(gstr) { console.log( pad( gstr, max_len) + ': ' + do_replace(str, gstr, '|' + gstr + '|')); }); } function pad(str,len) { while ( str.length < len) { str = str + ' ' }; return str; } ,则字符串original: 39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39; 39 : |39|&lt;img style=&#039;width: |39|;&#039; bor9;wder=|39|&#039;0&#039;&39; /&gt;|39|; 39; : 39&lt;img style=&#039;width: |39;|&#039; bor9;wder=39&#039;0&#039;&39; /&gt;|39;| 9; : 39&lt;img style=&#039;width: 3|9;|&#039; bor|9;|wder=39&#039;0&#039;&39; /&gt;3|9;| 9;w : 39&lt;img style=&#039;width: 39;&#039; bor|9;w|der=39&#039;0&#039;&39; /&gt;39; 39& : 39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39; 0&# : 39&lt;img style=&#039;width: 39;&#039; bor9;wder=39&#039;0&#039;&39; /&gt;39; original : &lt;img border=&#039;0&#039; /$gt; lt : &lt;img border=&#039;0&#039; /$gt; 39 : &lt;img border=&#039;0&#039; /$gt; &#039; : &lt;img border=|&#039;|0|&#039;| /$gt; border=&#039;: &lt;img |border=&#039;|0&#039; /$gt; original: test string ring ring ring : test st|ring| |ring| |ring| original : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39; border : 39&lt;img style=&#039;width: 39;&#039; |border|=&#039;0&#039;&39; /&gt;39; 0 : 39&lt;img style=&#039;width: 39;&#039; border=&#039;|0|&#039;&39; /&gt;39; &#039; : 39&lt;img style=|&#039;|width: 39;|&#039;| border=|&#039;|0|&#039;|&39; /&gt;39; 39 : |39|&lt;img style=&#039;width: |39|;&#039; border=&#039;0&#039;&39; /&gt;|39|; lt : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39; &lt; : 39|&lt;|img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39; border=&#039;: 39&lt;img style=&#039;width: 39;&#039; |border=&#039;|0&#039;&39; /&gt;39; &lt;img : 39|&lt;img| style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39; &g : 39&lt;img style=&#039;width: 39;&#039; 
border=&#039;0&#039;&39; /&gt;39; t; : 39&lt;img style=&#039;width: 39;&#039; border=&#039;0&#039;&39; /&gt;39; 将仅匹配为test string ring ring - 这对于查找和替换似乎毫无用处 - 我更新答案以便它可以用于匹配{ {1}}至少第一次,因为我之前错过了线路开始作为允许的终端字符,但我不认为这是所有可能的文本的有效解决方案。

原始回答(有修改)

如果你关心文本中出现的分号而不是相应的test st|ring| ring ring的终结字符,就像内联样式可能会说ring那样,那么你需要一些有点复杂的东西:

ring

下面提供的代码将输出原始测试用例:

&

它还会显示文本和搜索字词中出现的针对分号的特定输出。

style="width: 39;"

注意这就是方法崩溃的例子:

'((?:^|(?!(?:[^&;]+' + str + ')))(?=(?:(?:&|;|^)[^&;]*))(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?)(' + str + ')'

没有任何背后的东西可以让记忆变得更加容易。效果区分上一个匹配后的零个字符和零个字符,但在转义的序列中。因此,如果不匹配字符串"original": &lt;img border=&#039;0&#039; /$gt; "lt": &lt;img border=&#039;0&#039; /$gt; "39": &lt;img border=&#039;0&#039; /$gt; "&#039;": &lt;img border=|&#039;|0|&#039;| /$gt; "border=&#039;": &lt;img |border=&#039;|0&#039; /$gt; 中的"original": 39&lt;img style=&#039;width: 39;&#039; bor;der=39&#039;0&#039;&39; /&gt;39; test string that may be followed by semi-colon : |39|&lt;img style=&#039;width: |39|;&#039; bor;der=|39|&#039;0&#039;&39; /&gt;|39|; test match with semi-colon: 39&lt;img style=&#039;width: |39;|&#039; bor;der=39&#039;0&#039;&39; /&gt;|39;| test match with semi-colon mid string 39&lt;img style=&#039;width: 39;&#039; |bor;der|=39&#039;0&#039;&39; /&gt;39; <{1}},则无法匹配第二次出现"original": test string ring ring test st|ring| ring ring

下面是示例代码:

ring

重要的是要注意正则表达式不仅捕获您想要的文本,还捕获此前的文本块。这会影响您使用39作为替换值,因为您想要的文字现在是&#039;

对这些术语的解释并不简单,但可以分解为:

包含要查找的字符串的否定前瞻,以防止在非终端字符上开始匹配

/**
 * Builds the case-insensitive, global regex used by do_replace.
 *
 * Group 1 captures the text leading up to the search string, group 2 the
 * search string itself.
 *
 * The search string is escaped before it is spliced into the pattern, so
 * characters such as "." or "(" are matched literally instead of being read
 * as regex syntax.
 *
 * @param {string} str literal text to search for
 * @returns {RegExp} regex with flags "gi"
 */
function make_regex(str) {
  const literal = str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  let regexp = new RegExp('((?:^|(?!(?:[^&;]+' + literal + ')))(?=(?:(?:&|;|^)[^&;]*))(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))?)(' + literal + ')','gi');
  return regexp;
}

/**
 * Replaces matches of `find_str` in `test_str`.
 *
 * The regex from make_regex also matches the text in front of the search
 * string (group 1), so the replacement starts with "$1" to put that text
 * back. `replace_str` is used as a replacement pattern, i.e. it may itself
 * refer to "$2" (the matched search string).
 */
function do_replace(test_str, find_str, replace_str) {
  const pattern = make_regex(find_str);
  const replacement = '$1' + replace_str;
  return test_str.replace(pattern, replacement);
}

// Demo run 1: search strings with and without semicolons.
let str = '39&lt;img style=&#039;width: 39;&#039; bor;der=39&#039;0&#039;&39; /&gt;39;';
console.log();
console.log('"original":     ', str);
[
  ['test string that may be followed by semi-colon :', '39', '|$2|'],
  ['test match with semi-colon:', '39;', '|39;|'],
  ['test match with semi-colon mid string', 'bor;der', '|bor;der|']
].forEach(([heading, needle, replacement]) => {
  console.log(heading);
  console.log(do_replace(str, needle, replacement));
});

// Demo run 2: the four cases from the question.
str = '&lt;img border=&#039;0&#039; /$gt;';
console.log();
console.log('"original":     ', str);
[
  ['"lt":           ', 'lt', '|lt|'],
  ['"39":           ', '39', '|39|'],
  ['"&#039;":       ', '&#039;', '|&#039;|'],
  ['"border=&#039;":', 'border=&#039;', '|border=&#039;|']
].forEach(([label, needle, replacement]) => {
  console.log(label, do_replace(str, needle, replacement));
});

// Demo run 3: repeated occurrences in plain text.
str = 'test string ring ring';
console.log();
console.log('"original":     ', str);
console.log(do_replace(str, 'ring', '|$2|'));

强制前瞻,强制匹配在终端字符或行的开头

开始
$1

一种负向前瞻,阻止匹配从$2开始,但允许行开头或前一个分号

(?:^|(?!(?:[^&;]+' + str + ')))

然后最后匹配的字符串

效果是我们匹配包含我们想要的文本的文本的整个部分,从前一个终端值到要匹配的字符串的结尾。因此,javascript最终会匹配匹配的完整块。例如,当我们要求(?=(?:(?:&|;|^)[^&;]*)) 时,我们将以块&结束。所以我们定义了两个捕获组,一个用于我们不感兴趣的块的一部分,另一个用于匹配。这允许我们使用(?:(?!(?:&))(?:(?:^|[;]?)[^&;]*?))? 重新创建字符串或border=&#039;来仅替换我们想要的部分。然后我们可以将;img border=&#039;与此策略一起使用

答案 10 :(得分:-1)

这将有助于您确定

// Replaces every numeric-style entity ("&#...;") in the escaped string.
// One statement per line, with declared variables: the three statements had
// been fused onto a single line and assigned to implicit globals.
var pattern = new RegExp('&#.*?;', "g");
var str = "&lt;img border=&#039;0&#039; /&gt;";
str.replace(pattern, "yourReplacementText");

相关问题