正则表达式忽略特殊的ASCII字符

时间:2015-04-19 17:26:35

标签: regex perl

我正在用 Perl 脚本读取一堆文件(css、html、js、png、jpg),并且只想打印其中的文本内容。该脚本使用的是下面这个正则表达式:

if($substring4 =~ /[A-Za-z0-9 \t\n\r]/)
{
    print("GOOD ...\n");
    print($buffer);
}
else
{
    print("INVALID ...\n");
    print($buffer);
    print("\n\n");
}

但它的效果并不好:

标记为好:

--G=154 W=164 F=168 L=512--
GOOD ...
unload=null)}}},simulate:function(a,b,c,d){var e=f.extend(new f.Event,c,{type:a,isSimulated:!0,originalEvent:{}});d?f.event.trigger(e,null,b):f.event.dispatch.call(b,e),e.isDefaultPrevented()&&c.preventDefault()}},f.event.handle=f.event.dispatch,f.removeEvent=c.removeEventListener?function(a,b,c){a.removeEventListener&&a.removeEventListener(b,c,!1)}:function(a,b,c){a.detachEvent&&a.detachEvent("on"+b,c)},f.Event=function(a,b){if(!(this instanceof f.Event))return new f.Event(a,b);a&&a.type?(this.originalEven

--G=155 W=165 F=169 L=512--
GOOD ... ===> GOOD, it must accept those characters
t=a,this.type=a.type,this.isDefaultPrevented=a.defaultPrevented||a.returnValue===!1||a.getPreventDefault&&a.getPreventDefault()?K:J):this.type=a,b&&f.extend(this,b),this.timeStamp=a&&a.timeStamp||f.now(),this[f.expando]=!0},f.Event.prototype={preventDefault:function(){this.isDefaultPrevented=K;var a=this.originalEvent;!a||(a.preventDefault?a.preventDefault():a.returnValue=!1)},stopPropagation:function(){this.isPropagationStopped=K;var a=this.originalEvent;!a||(a.stopPropagation&&a.stopPropagation(),a.cancel

--G=156 W=166 F=170 L=512--
GOOD ... ===> GOOD, it must accept those characters
Bubble=!0)},stopImmediatePropagation:function(){this.isImmediatePropagationStopped=K,this.stopPropagation()},isDefaultPrevented:J,isPropagationStopped:J,isImmediatePropagationStopped:J},f.each({mouseenter:"mouseover",mouseleave:"mouseout"},function(a,b){f.event.special[a]={delegateType:b,bindType:b,handle:function(a){var c=this,d=a.relatedTarget,e=a.handleObj,g=e.selector,h;if(!d||d!==c&&!f.contains(c,d))a.type=e.origType,h=e.handler.apply(this,arguments),a.type=b;return h}}}),f.support.submitBubbles||(f.ev

下面这些内容本应标记为 GOOD,却被标记为 INVALID;其中都是键盘上能找到的普通字符:

INVALID ... ===> WRONG, it must accept those characters
/^(?:button|input|object|select|textarea)$/i,t=/^a(?:rea)?$/i,u=/^(?:autofocus|autoplay|async|checked|controls|defer|disabled|hidden|loop|multiple|open|readonly|required|scoped|selected)$/i,v=f.support.getSetAttribute,w,x,y;f.fn.extend({attr:function(a,b){return f.access(this,a,b,!0,f.attr)},removeAttr:function(a){return this.each(function(){f.removeAttr(this,a)})},prop:function(a,b){return f.access(this,a,b,!0,f.prop)},removeProp:function(a){a=f.propFix[a]||a;return this.each(function(){try{this[a]=b,delet

INVALID ... ===> WRONG, it must accept those characters
""),e=0,g=Object.prototype.toString,h=!1,i=!0,j=/\\/g,k=/\r\n/g,l=/\W/;[0,0].sort(function(){i=!1;return 0});var m=function(b,d,e,f){e=e||[],d=d||c;var h=d;if(d.nodeType!==1&&d.nodeType!==9)return[];if(!b||typeof b!="string")return e;var i,j,k,l,n,q,r,t,u=!0,v=m.isXML(d),w=[],x=b;do{a.exec(""),i=a.exec(x);if(i){x=i[3],w.push(i[1]);if(i[2]){l=i[3];break}}}while(i);if(w.length>1&&p.exec(b))if(w.length===2&&o.relative[w[0]])j=y(w[0]+w[1],d,f);else{j=o.relative[w[0]]?[d]:m(w.shift(),d);while(w.length)b=w.shift(

下面这些内容被标记为 GOOD,但应该标记为坏的:它们是来自 SSL 证书图片的奇怪字符

--G=722 W=723 F=749 L=512--
GOOD ... ===> WRONG, it must skip those characters, mark them as bad character
£©4Óô]>E‘Ÿ0AN=Går™ÉÉI²µ0ô ªªÒÒÒB"‘ ‹Õ-ûÚ×¾V'ü®¼òJxàz{{ë\+1Óñ>Ÿßþíߦ³³“Ûn»T*…ëºüþïÿ>K—.eË–-}˜‹ùåá”âåüÃA½yBpðŠw‘mïd㯟Ã0+³Ë~äájàÝ÷êéõÇ:~<EÁ±†¢é¡P„ñÔ(¥R‘þeç`š&““D"ú–,edd„J¹L1B+[”
¹ª ,9xð @µ:«iúŽ ãxa›.™Lšó×\À¦ËÞmšDÚâ2–JQÌÑ[[)UòÅRM„+57¸ZYÔÐTÎ]Þ˲žv†F’âÀ¡A†F’ÕF쮋¦iâò ‚°håÿgïÍãä¨ëüÿgÝÕÇô\=÷ä"„%‚«‚(ä0(‚úÔwYýŠ+»®« +*ÊOEQÔu¿_—å'‡À*» ‚,"§ˆrGH$dÎÌÝ÷Qç÷êª©é™   dfrÔ3JuWW×çÓ=]UŸ×ç}½ð¼ðÂ<ýôÓ¼üòËŒ“ÉdÈf³X–53ž ÐØØHcc#étš•+Wò¦7½‰ÉÉI¾ùÍoû……_ÄÎáÀSO=•;3uëÖ‘Ïç)—Ëœ{î¹<üðó

--G=723 W=724 F=750 L=512--
INVALID ... ===> GOOD, it must skip those characters, mark them as bad character, i don't know why here is working
,¨û+Ñ]4""bIð­põq€a (Š>„Çšš9ò¡{Iò»ÜŽ»ù%ŠŸ»ûŸ.Fÿ‹Ã‚ãî³ÐuA&ˆ"í]à:H²— EQ5Ç%›ÍÏçH&’Ȳˆ$
ä²’©®£T¥\©`T«<÷ÜslÛ¶\.G6›
Jxø³Ú…|ž|.Çú³Ï£©¹Ë²hIÆ(—˼üòTY&‹‘ÏeÉçstvõ Ë
ã£4¤Ñc1,Û¦û-üá‰gx~ãKàz¿Q’ˆòuFDD,óÅÕmܸ‘{{§žzŠÁÁÁ]:v>Ÿg`ÀixðÁ¹öÚkg¼¾víZn»í¶ÂoŸ½_í&ê¿Ÿ7¿ùÍÜxãœsÎ9T*&&&øÈG>Â<0Ëâº?‰¿ˆˆˆ%cGq€õÀ‚$ñä©ë9ø÷¾¦8@²*_ûö‡Ï'~ú»÷Ù8ÀZ5¾ AeÛ6‘%™öö.ª¦®é¤Óí$“)ôXèÒÝÝÍÈÈ(åR‘TcŽš`t ×u1“‘‘LÓ¤Z­Ö?Çq0M“¾mÛXsèZÞúöu¸ŽW€=•TÎÒØÜ‚}¸£    IDAT"Iˆ’„išH¢ˆã:ò9&'ÇM÷ŠÑ[Ž—f*“Ŷt-Êj±øÔß,ËâW¿ú

我想要一个正则表达式,它接受在 html、xml、json、css 文件中能找到的所有字符,但不接受读取图片时出现的那类字符(例如 sÎ9T*&&&øÈG>Â<0Ëâº?‰¿ˆˆˆ%cGq€õÀ‚$ñä)。

我最好的正则表达式是:

sÎ9T*&&&øÈG>Â<0Ëâº?‰¿ˆˆˆ%cGq€õÀ‚$ñä

我做错了什么?

感谢。

2 个答案:

答案 0 :(得分:0)

目前还不清楚你要求的是什么,但是你可以尝试检查任何非ASCII字符,或者&符号或尖括号。

像这样:

if ( $buffer =~ /[\P{ascii}&<>]/ ) {
    print "INVALID\n";
}
else {
    print "GOOD\n";
}
print $buffer, "\n\n";

答案 1 :(得分:0)

您可能希望只选择那些仅由可见 ASCII 字符和/或 \t、\r、\n 组成的行:

if ($substring4 =~ m/\A [\x09\x0A\x0D\x20-\x7E]* \z/xms) {
    print STDOUT "GOOD ...\n";
}

说明

  • 必须匹配整行:用量词 * 配合锚点 \A 和 \z,使字符类覆盖整个字符串,而不是只要有一个字符匹配就算通过
  • \n 和 \r 在 Perl 中不可移植,请改用 \x0A 和 \x0D
  • 处理二进制数据时,建议使用修饰符 m、s 以及锚点 \A 和 \z
  • 修饰符 x 允许对正则表达式进行排版(模式中的空白会被忽略)