字符串标记生成器无法正确处理行中的空标记或多个分隔符

时间:2014-12-16 02:48:28

标签: javascript tokenize strtok

所以我在javascript中有一个简单的strtok实现。出于某种原因,如果您尝试在字符串中的一行中传递多个分隔符,则会将每个其他分隔符作为字符进行分析。例如,由"This,, and that"","分隔的" "将返回[This,andthat的数组。第二个逗号,因为它紧跟在另一个分隔符之后被视为非分隔符。

这是代码:

//@param str: a string to tokenize
//@param tok: an array of chars to be delimiters
//@return: an array of strings.

var strtok = function(str, tok)
{
    if(str === "") return [];

    var ret = [],
    _buffer = str,
    sub,
    b = _buffer.charAt(0),
    start = 0,
    i = 0,
    len = _buffer.length; 

    while(i <= len) {
        b = _buffer.charAt(i);
        tok.forEach(function(elm) {
            if (b === elm) {              
                sub = _buffer.slice(start, i);
                if(!(sub in tok)) ret.push(sub);
                start = ++i;
                return;
            } 
        });  
        i++;
    }
    ret.push(_buffer.slice(start, len));
    return ret;
}

var str = "This,, that";
var tok = '. ,'.split('');
console.log(strtok(str,tok));
// ["This", ",", "that"]
str = "This,,, that";
console.log(strtok(str, tok));
// ["This", ",", " that"] <- notice the space before that

1 个答案:

答案 0 :(得分:1)

索引过度增加,归咎于start = ++i;,修正为start = i + 1;