Question

我想解析包含分号 ; 的 Python 代码（分号用来分隔语句），并生成把这些分号替换为换行符 \n 的代码。例如，从

def main():
    a = "a;b"; return a

我想得到

def main():
    a = "a;b"
    return a

有什么建议吗？

Answer 1

使用 tokenize 库查找第二个元素为 ; 的 token.OP 令牌 *，然后用 token.NEWLINE 令牌替换这些令牌。

您还需要调整令牌的偏移量并生成匹配的缩进；因此，在插入 NEWLINE 之后，需要调整行号（每插入一个 NEWLINE，行偏移量就加一），并且“下一行”（即当前行的其余部分）的列索引必须调整为与当前缩进级别一致：

import tokenize

TokenInfo = getattr(tokenize, 'TokenInfo', lambda *a: a)  # Python 2 has no TokenInfo; fall back to plain tuples

def semicolon_to_newline(tokens):
    """Yield ``tokens`` with every ``;`` statement separator turned into a NEWLINE.

    ``tokens`` is an iterable of 5-tuples as produced by
    ``tokenize.generate_tokens``.  Each ``;`` OP token is replaced by a
    NEWLINE token, and the positions of the tokens that follow are rewritten
    so that ``tokenize.untokenize`` places the rest of the physical line on a
    new line, starting at the indentation column of the enclosing block.

    The ``line`` element of each token is passed through unchanged.
    """
    line_offset = 0     # number of NEWLINE tokens inserted so far
    last_indent = 0     # column of the current block; module level is column 0
    col_offset = None   # None: not splitting; 0: shift not measured yet; else the shift
    split_row = None    # original row of the physical line currently being split
    for ttype, tstr, (slno, scol), (elno, ecol), line in tokens:
        # Remember the original rows: the column shift is only meaningful for
        # positions on the physical line that contained the semicolon.
        start_row, end_row = slno, elno
        slno, elno = slno + line_offset, elno + line_offset
        if ttype in (tokenize.INDENT, tokenize.DEDENT):
            last_indent = ecol  # block is indented to this column
        elif ttype == tokenize.OP and tstr == ';':
            # swap out semicolon with a newline
            ttype = tokenize.NEWLINE
            tstr = '\n'
            line_offset += 1
            if col_offset is not None and start_row == split_row:
                # this semicolon itself sits on an already-shifted line
                scol, ecol = scol - col_offset, ecol - col_offset
            col_offset = 0  # next tokens should start at the current indent
            split_row = start_row
        elif col_offset is not None:
            if start_row == split_row:
                if not col_offset:
                    # first token after the semicolon: measure how far left
                    # the remainder of the line has to move
                    col_offset = scol - last_indent
                scol -= col_offset
            if end_row == split_row:
                # a token ending on a later physical line (multi-line string)
                # keeps its original end column
                ecol -= col_offset
            if ttype == tokenize.NEWLINE:
                # end of the logical line; stop shifting
                col_offset = None
        yield TokenInfo(
            ttype, tstr, (slno, scol), (elno, ecol), line)

# Tokenize `sourcefile`, replace the ';' separators, and write the
# reassembled source text to `destination`.
with open(sourcefile, 'r') as infile, open(destination, 'w') as outfile:
    token_stream = tokenize.generate_tokens(infile.readline)
    rewritten = semicolon_to_newline(token_stream)
    outfile.write(tokenize.untokenize(rewritten))

请注意，我没有去修正 line 值；它仅供参考，在把令牌还原为源码（untokenize）时，实际上并不会用到从文件中读取的原始行数据。

演示：

>>> from io import StringIO
>>> source = StringIO('''\
... def main():
...     a = "a;b"; return a
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
def main():
    a = "a;b"
    return a

稍微复杂一点：

>>> source = StringIO('''\
... class Foo(object):
...     def bar(self):
...         a = 10; b = 11; c = 12
...         if self.spam:
...             x = 12; return x
...         x = 15; return y
...
...     def baz(self):
...         return self.bar;
...         # note, nothing after the semicolon
... ''')
>>> generator = tokenize.generate_tokens(source.readline)
>>> result = tokenize.untokenize(semicolon_to_newline(generator))
>>> print(result)
class Foo(object):
    def bar(self):
        a = 10
        b = 11
        c = 12
        if self.spam:
            x = 12
            return x
        x = 15
        return y

    def baz(self):
        return self.bar

        # note, nothing after the semicolon

>>> print(result.replace(' ', '.'))
class.Foo(object):
....def.bar(self):
........a.=.10
........b.=.11
........c.=.12
........if.self.spam:
............x.=.12
............return.x
........x.=.15
........return.y

....def.baz(self):
........return.self.bar
........
........#.note,.nothing.after.the.semicolon

* Python 3 版本的 tokenize 会输出信息更丰富的 TokenInfo 具名元组，它们带有额外的 exact_type 属性，可用来代替文本匹配：tok.exact_type == tokenize.SEMI。不过我让上面的代码同时兼容 Python 2 和 3。

Answer 2

这是一个pyparsing解决方案 - 请参阅以下代码中的注释：

from pyparsing import Literal, restOfLine, quotedString, pythonStyleComment, line

# SEMI matches a single literal ';'.
SEMI = Literal(';')
# patt matches a ';' followed by whatever remains on that line.
patt = SEMI + restOfLine
# Ignore quoted strings and Python comments while matching; presumably this is
# what keeps ';' characters inside them from being treated as separators
# (verify against the pyparsing version in use).
patt.ignore(quotedString)
patt.ignore(pythonStyleComment)

def split_at(s, locs):
    """
    Cut s at each index in locs and return the pieces as a list.

    The character at each break index is dropped, and leading
    whitespace is stripped from every piece (trailing whitespace is kept).
    """
    cuts = list(locs)
    # Each piece starts one past the previous cut and runs up to the next
    # cut; the final piece runs to the end of the string.
    starts = [0] + [cut + 1 for cut in cuts]
    stops = cuts + [None]
    return [s[begin:end].lstrip() for begin, end in zip(starts, stops)]

def split_on_semicolon(s,l,tokens):
    """
    Parse action run when the first unquoted ';' on a line is matched.

    Returns the replacement text for the match: an empty string when only
    whitespace follows the ';', otherwise a newline followed by each
    ';'-separated piece on its own line, prefixed with the original
    line's leading whitespace.
    """
    whole_line = line(l,s)
    stripped = whole_line.lstrip()
    leading_ws = whole_line[:whole_line.index(stripped)]

    # tokens[1] is everything that followed the matched ';'
    after_semi = tokens[1]
    if not after_semi.strip():
        return ''

    # there may be further ';' in the remainder; collect their positions
    semi_positions = [start for _, start, _ in SEMI.scanString(after_semi)]
    statements = split_at(after_semi, semi_positions)

    # one statement per line, each carrying the original indent
    return '\n'+'\n'.join(leading_ws+stmt for stmt in statements)

# Register split_on_semicolon as the parse action for every match of patt.
patt.addParseAction(split_on_semicolon)

# Demo input: semicolons as separators, trailing semicolons, and semicolons
# inside string literals, a docstring and comments.
sample = """
def main():
    this_semi_does_nothing();
    neither_does_this_but_there_are_spaces_afterward();   
    a = "a;b"; return a # this is a comment; it has a semicolon!

def b():
    if False:
        z=1000;b("; in quotes");  c=200;return z
    return ';'

class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10; b = 11; c = 12

        # this comment; has several; semicolons
        if self.spam:
            x = 12; return x # so; does; this; one
        x = 15;;; y += x; return y

    def baz(self):
        return self.bar
"""
# Run patt over the sample and print the transformed text.
print(patt.transformString(sample))

给出：

def main():
    this_semi_does_nothing()
    neither_does_this_but_there_are_spaces_afterward()
    a = "a;b"
    return a # this is a comment; it has a semicolon!

def b():
    if False:
        z=1000
        b("; in quotes")
        c=200
        return z
    return ';'

class Foo(object):
    def bar(self):
        '''a docstring; with a semicolon'''
        a = 10
        b = 11
        c = 12

        # this comment; has several; semicolons
        if self.spam:
            x = 12
            return x # so; does; this; one
        x = 15
        y += x


        return y

    def baz(self):
        return self.bar

在python代码中用换行符替换分号

2 个答案: