Question

我的代码：

# Encode a sentence as a list of word numbers so repeated words share a number.
sent = str(input("Please input a sentence: "))

# Split on whitespace, ignoring case, so that "To" and "to" count as the same
# word (this is what yields 1,2,3,4,1,2 for "To be or not to be").
splitsent = sent.lower().split()

# Seeded with 0 so max(dl) is defined for the first word; removed again below.
dl = [0]

for v in splitsent:
    if splitsent.count(v) < 2:
        # Word occurs only once: give it the next number after the largest so far.
        dl.append(max(dl) + 1)
    else:
        # Repeated word: use the 1-based position of its first occurrence.
        dl.append(splitsent.index(v) + 1)

dl.remove(0)
print(sent, "\n", dl)

给出输出：

"1,2,3,4,1,2"

输入：

"To be or not to be"

这就是它的"压缩"形式。我如何从外部文件读取输出"1,2,3,4,1,2"，并将其还原为"To be or not to be"？

Answer 1

您的方法实际上不是压缩文本文件的有效方法，只需使用现有的zlib。

但是，作为学术练习，您需要使用 pickle 来存储您的字典键，这样在恢复时就能得到相同的值。如果您希望"压缩"形式在多次调用之间保持有效，以便能够成功解压缩先前"压缩"的文件，则需要为每个单词分配一个索引。如果您想要一种"标准"的 Python 方法，可以使用 collections 模块中的 OrderedDict 以这种方式创建索引：新单词被添加到末尾，但与传统的 dict 对象不同，旧单词保持其位置不变。更好的方法是使用 OrderedSet，但它不属于标准 Python，请参阅 this recipe。

大小写
您还必须决定 'THIS'、'this' 和 'ThIs' 是不同的单词还是同一个单词。也许每个单词标记都需要一个位域来指示每个字符是小写还是大写，例如 'ThIs' 得到标记 15，其位域为 10（二进制 "0b1010"，1 表示大写），在压缩文件中产生一个 (15, 10) 的元组。

标点符号
您还需要考虑标点符号：当一个单词被标点符号打断时，您需要一种方式在压缩形式中表示它，即为标点符号分配一个标记。但这有一个问题：解压缩时，您需要完全重建原文，因此必须正确处理标点符号。例如 "它是否正确？" -> [1,2,3,4] -> "它是否正确 ？"，或者 "它是否正确？"（没有空格）。因此，对于每个标点符号，您需要指明它如何与前一个和下一个字符连接。由于标点符号只有一个字符（即一个 8 位数字），您可能需要考虑将该字符按原样存放。

多个空格
您还需要处理多个空格。

示例代码
此代码不完整，大部分未经测试，可能无法处理所有用例，但它说明了该问题的一种可能解决方案。

要使用它，请创建一个名为 in.txt 的文件，其中包含要压缩的文本，然后运行 python compdict.py -c in.txt out.comp，或者 python compdict.py -d out.comp out.txt，或者 python compdict.py --list

from ordered_set import OrderedSet #pip install ordered_set
import os
import cPickle as pickle
import string
import argparse

class CompDecomp(object):
  """Word-dictionary text compressor / decompressor.

  Each distinct lowercased word is kept in an ordered collection
  (``self.dict``) that is pickled to ``self.fn`` whenever a new word is
  added. Text is compressed into a list of 3-tuples
  ``(index_or_text, caseflag, conjoin)``:

  * ``index_or_text`` -- integer position of the lowercased word in
    ``self.dict``, or a literal string (punctuation, digit, space) that is
    stored as-is;
  * ``caseflag`` -- integer whose binary digits, read left to right, mark
    which characters of the word are upper case (0 means "use as stored");
  * ``conjoin`` -- True when the token follows the previous token directly,
    False when a single space separates them.
  """

  __DEFAULT_PICKLE_FN__ = "my.dict"

  # Printable characters that are neither digits nor ASCII letters.
  printable_non_chars = set(string.printable) - set(string.digits) - set(string.ascii_letters)

  def __init__(self, fn=None, *args, **kw):
    """Load the word dictionary from *fn* (default: ``my.dict``)."""
    self.fn = self.__DEFAULT_PICKLE_FN__ if fn is None else fn
    self.dict = self.loaddict()

  def loaddict(self):
    """Return the pickled dictionary at ``self.fn``, or a new OrderedSet."""
    if os.path.exists(self.fn):
      # NOTE: unpickling can execute arbitrary code; only load dictionary
      # files that this program wrote itself.
      with open(self.fn, "rb") as pkl:
        return pickle.load(pkl)
    return OrderedSet()

  def savedict(self):
    """Pickle the current dictionary to ``self.fn``."""
    with open(self.fn, "wb") as pkl:
      pickle.dump(self.dict, pkl)

  def compressword(self, word, conjoin=False):
    """Return the ``(index, caseflag, conjoin)`` token for *word*.

    A word whose lowercase form is not yet known is appended to the
    dictionary, which is then saved immediately.
    """
    if word.lower() not in self.dict:
      self.dict.append(word.lower())
      print("New word: \'%s\'" % word)
      self.savedict()
    index, flag, _ = self.__caseflag__(word, conjoin)
    return index, flag, conjoin

  def decompressword(self, index, caseflag=0, conjoin=False):
    """Return ``(text, conjoin)`` for one compressed token.

    An integer *index* is looked up in the dictionary; anything else is
    taken to be the literal text. A non-zero *caseflag* is expanded to one
    binary digit per character: '1' upper-cases, '0' lower-cases.
    """
    word = self.dict[index] if isinstance(index, int) else index
    if caseflag == 0:
      return word, conjoin
    flag = bin(caseflag)[2:].zfill(len(word))
    res = "".join(c.upper() if bit == '1' else c.lower()
                  for c, bit in zip(word, flag))
    return res, conjoin

  def __caseflag__(self, word, conjoin):
    """Return ``(index, caseflag, conjoin)`` for a word already in the dictionary.

    Raises whatever ``self.dict.index`` raises when the lowercased word is
    not present.
    """
    index = self.dict.index(word.lower())
    if word.lower() == word:
      # Word is all lowercase
      return index, 0, conjoin
    if word.upper() == word:
      # Word is all uppercase
      return index, int("1" * len(word), 2), conjoin
    # Mixed case: one bit per character. str.isupper() is used instead of
    # the Python 2-only, locale-dependent string.uppercase.
    bits = "".join("1" if c.isupper() else "0" for c in word)
    return index, int(bits, 2), conjoin

  def compressfile(self, fileobj):
    """Read all text from *fileobj* (closing it) and return its token list."""
    with fileobj as f:
      data = f.read()

    # Characters that make up a word; apostrophied words count as words.
    wordchars = string.ascii_letters + '\''

    compress = []
    for position, word in enumerate(data.split(" ")):
      # Handle multiple spaces: every empty piece after the first piece
      # stands for one extra space. It is stored as a tuple so that
      # decompressfile accepts it. A leading empty piece becomes an empty
      # token, so the following word still receives its separating space.
      if word == "":
        compress.append(("" if position == 0 else " ", 0, True))
        continue

      # Handle punctuation: every character that is not a word character is
      # stored literally, glued to what precedes it. An empty subword (for
      # example before leading punctuation) is still emitted on purpose: it
      # carries the space that separates this piece from the previous one.
      csplit = [c for c in word if c not in wordchars]
      for n, c in enumerate(csplit):
        subword, word = word.split(c, 1)
        compress.append(self.compressword(subword, n > 0))
        compress.append((c, 0, True))

      # Handle the plain word, or whatever follows the last punctuation
      # character (glued to that character).
      if word:
        compress.append(self.compressword(word, bool(csplit)))
    return compress

  def decompressfile(self, fileobj):
    """Unpickle a token list from *fileobj* and return the decompressed text.

    Entries that are not tuples of length 1 to 3 are reported and skipped.
    """
    # NOTE: unpickling can execute arbitrary code; only decompress files
    # produced by compressfile.
    data = pickle.load(fileobj)

    pieces = []
    first = True
    for v in data:
      if not isinstance(v, tuple):
        print("Bad data %s" % v)
        continue
      if not 0 < len(v) <= 3:
        print("Bad data %s (length %d)" % (v, len(v)))
        continue
      d, conjoin = self.decompressword(*v)
      # Every token except the very first is preceded by one space unless
      # it is marked as conjoined.
      if not first and not conjoin:
        pieces.append(" ")
      pieces.append(d)
      first = False
    return "".join(pieces)


if __name__ == "__main__":
  # Invocation:
  #   python dictcompress.py [-h] [-compress | -decompress | --list] [<infile>] [<outfile>]
  # argparse accepts unambiguous prefixes of these options, so -c and -d
  # select -compress and -decompress.
  parser = argparse.ArgumentParser(description='Test file compress / decompress')

  group = parser.add_mutually_exclusive_group()
  parser.add_argument('infile', nargs='?', default=None)
  parser.add_argument('outfile', nargs='?', default=None)
  group.add_argument('-compress', action='store_true')
  group.add_argument('-decompress', action='store_true')
  group.add_argument('--list', action='store_true')

  args = parser.parse_args()
  infile, outfile = args.infile, args.outfile

  cd = CompDecomp()

  if args.list:
    # Dump every word currently stored in the dictionary.
    for k in cd.dict:
      print(k)
  elif args.compress or args.decompress:
    # Validate both paths before touching the output file, so a bad
    # invocation neither crashes later nor truncates an existing file.
    if infile is None or not os.path.exists(infile):
      parser.error("Input file missing")
    if outfile is None:
      parser.error("Output file missing")

    if args.compress:
      print("Compress")
      with open(infile, "r") as source:
        compressed = cd.compressfile(source)
      # The token list is pickled, so the output must be a binary file.
      with open(outfile, "wb") as of:
        pickle.dump(compressed, of)
    else:
      print("Decompress")
      # The compressed file is a pickle: read it in binary mode.
      with open(infile, "rb") as source:
        text = cd.decompressfile(source)
      with open(outfile, "w") as of:
        of.write(text)

读取（压缩）文件

1 个答案: