Question

我这里有一个难题……基本上，我正在执行一些非常基本的文件压缩步骤，如下所示：

打开文件并读为字符串/读入字符串
解析该字符串，并用表示该模式的较小文本替换重复模式（例如：aaaaaaaaaaa（11个字符）被[a＃$％11]（8个字符）替换）
将较小的新字符串保存到单独的文件中（可以比较大小）

由于某种原因，即使内存中的新字符串比原始字符串小3％，当我将字符串保存到文件中时，文件本身比文件系统上的原始文件大？这怎么可能呢？如果有人可以向我解释，那就太好了！

这是我用来执行此操作的代码：

void bkg_DoWork(object sender, DoWorkEventArgs e)
    {
        try
        {
            string file = File.ReadAllText(this.txt_CompressFilename.Text);

            int olength = file.Length;
            int nlength = 0;
            decimal pct = 0;

            string lastchar = "";
            int count = 0;

            List<RepeatingPattern> SinglePatterns = new List<RepeatingPattern>();
            List<RepeatingPattern> DoublePatterns = new List<RepeatingPattern>();
            List<RepeatingPattern> TriplePatterns = new List<RepeatingPattern>();
            List<RepeatingPattern> QuadruplePatterns = new List<RepeatingPattern>();

            UpdateProgress("Read file contents", 0, 1, 6);
            UpdateProgress("Finding single character replacements.", pct, 1, 6);

            //single character replaces.
            for (int i = 0; i < olength; i++)
            {
                if (file[i].ToString() == lastchar)
                    count += 1;
                else
                {
                    //create a pattern, if the count is more than what a pattern's compressed pattern looks like to save space... 8 chars
                    //[a#$%#]
                    if (count > 7)
                    {
                        //create and add a pattern to the list if necessary.
                        RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                        if (!SinglePatterns.Contains(ptn))
                            SinglePatterns.Add(ptn);
                    }
                    count = 0;
                    lastchar = file[i].ToString();
                }
            }

            //handle possible trailing pattern
            if (count > 7)
            {
                //create and add a pattern to the list if necessary.
                RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                if (!SinglePatterns.Contains(ptn))
                    SinglePatterns.Add(ptn);
            }

            if (SinglePatterns.Count > 0)
                for (int i = 0; i < SinglePatterns.Count; i++)
                    file = file.Replace(SinglePatterns[i].ToString(), SinglePatterns[i].ToReplaceString());

            nlength = file.Length;
            pct = (decimal)(((double)(olength - nlength) / olength) * 100);

            UpdateProgress("Found and replaced " + SinglePatterns.Count, pct, 2, 6);
            UpdateProgress("Finding double character replacements.", pct, 2, 6);

            lastchar = "";
            count = 0;

            //double character replaces.
            for (int i = 0; i + 1 < file.Length; i = i + 2)
            {
                if ("" + file[i] + "" + file[i + 1] == lastchar)
                    count += 1;
                else
                {
                    //create a pattern, if the count is more than what a pattern's compressed pattern looks like to save space... 8 chars
                    //[aa#$%#]
                    if (count > 8)
                    {
                        //create and add a pattern to the list if necessary.
                        RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                        if (!DoublePatterns.Contains(ptn))
                            DoublePatterns.Add(ptn);
                    }
                    count = 0;
                    lastchar = "" + file[i] + "" + file[i + 1];
                }
            }

            //handle possible trailing pattern
            if (count > 8)
            {
                //create and add a pattern to the list if necessary.
                RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                if (!DoublePatterns.Contains(ptn))
                    DoublePatterns.Add(ptn);
            }

            if (DoublePatterns.Count > 0)
                for (int i = 0; i < DoublePatterns.Count; i++)
                    file = file.Replace(DoublePatterns[i].ToString(), DoublePatterns[i].ToReplaceString());

            nlength = file.Length;
            pct = (decimal)(((double)(olength - nlength) / olength) * 100);

            UpdateProgress("Found and replaced " + DoublePatterns.Count, pct, 3, 6);
            UpdateProgress("Finding triple character replacements.", pct, 3, 6);

            lastchar = "";
            count = 0;

            //triple character replaces.
            for (int i = 0; i + 2 < file.Length; i = i + 3)
            {
                if ("" + file[i] + "" + file[i + 1] + "" + file[i + 2] == lastchar)
                    count += 1;
                else
                {
                    //create a pattern, if the count is more than what a pattern's compressed pattern looks like to save space... 8 chars
                    //[aaa#$%#]
                    if (count > 9)
                    {
                        //create and add a pattern to the list if necessary.
                        RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                        if (!TriplePatterns.Contains(ptn))
                            TriplePatterns.Add(ptn);
                    }
                    count = 0;
                    lastchar = "" + file[i] + "" + file[i + 1] + "" + file[i + 2];
                }
            }

            //handle possible trailing pattern
            if (count > 9)
            {
                //create and add a pattern to the list if necessary.
                RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                if (!TriplePatterns.Contains(ptn))
                    TriplePatterns.Add(ptn);
            }

            if (TriplePatterns.Count > 0)
                for (int i = 0; i < TriplePatterns.Count; i++)
                    file = file.Replace(TriplePatterns[i].ToString(), TriplePatterns[i].ToReplaceString());

            nlength = file.Length;
            pct = (decimal)(((double)(olength - nlength) / olength) * 100);

            UpdateProgress("Found and replaced " + TriplePatterns.Count, pct, 4, 6);
            UpdateProgress("Finding quadruple character replacements.", pct, 4, 6);

            lastchar = "";
            count = 0;

            //triple character replaces.
            for (int i = 0; i + 3 < file.Length; i = i + 4)
            {
                if ("" + file[i] + "" + file[i + 1] + "" + file[i + 2] + "" + file[i + 3] == lastchar)
                    count += 1;
                else
                {
                    //create a pattern, if the count is more than what a pattern's compressed pattern looks like to save space... 8 chars
                    //[aaaa#$%#]
                    if (count > 10)
                    {
                        //create and add a pattern to the list if necessary.
                        RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                        if (!QuadruplePatterns.Contains(ptn))
                            QuadruplePatterns.Add(ptn);
                    }
                    count = 0;
                    lastchar = "" + file[i] + "" + file[i + 1] + "" + file[i + 2] + "" + file[i + 3];
                }
            }

            //Handle possible trailing pattern
            if (count > 10)
            {
                //create and add a pattern to the list if necessary.
                RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
                if (!QuadruplePatterns.Contains(ptn))
                    QuadruplePatterns.Add(ptn);
            }

            if (QuadruplePatterns.Count > 0)
                for (int i = 0; i < QuadruplePatterns.Count; i++)
                    file = file.Replace(QuadruplePatterns[i].ToString(), QuadruplePatterns[i].ToReplaceString());

            nlength = file.Length;
            pct = (decimal)(((double)(olength - nlength) / olength) * 100);

            UpdateProgress("Found and replaced " + QuadruplePatterns.Count, pct, 5, 6);
            UpdateProgress("Saving new .cmp file...", pct, 5, 6);

            string newpath = this.txt_FolderName.Text + "\\" + Path.GetFileName(this.txt_CompressFilename.Text);
            newpath = newpath.Substring(0, newpath.LastIndexOf("."));
            newpath = newpath + ".cmp";
            File.WriteAllText(newpath, file);

            stopwatch.Stop();

            UpdateProgress("Compression completed! Time to compress file: " + string.Format("{0}", stopwatch.Elapsed), pct, 6, 6);
            string report = "Compression report\n\n";

            FileInfo inf = new FileInfo(this.txt_CompressFilename.Text);
            FileInfo infNew = new FileInfo(newpath);
            report += "Single character replacements made: " + SinglePatterns.Count + "\n\n";
            report += "Double character replacements made: " + DoublePatterns.Count + "\n\n";
            report += "Triple character replacements made: " + TriplePatterns.Count + "\n\n";
            report += "Quadruple character replacements made: " + QuadruplePatterns.Count + "\n\n";
            report += "Total compression ration achieved in string: " + pct + "% \n\n";
            report += "Old file size: " + inf.Length + "\nNew file size: " + infNew.Length + " in bytes.";

            report += "Total time to achieve compression: " + string.Format("{0}", stopwatch.Elapsed);
            e.Result = report;
        }
        catch (Exception ex)
        {
            e.Result = ex;
        }
    }

这是RepeatingPattern类的代码...

    using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;

namespace Compressor
{
    public class RepeatingPattern : IEquatable<RepeatingPattern>
    {
        public string RepeatingChar { get; set; }
        public int Count { get; set; }

        public RepeatingPattern()
        {
            this.RepeatingChar = "";
            this.Count = -1;
        }

        public RepeatingPattern(string rchar, int count)
        {
            this.RepeatingChar = rchar;
            this.Count = count;
        }

        public RepeatingPattern(string FromReplaceString)
        {

            FromReplaceString = FromReplaceString.Replace("[", "").Replace("]", "");

            List<string> parts = FromReplaceString.Split(new string[] { "#$%" }, StringSplitOptions.None).ToList();

            if (parts.Count != 2)
                throw new ArgumentException("Invalid argument count. Must be in this format: [a#$%N]");

            try
            {
                this.RepeatingChar = parts[0];
                this.Count = int.Parse(parts[1]);
            }
            catch (Exception ex)
            {
                throw new ArgumentException("Unable to cast the argument and create an object from it. Error: " + ex.Message);
            }
        }
        public override bool Equals(object obj)
        {
            RepeatingPattern tmp = obj as RepeatingPattern;
            if (tmp != null)
                return base.Equals(tmp);
            else
                throw new Exception("Invalid comparison type. Both objects must be of type RepeatingPattern");
        }

        public bool Equals(RepeatingPattern tmp)
        {
            return this.RepeatingChar == tmp.RepeatingChar && this.Count == tmp.Count;
        }

        public override int GetHashCode()
        {
            return this.RepeatingChar.GetHashCode() ^ this.Count.GetHashCode();
        }
        public override string ToString()
        {
            string retval = "";
            for (int i = 0; i < this.Count; i++)
                retval += this.RepeatingChar;

            return retval;
        }

        public string ToReplaceString()
        {
            return "[" + this.RepeatingChar + "#$%" + this.Count + "]";
        }
    }
}

Answer 1

出于好奇，我尝试了代码。一些区别：

我创建了一个辅助函数来查找文本中的运行
在遍历旧字符串而不是替换旧字符串时，我使用StringBuilder构建了一个新字符串

我认为我的代码比您的代码简单一些。我已经测试过：

Input:  "aaaaaaaaaaabbbcdcdcdcdcdcdxxxxxxxxxxxxxxxxxxhello"
Output: "[a#$%11]bbb[cd#$%6][x#$%18]hello"

这是代码。这是初稿。可能需要做很多改进：

static int FindRun(string s, int start, int length)
{
    if (start + length >= s.Length) return 0;

    int numRuns = 0;
    string pattern = s.Substring(start, length);
    for (int i = start + length; i <= s.Length - length; i += length)
    {
        if (s.Substring(i, length) == pattern) numRuns += 1;
        else break;
    }
    return numRuns;
}

static string EncodeString(string src)
{
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < src.Length; i++)
    {
        string theRun = null;
        int numRuns = 0;
        // Find runs of lengths 4, 3, 2, 1
        for (int j = 4; j >= 1; j--)
        {
            int runs = FindRun(src, i, j);
            if (runs > 1)  // Run found!
            {
                // Save it for later. Want to append the longest run
                theRun = src.Substring(i, j);
                numRuns = runs;
            }
        }
        // No run? Just append the letter
        if (theRun == null)
        {
            sb.Append(src[i]);
        }
        else
        {
            // This is the size of the run
            int replacementStringSize = (numRuns * theRun.Length) + (theRun.Length - 1);
            // This is the code to use as a replacement
            String runCode = String.Format("[{0}#$%{1}]", theRun, numRuns + 1);
            // Only append if the code length is smaller than the original run
            if (runCode.Length < replacementStringSize)
            {
                sb.Append(runCode);
            }
            else
            {
                // Don't encode. Put original run back
                for (int j = 0; j <= numRuns; j++)
                {
                    sb.Append(theRun);
                }
            }
            // Skip over the run
            i += replacementStringSize;
        }
    }
    return sb.ToString();
}

Answer 2

更大的输出文件的根本原因是由于编码。 ChromeSetup.exe为1,397,976字节。使用File.ReadAllText读取文件时，它将尝试检测字符串编码。在这种情况下，字符串的长度为1,327,384个字符。不过，这是关键，因为对每个字符进行编码都不一定是单个字节。例如，在UTF-8中，每个字符为1到4个字节。因此，当写出结果字符串时，单个字符可能会变成多个字节。

对于读取/写入可执行文件/二进制文件，最好使用File.ReadAllBytes()`File.WriteAllBytes（）`。

在尝试运行您的代码时，我遇到了其他几个错误。这是我发现的错误。

1）在double / triple / quad字符中替换了for循环边界，应检查将要使用的字符。

//double character replaces.
for (int i = 0; i < file.Length; i = i + 2)
{
  if ("" + file[i] + "" + file[i + 1] == lastchar)

如果文件字符串是奇数个字符，将导致索引不足异常。添加+ 1来解决此问题。

for (int i = 0; i + 1 < file.Length; i = i + 2)

对于三元组，它将为+ 2，对于四元组+ 3。

2）如果字符串以重复模式结尾，则处理不正确。在for循环中，仅当遇到其他字符时才检查模式计数。因此，如果模式位于字符串的末尾，则不会检测到该模式。您可以通过检查for循环后的计数来解决此问题。

if (count > 7)
{
    //create and add a pattern to the list if necessary.
    RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
    if (!SinglePatterns.Contains(ptn))
      SinglePatterns.Add(ptn);
}

3）count和lastchar应该在每个for循环之前重设。如果一个for循环以count = 17结尾，而下一个for循环运行，则会添加一个重复计数模式17，该模式已经被替换。

4）正如其他人提到的那样，在进行输入字符串替换时可能会引起问题。

如果您可以发布自己的RepeatingPattern代码和输入文本文件，那么我们可以找出导致输出文件较大的确切原因。

编辑：使用您的RepeatingPattern代码运行我看到另一个小错误。模式“ aaaaaaaaaaaa”变为“ [a＃$％9] a”。它应该替换另一个字符。这可能会使您的输出字符串略长于预期。要解决此问题，请在启动新模式时将count的{{1}}设置为1（而不是0）。

字符串越小，文件大小越大

2 个答案: