我这里有一个难题……基本上,我正在执行一些非常基本的文件压缩步骤,如下所示:
由于某种原因,即使内存中的新字符串比原始字符串小3%,当我将字符串保存到文件后,生成的文件却比文件系统上的原始文件还要大。这怎么可能呢?如果有人可以向我解释,那就太好了!
这是我用来执行此操作的代码:
/// <summary>
/// Background-worker handler that run-length "compresses" the file named in
/// <c>txt_CompressFilename</c> and writes the result as a <c>.cmp</c> file into the folder named
/// in <c>txt_FolderName</c>.
/// Four passes look for repeated units of 1, 2, 3 and 4 characters; every run that is long
/// enough is replaced by the token produced by <c>RepeatingPattern.ToReplaceString()</c>.
/// On success <c>e.Result</c> receives a textual report; on failure it receives the exception.
/// </summary>
/// <param name="sender">The object that raised the event (not used).</param>
/// <param name="e">Event data; only <c>e.Result</c> is written.</param>
void bkg_DoWork(object sender, DoWorkEventArgs e)
{
    try
    {
        // NOTE(review): ReadAllText/WriteAllText decode and re-encode the data as text. For binary
        // input the number of bytes written is therefore not the number of characters in the
        // string, so the file can grow even though the string shrank. Consider
        // File.ReadAllBytes/File.WriteAllBytes (or a single-byte encoding on both sides) --
        // not changed here because it would alter the on-disk format that a decoder must read.
        string file = File.ReadAllText(this.txt_CompressFilename.Text);
        int olength = file.Length;
        decimal pct = 0;
        UpdateProgress("Read file contents", 0, 1, 6);

        // Minimum counts are the original thresholds (count > 7, 8, 9, 10).
        UpdateProgress("Finding single character replacements.", pct, 1, 6);
        List<RepeatingPattern> SinglePatterns = FindRepeatingPatterns(file, 1, 7);
        file = ApplyPatterns(file, SinglePatterns);
        pct = PercentSaved(olength, file.Length);
        UpdateProgress("Found and replaced " + SinglePatterns.Count, pct, 2, 6);

        UpdateProgress("Finding double character replacements.", pct, 2, 6);
        List<RepeatingPattern> DoublePatterns = FindRepeatingPatterns(file, 2, 8);
        file = ApplyPatterns(file, DoublePatterns);
        pct = PercentSaved(olength, file.Length);
        UpdateProgress("Found and replaced " + DoublePatterns.Count, pct, 3, 6);

        UpdateProgress("Finding triple character replacements.", pct, 3, 6);
        List<RepeatingPattern> TriplePatterns = FindRepeatingPatterns(file, 3, 9);
        file = ApplyPatterns(file, TriplePatterns);
        pct = PercentSaved(olength, file.Length);
        UpdateProgress("Found and replaced " + TriplePatterns.Count, pct, 4, 6);

        UpdateProgress("Finding quadruple character replacements.", pct, 4, 6);
        List<RepeatingPattern> QuadruplePatterns = FindRepeatingPatterns(file, 4, 10);
        file = ApplyPatterns(file, QuadruplePatterns);
        pct = PercentSaved(olength, file.Length);
        UpdateProgress("Found and replaced " + QuadruplePatterns.Count, pct, 5, 6);

        UpdateProgress("Saving new .cmp file...", pct, 5, 6);
        // Path.ChangeExtension also copes with a source file that has no extension; cutting at
        // the last '.' of the whole path would throw, or truncate at a dot in the folder name.
        string newpath = this.txt_FolderName.Text + "\\"
            + Path.ChangeExtension(Path.GetFileName(this.txt_CompressFilename.Text), ".cmp");
        File.WriteAllText(newpath, file);
        stopwatch.Stop();
        UpdateProgress("Compression completed! Time to compress file: " + string.Format("{0}", stopwatch.Elapsed), pct, 6, 6);

        string report = "Compression report\n\n";
        FileInfo inf = new FileInfo(this.txt_CompressFilename.Text);
        FileInfo infNew = new FileInfo(newpath);
        report += "Single character replacements made: " + SinglePatterns.Count + "\n\n";
        report += "Double character replacements made: " + DoublePatterns.Count + "\n\n";
        report += "Triple character replacements made: " + TriplePatterns.Count + "\n\n";
        report += "Quadruple character replacements made: " + QuadruplePatterns.Count + "\n\n";
        report += "Total compression ration achieved in string: " + pct + "% \n\n";
        report += "Old file size: " + inf.Length + "\nNew file size: " + infNew.Length + " in bytes.";
        report += "Total time to achieve compression: " + string.Format("{0}", stopwatch.Elapsed);
        e.Result = report;
    }
    catch (Exception ex)
    {
        // Hand the failure back through the worker result instead of throwing on the worker thread.
        e.Result = ex;
    }
}

/// <summary>
/// Scans <paramref name="text"/> in consecutive, non-overlapping units of
/// <paramref name="width"/> characters and returns one pattern for every distinct
/// (unit, repeat count) whose repeat count is greater than <paramref name="minCount"/>.
/// </summary>
private static List<RepeatingPattern> FindRepeatingPatterns(string text, int width, int minCount)
{
    List<RepeatingPattern> patterns = new List<RepeatingPattern>();
    string lastUnit = "";
    int count = 0;

    // Stop as soon as a full unit no longer fits, so indexing never runs past the end.
    for (int i = 0; i + width - 1 < text.Length; i += width)
    {
        string unit = text.Substring(i, width);
        if (unit == lastUnit)
        {
            count += 1;
        }
        else
        {
            AddPatternIfLongEnough(patterns, lastUnit, count, minCount);
            // The unit just read is the first occurrence of the new run. Starting at 0 would
            // record every run one repeat short and leave a stray unit behind after replacing.
            count = 1;
            lastUnit = unit;
        }
    }

    // A run that reaches the end of the text is never closed inside the loop.
    AddPatternIfLongEnough(patterns, lastUnit, count, minCount);
    return patterns;
}

/// <summary>Adds the (unit, count) pattern when the run is long enough and not already listed.</summary>
private static void AddPatternIfLongEnough(List<RepeatingPattern> patterns, string unit, int count, int minCount)
{
    if (count <= minCount)
    {
        return;
    }

    RepeatingPattern ptn = new RepeatingPattern(unit, count);
    if (!patterns.Contains(ptn))
    {
        patterns.Add(ptn);
    }
}

/// <summary>
/// Replaces every occurrence of each pattern's expanded text with its token and returns the
/// resulting string.
/// </summary>
private static string ApplyPatterns(string text, List<RepeatingPattern> patterns)
{
    // Longest runs first: replacing a shorter run of the same unit first would chop a longer
    // run into several tokens plus leftovers, so the longer pattern would never match.
    List<RepeatingPattern> ordered = new List<RepeatingPattern>(patterns);
    ordered.Sort((a, b) => b.Count.CompareTo(a.Count));

    foreach (RepeatingPattern ptn in ordered)
    {
        text = text.Replace(ptn.ToString(), ptn.ToReplaceString());
    }
    return text;
}

/// <summary>
/// Percentage by which the string shrank relative to its original length; 0 for an empty
/// original (avoids converting NaN to decimal, which throws).
/// </summary>
private static decimal PercentSaved(int originalLength, int newLength)
{
    if (originalLength == 0)
    {
        return 0;
    }
    return (decimal)(((double)(originalLength - newLength) / originalLength) * 100);
}
这是RepeatingPattern类的代码...
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
namespace Compressor
{
/// <summary>
/// A run of <see cref="Count"/> consecutive copies of <see cref="RepeatingChar"/>, convertible to
/// and from the token form <c>[unit#$%count]</c>.
/// Two patterns are equal when both the unit and the count are equal.
/// </summary>
public class RepeatingPattern : IEquatable<RepeatingPattern>
{
    // Separates the repeated unit from the count inside a token.
    private const string Separator = "#$%";

    /// <summary>The repeated unit (one or more characters).</summary>
    public string RepeatingChar { get; set; }

    /// <summary>How many times the unit occurs in the run.</summary>
    public int Count { get; set; }

    /// <summary>Creates an empty pattern (empty unit, count -1).</summary>
    public RepeatingPattern()
    {
        this.RepeatingChar = "";
        this.Count = -1;
    }

    /// <summary>Creates a pattern from a unit and its repeat count.</summary>
    public RepeatingPattern(string rchar, int count)
    {
        this.RepeatingChar = rchar;
        this.Count = count;
    }

    /// <summary>Parses a token of the form <c>[unit#$%count]</c>.</summary>
    /// <param name="FromReplaceString">The token; the enclosing brackets are optional.</param>
    /// <exception cref="ArgumentNullException">The argument is null.</exception>
    /// <exception cref="ArgumentException">
    /// The text does not split into exactly a unit and a count, or the count is not an integer.
    /// </exception>
    public RepeatingPattern(string FromReplaceString)
    {
        if (FromReplaceString == null)
            throw new ArgumentNullException("FromReplaceString");

        // Remove only the delimiting brackets. Removing every '[' and ']' would also strip them
        // from the unit, so a run of '[' or ']' characters could not be parsed back.
        string inner = FromReplaceString;
        if (inner.StartsWith("[", StringComparison.Ordinal))
            inner = inner.Substring(1);
        if (inner.EndsWith("]", StringComparison.Ordinal))
            inner = inner.Substring(0, inner.Length - 1);

        string[] parts = inner.Split(new string[] { Separator }, StringSplitOptions.None);
        if (parts.Length != 2)
            throw new ArgumentException("Invalid argument count. Must be in this format: [a#$%N]");

        try
        {
            this.RepeatingChar = parts[0];
            this.Count = int.Parse(parts[1]);
        }
        catch (Exception ex)
        {
            // Keep the original failure as the inner exception for diagnosis.
            throw new ArgumentException("Unable to cast the argument and create an object from it. Error: " + ex.Message, ex);
        }
    }

    /// <summary>
    /// Value equality, consistent with <see cref="Equals(RepeatingPattern)"/> and
    /// <see cref="GetHashCode"/>; returns false for null or for objects of another type.
    /// </summary>
    public override bool Equals(object obj)
    {
        return Equals(obj as RepeatingPattern);
    }

    /// <summary>True when <paramref name="tmp"/> has the same unit and the same count.</summary>
    public bool Equals(RepeatingPattern tmp)
    {
        if (ReferenceEquals(tmp, null))
            return false;
        return this.RepeatingChar == tmp.RepeatingChar && this.Count == tmp.Count;
    }

    /// <summary>Hash code combining the unit and the count; a null unit hashes like "".</summary>
    public override int GetHashCode()
    {
        return (this.RepeatingChar ?? "").GetHashCode() ^ this.Count.GetHashCode();
    }

    /// <summary>
    /// The expanded run: the unit repeated <see cref="Count"/> times, or "" when the count is
    /// not positive or the unit is null or empty.
    /// </summary>
    public override string ToString()
    {
        if (this.Count <= 0 || string.IsNullOrEmpty(this.RepeatingChar))
            return "";

        // StringBuilder avoids re-copying the whole prefix on every repetition.
        StringBuilder expanded = new StringBuilder();
        for (int i = 0; i < this.Count; i++)
            expanded.Append(this.RepeatingChar);
        return expanded.ToString();
    }

    /// <summary>The token form <c>[unit#$%count]</c>.</summary>
    public string ToReplaceString()
    {
        return "[" + this.RepeatingChar + Separator + this.Count + "]";
    }
}
}
答案 0 :(得分:1)
出于好奇,我尝试了代码。一些区别:
- 我使用 `StringBuilder` 构建了一个新字符串。
我认为我的代码比您的代码简单一些。我已经用以下输入测试过:
Input: "aaaaaaaaaaabbbcdcdcdcdcdcdxxxxxxxxxxxxxxxxxxhello" Output: "[a#$%11]bbb[cd#$%6][x#$%18]hello"
这是代码。这是初稿。可能需要做很多改进:
/// <summary>
/// Counts how many times the <paramref name="length"/>-character unit that begins at
/// <paramref name="start"/> occurs again immediately after itself in <paramref name="s"/>.
/// The first occurrence is not counted, so a unit that appears three times in a row yields 2.
/// </summary>
/// <returns>
/// The number of back-to-back repeats, or 0 when no characters follow the first unit.
/// </returns>
static int FindRun(string s, int start, int length)
{
    // Nothing follows the first unit, so there cannot be a repeat.
    if (s.Length <= start + length)
    {
        return 0;
    }

    string unit = s.Substring(start, length);
    int lastCandidate = s.Length - length;   // last index where a whole unit still fits
    int repeats = 0;
    int pos = start + length;

    // Walk unit by unit until a mismatch or until a whole unit no longer fits.
    while (pos <= lastCandidate && s.Substring(pos, length) == unit)
    {
        repeats++;
        pos += length;
    }

    return repeats;
}
/// <summary>
/// Run-length encodes <paramref name="src"/>: a stretch made of one unit of 1 to 4 characters
/// repeated back to back is replaced by <c>[unit#$%count]</c> when that token is shorter than
/// the stretch; everything else is copied unchanged.
/// Example: "aaaaaaaaaaabbbcdcdcdcdcdcdxxxxxxxxxxxxxxxxxxhello" becomes
/// "[a#$%11]bbb[cd#$%6][x#$%18]hello".
/// </summary>
/// <param name="src">The text to encode.</param>
/// <returns>The encoded text.</returns>
/// <exception cref="ArgumentNullException"><paramref name="src"/> is null.</exception>
static string EncodeString(string src)
{
    if (src == null)
    {
        throw new ArgumentNullException("src");
    }

    StringBuilder sb = new StringBuilder();
    int i = 0;
    while (i < src.Length)
    {
        // Among unit lengths 1..4, keep the one whose run covers the most characters.
        // Keeping whichever candidate was examined last could pick a short unit that covers
        // far less text (e.g. "aaa" instead of "aaab" repeated three times).
        int bestUnitLength = 0;
        int bestRepeats = 0;
        int bestCoverage = 0;
        for (int unitLength = 1; unitLength <= 4; unitLength++)
        {
            int repeats = FindRun(src, i, unitLength);
            if (repeats > 1)
            {
                // FindRun does not count the first occurrence of the unit.
                int coverage = (repeats + 1) * unitLength;
                // Strict comparison: on a tie the shorter unit wins, giving the shorter token.
                if (coverage > bestCoverage)
                {
                    bestUnitLength = unitLength;
                    bestRepeats = repeats;
                    bestCoverage = coverage;
                }
            }
        }

        // No run starts here: copy the single character.
        if (bestUnitLength == 0)
        {
            sb.Append(src[i]);
            i += 1;
            continue;
        }

        string theRun = src.Substring(i, bestUnitLength);
        String runCode = String.Format("[{0}#$%{1}]", theRun, bestRepeats + 1);

        // Encode only when the token is shorter than the text it stands for.
        if (runCode.Length < bestCoverage)
        {
            sb.Append(runCode);
        }
        else
        {
            // Not worth encoding: copy the original stretch verbatim.
            sb.Append(src, i, bestCoverage);
        }

        // Continue right after the run.
        i += bestCoverage;
    }
    return sb.ToString();
}
答案 1 :(得分:1)
输出文件更大的根本原因在于编码。ChromeSetup.exe 为 1,397,976 字节。使用 `File.ReadAllText` 读取文件时,它会尝试检测文本编码;在这种情况下,得到的字符串长度为 1,327,384 个字符。关键在于:一个字符编码后不一定只占一个字节。例如,在 UTF-8 中,每个字符占 1 到 4 个字节。因此,在写出结果字符串时,单个字符可能会变成多个字节。
对于读取/写入可执行文件/二进制文件,最好使用 `File.ReadAllBytes()` 和 `File.WriteAllBytes()`。
在尝试运行您的代码时,我遇到了其他几个错误。这是我发现的错误。
1)在双字符/三字符/四字符替换中,for 循环的边界条件应当检查循环体内将要访问的所有字符。
//double character replaces.
for (int i = 0; i < file.Length; i = i + 2)
{
if ("" + file[i] + "" + file[i + 1] == lastchar)
如果文件字符串的字符数为奇数,这将导致索引越界异常。在循环条件中加上 `+ 1` 即可解决此问题。
for (int i = 0; i + 1 < file.Length; i = i + 2)
对于三字符替换,应为 `+ 2`;对于四字符替换,应为 `+ 3`。
2)如果字符串以重复模式结尾,则处理不正确。在for循环中,仅当遇到其他字符时才检查模式计数。因此,如果模式位于字符串的末尾,则不会检测到该模式。您可以通过检查for循环后的计数来解决此问题。
if (count > 7)
{
//create and add a pattern to the list if necessary.
RepeatingPattern ptn = new RepeatingPattern(lastchar.ToString(), count);
if (!SinglePatterns.Contains(ptn))
SinglePatterns.Add(ptn);
}
3)`count` 和 `lastchar` 应该在每个 for 循环之前重置。如果一个 for 循环以 `count = 17` 结束,而接着运行下一个 for 循环,就会添加一个计数为 17 的重复模式,而该模式其实已经被替换过了。
4)正如其他人提到的那样,在进行输入字符串替换时可能会引起问题。
如果您可以发布自己的 `RepeatingPattern` 代码和输入文本文件,那么我们可以找出导致输出文件较大的确切原因。
编辑:使用您的 RepeatingPattern 代码运行后,我看到另一个小错误。模式“aaaaaaaaaaaa”变为“[a#$%9] a”,它本应再多替换一个字符。这可能会使您的输出字符串略长于预期。要解决此问题,请在开始新模式时将 `count` 设置为 1(而不是 0)。