Question

我有如下所示的文件，并使用迭代器块来解析文件中某些重复出现的节点/部分。我最初使用正则表达式来解析整个文件，但是当节点中缺少某些字段时，正则表达式就无法匹配，所以我尝试改用 yield 模式。文件格式和我正在使用的代码如下所示。我只想从文件中把每个 REPLICATE 节点作为单独的部分取出来，这样就可以通过键字符串获取其中的字段，并存储到对象集合中。我能够从第一个 REPLICATE 出现的位置开始解析，但无法在 REPLICATE 节点结束的地方停止。

文件格式：

X_HEADER
{
    DATA_MANAGEMENT_FIELD_2     NA
    DATA_MANAGEMENT_FIELD_3     NA
    DATA_MANAGEMENT_FIELD_4     NA
    SYSTEM_SOFTWARE_VERSION     NA
}
Y_HEADER
{
    DATA_MANAGEMENT_FIELD_2     NA
    DATA_MANAGEMENT_FIELD_3     NA
    DATA_MANAGEMENT_FIELD_4     NA
    SYSTEM_SOFTWARE_VERSION     NA
}
COMPLETION
{
    NUMBER          877
    VERSION         4
    CALIBRATION_VERSION 1
    CONFIGURATION_ID    877    
}
REPLICATE
{
    REPLICATE_ID            1985
    ASSAY_NUMBER            656
    ASSAY_VERSION           4
    ASSAY_STATUS            Research
    DILUTION_ID         1
}
REPLICATE
{
    REPLICATE_ID            1985
    ASSAY_NUMBER            656
    ASSAY_VERSION           4
    ASSAY_STATUS            Research
}

代码：

// Attempt from the question: stream the file line by line and yield one
// dictionary per REPLICATE section.
// NOTE(review): as written, nothing is ever added to `current`, so every
// dictionary yielded by this method is empty.
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        var current = new Dictionary<string, string>();
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            // Skip blank lines.
            if (string.IsNullOrWhiteSpace(line)) continue;

            if (line.StartsWith("REPLICATE"))
            {
                // A REPLICATE header yields the dictionary built so far and
                // then starts a fresh one; the section's end is never detected.
                yield return current;
                current = new Dictionary<string, string>();
            }
            else
            {
                // The split result is discarded; it is never stored in `current`.
                var parts = line.Split('\t');
            }

            // This check sits inside the loop, so it runs after every
            // non-blank line rather than once at end of file.
            if (current.Count > 0) yield return current;
        }
    }
}

// Enumerates the dictionaries produced by ReadParts for the given file.
// The loop body is a placeholder in the question.
public static void parseFile(string fileName)
    {
        foreach (var part in ReadParts(fileName))
        {
           // part["FIELD1"] would retrieve a field value from the current REPLICATE part here.
        }
    }

Answer 1

嗯，听起来你需要在遇到右花括号时“关闭”一个部分，并且只在那个时候才执行 yield return。例如：

/// <summary>
/// Lazily reads the file at <paramref name="path"/> and yields one dictionary
/// for every completed <c>REPLICATE</c> section.
/// </summary>
/// <remarks>
/// The parser is a small state machine driven by two variables:
/// <c>currentName</c> (set once a section name has been read) and
/// <c>currentMap</c> (set once that section's opening brace has been read).
/// Section names and braces must start in the first column; name/value lines
/// must start with a tab and contain exactly one further tab separating the
/// name from the value.
/// </remarks>
/// <param name="path">Path of the text file to parse.</param>
/// <returns>
/// A deferred sequence with one name-to-value map per <c>REPLICATE</c>
/// section, in file order. Other sections are validated but not returned.
/// </returns>
/// <exception cref="BadDataException">
/// A brace, section name or name/value pair appears where the current state
/// does not allow it, a pair is malformed, or the file ends inside a section.
/// </exception>
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        string currentName = null;
        IDictionary<string, string> currentMap = null;
        // The loop variable must be declared before it is assigned in the
        // while condition below.
        string line;
        while ((line = reader.ReadLine()) != null)
        {
            if (string.IsNullOrWhiteSpace(line))
            {
                continue;
            }
            if (line == "{")
            {
                // An opening brace is only legal directly after a section name.
                if (currentName == null || currentMap != null)
                {
                    throw new BadDataException("Open brace at wrong place");
                }
                currentMap = new Dictionary<string, string>();
            }
            else if (line == "}")
            {
                if (currentName == null || currentMap == null)
                {
                    throw new BadDataException("Closing brace at wrong place");
                }
                // Isolate the "REPLICATE-only" requirement to a single
                // line - if you ever need other bits, you can change this.
                if (currentName == "REPLICATE")
                {
                    yield return currentMap;
                }
                currentName = null;
                currentMap = null;
            }
            else if (!line.StartsWith("\t", StringComparison.Ordinal))
            {
                // Anything else that is not tab-indented is a section name,
                // which is only legal between sections.
                if (currentName != null || currentMap != null)
                {
                    throw new BadDataException("Section name at wrong place");
                }
                currentName = line;
            }
            else
            {
                if (currentName == null || currentMap == null)
                {
                    throw new BadDataException("Name/value pair at wrong place");
                }
                // Drop the leading tab, then expect exactly "name<TAB>value".
                var parts = line.Substring(1).Split('\t');
                if (parts.Length != 2)
                {
                    throw new BadDataException("Invalid name/value pair");
                }
                currentMap[parts[0]] = parts[1];
            }
        }

        // Reaching end of file with a section still open means the file was
        // truncated; report it instead of silently dropping the section.
        if (currentName != null)
        {
            throw new BadDataException("Unexpected end of file inside a section");
        }
    }
}

说实话，这是一个相当难看的函数。我想我会把它放进一个单独的类（可能是嵌套类）里来保存状态，并让每个处理分支成为独立的方法。其实，这正是状态模式可能派得上用场的场景 :)

Answer 2

/// <summary>
/// Lazily walks <paramref name="reader"/> line by line and yields the parsed
/// contents of every block whose header line starts with "REPLICATE".
/// </summary>
/// <param name="reader">Source of the lines to parse.</param>
/// <returns>One name-to-value map per REPLICATE block, in input order.</returns>
private IEnumerable<IDictionary<string, string>> ParseFile(System.IO.TextReader reader)
{
    for (string lookahead = reader.ReadLine(); lookahead != null; )
    {
        // Remember the line being classified, then advance the look-ahead so
        // that it is either skipped or handed to ParseBlock as the block's first line.
        string header = lookahead;
        lookahead = reader.ReadLine();

        if (!header.StartsWith("REPLICATE"))
        {
            continue;
        }

        // ParseBlock consumes the whole block and leaves the look-ahead on
        // the line that follows the closing brace.
        yield return ParseBlock(ref lookahead, reader);
    }
}

/// <summary>
/// Parses one brace-delimited block. On entry <paramref name="token"/> must be
/// the opening brace line; on exit it holds the line read after the closing brace.
/// </summary>
/// <param name="token">Current look-ahead line, advanced as the block is consumed.</param>
/// <param name="reader">Source of the following lines.</param>
/// <returns>The name/value pairs found between the braces.</returns>
/// <exception cref="Exception">The opening or closing brace is missing.</exception>
private IDictionary<string, string> ParseBlock(ref string token, System.IO.TextReader reader)
{
    bool opensBlock = string.Equals(token, "{");
    if (!opensBlock)
    {
        throw new Exception("Missing opening brace.");
    }

    // Step past the opening brace, then let ParseValues consume the body.
    token = reader.ReadLine();
    IDictionary<string, string> values = ParseValues(ref token, reader);

    bool closesBlock = string.Equals(token, "}");
    if (!closesBlock)
    {
        throw new Exception("Missing closing brace.");
    }

    // Step past the closing brace so the caller sees the next line.
    token = reader.ReadLine();
    return values;
}

/// <summary>
/// Reads name/value lines until the closing brace line or the end of input.
/// </summary>
/// <remarks>
/// Each line is trimmed and split at the first run of whitespace: the text
/// before it is the name, the text after it is the value. Splitting on
/// whitespace rather than on a single tab keeps indented lines from
/// producing an empty name.
/// </remarks>
/// <param name="token">
/// Current look-ahead line. On exit it is either the "}" line or null when
/// the input ended first.
/// </param>
/// <param name="reader">Source of the following lines.</param>
/// <returns>The name/value pairs read, keyed by name.</returns>
/// <exception cref="System.IO.InvalidDataException">
/// A line is blank or has a name without a value.
/// </exception>
/// <exception cref="ArgumentException">The same name occurs twice in the block.</exception>
private IDictionary<string, string> ParseValues(ref string token, System.IO.TextReader reader)
{
    IDictionary<string, string> result = new Dictionary<string, string>();

    while (token != "}" && token != null)
    {
        string entry = token.Trim();

        // Find where the name ends: the first whitespace character.
        int nameLength = 0;
        while (nameLength < entry.Length && !char.IsWhiteSpace(entry[nameLength]))
        {
            nameLength++;
        }

        // nameLength == 0 means a blank line; nameLength == entry.Length
        // means there is no value after the name.
        if (nameLength == 0 || nameLength == entry.Length)
        {
            throw new System.IO.InvalidDataException(
                "Expected a name followed by a value but found: \"" + token + "\"");
        }

        string name = entry.Substring(0, nameLength);
        string value = entry.Substring(nameLength).TrimStart();
        result.Add(name, value);

        token = reader.ReadLine();
    }

    return result;
}

Answer 3

如果在while循环结束后添加yield return current;，您将获得最终字典。

我认为更好的做法是把 '}' 当作当前块的结束标志，并在那里执行 yield return。虽然无法用一个正则表达式解析整个文件，但可以用正则表达式匹配每一行中的键值对。下面的迭代器代码应该可以工作，它只返回 REPLICATE 块对应的字典。

 // Matches a line holding a key-value pair separated by whitespace.
// The value part is optional, so a bare key also matches (with an empty Value group).
// The Value group uses ".*" (any characters); an escaped dot would match only
// literal '.' characters and reject every line that has a real value.
static string partPattern = @"^(?<Key>\w*)(\s+(?<Value>.*))?$";

/// <summary>
/// Lazily scans the file at <paramref name="path"/> and yields one dictionary
/// for each block introduced by a line that is exactly "REPLICATE" once trimmed.
/// </summary>
/// <param name="path">Path of the text file to scan.</param>
/// <returns>The dictionaries produced by parseReplicateBlock, in file order.</returns>
static IEnumerable<IDictionary<string, string>> ReadParts(string path)
{
    using (var reader = File.OpenText(path))
    {
        for (string raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
        {
            // Whitespace-only lines can never be a block header.
            bool isBlank = string.IsNullOrWhiteSpace(raw);

            // A REPLICATE header hands the reader over to the block parser,
            // which consumes everything up to the matching closing brace.
            if (!isBlank && raw.Trim().CompareTo("REPLICATE") == 0)
            {
                yield return parseReplicateBlock(reader);
            }
        }
    }
}

/// <summary>
/// Reads one block body from <paramref name="reader"/>: checks for the opening
/// brace, then collects key/value lines until the closing brace.
/// </summary>
/// <param name="reader">Reader positioned just after the block's header line.</param>
/// <returns>The key/value pairs found in the block.</returns>
/// <exception cref="ApplicationException">
/// The input ends before a closing brace is found (VerifyOpening reports a
/// missing opening brace the same way).
/// </exception>
private static IDictionary<string, string> parseReplicateBlock(StreamReader reader)
{
    // Fails fast unless the next non-blank line is the opening brace.
    VerifyOpening(reader);

    var fields = new Dictionary<string, string>();
    for (string raw = reader.ReadLine(); raw != null; raw = reader.ReadLine())
    {
        // Whitespace-only lines carry no information.
        if (string.IsNullOrWhiteSpace(raw))
        {
            continue;
        }

        string entry = raw.Trim();

        // The pattern exposes named groups, so a non-empty Key group is
        // enough to tell that the line is a key/value pair.
        Match match = Regex.Match(entry, partPattern);
        Group key = match.Groups["Key"];
        if (key.Length > 0)
        {
            fields.Add(key.Value, match.Groups["Value"].Value);
            continue;
        }

        // Not a pair: the only other line acted on is the closing brace.
        if (entry.CompareTo("}") == 0)
        {
            return fields;
        }
    }

    // Input ended while the block was still open.
    throw new ApplicationException("Missing closing brace");
}

/// <summary>
/// Consumes lines from <paramref name="reader"/> up to and including the first
/// non-blank one, and requires that line to be an opening brace once trimmed.
/// </summary>
/// <param name="reader">Reader positioned before the expected opening brace.</param>
/// <exception cref="ApplicationException">
/// The first non-blank line is not "{", or the input ends before one is found.
/// </exception>
private static void VerifyOpening(StreamReader reader)
{
    // Advance to the first line that is not just whitespace (or to end of input).
    string candidate;
    do
    {
        candidate = reader.ReadLine();
    }
    while (candidate != null && string.IsNullOrWhiteSpace(candidate));

    bool isOpeningBrace = candidate != null && candidate.Trim().CompareTo("{") == 0;
    if (!isOpeningBrace)
    {
        throw new ApplicationException("Missing opening brace");
    }
}

更新：我已确保正则表达式字符串能够匹配没有值的情况。此外，所有按索引访问分组的地方都已改为使用组名，以免日后修改正则表达式字符串时出现问题。

yield 模式与状态机流程

3 个答案: