在一定权重下查找所有后续替换的最佳算法

时间:2014-09-21 18:14:07

标签: c# algorithm

让我们有一系列数字。说{1,2,3,4,5}。和一组带有权重的子序列替换。例如:
{1} - > {3}:0.2 - 巫婆意味着1可以被3代替,重量为0.1
{2,3,4} - > {4,3}:0.3
{5} - > {2}:0.4

我需要找到我们可以使用具有限制权重的替换获得的所有序列。

限制的作用是这样的:任何n(3)项(连续)的替换权重之和应该小于给定数e <= 0.5。 对于我们的例子,结果是:
{1,2,3,4,5}:替换权重:{0,0,0,0,0}。连续3个项目的总和小于0.5
{3,2,3,4,5}:替换权重:{0.2,0,0,0,0} {1,4,3,5}:替换权重:{0,0.3 / 3,0.3 / 3,0.3 / 3,0} 3个符号的序列所以/ 3
{3,4,3,5}替代权重:{0.2,0.3 / 3,0.3 / 3,0.3 / 3,0} {1,2,3,4,2}替换权重:{0,0,0,0,0.4}
{3,2,3,4,2}替换权重:{0.2,0,0,0,0.4}

我们不允许{1,4,3,2},因为替换权重{0,0.3 / 3,0.3 / 3,0.3 / 3,0.4}有3个最后项目,总和权重= 0.6。

在实例中,子序列替换集很大。几乎任何短的子序列都有替代品。 很明显,任务可以使用蛮力来完成。但我正在寻找一种快速做到这一点的方法。任何帮助,将不胜感激。

已更新
实际上,我处理字符串而不是数字序列。到目前为止,我想出了以下算法(用C#实现):

public class Substitution
{
    public string SubstituteWhat { get; private set; }
    public string SubstituteTo { get; private set; }
    public double  TotalWeight { get; private set; }
    public double  SymbolWeight { get; private set; }

    public Substitution(string substituteWhat, string substituteTo, double totalWeight)
    {
        SubstituteWhat = substituteWhat;
        SubstituteTo = substituteTo;
        TotalWeight = totalWeight;
        SymbolWeight = TotalWeight/SubstituteWhat.Length;
    }

    public override string ToString()
    {
        return string.Format("SubstituteWhat: {0}, SubstituteTo: {1}, TotalWeight: {2}", SubstituteWhat, SubstituteTo, TotalWeight);
    }
}

class SubstitutedStringWindowImpl
{
    public string OriginalPhrase { get; set; }
    public string SubstitutedPhrase { get; set; }

    private double[] weightVector;
    private double windowSum;
    private int addItemPointer;
    private int substructItemPoiner;

    public SubstitutedStringWindowImpl(string phrase, int windowLength)
    {
        this.OriginalPhrase = phrase;
        SubstitutedPhrase = String.Empty;
        weightVector = new double[phrase.Length + windowLength ];
        windowSum = 0;
        substructItemPoiner = 0;
        addItemPointer = windowLength;
    }

    public bool PerformSubstitution(
        Substitution substitution, 
        double maxWeight,
        out SubstitutedStringWindowImpl substitutedString)
    {
        substitutedString = MemberwiseClone() as SubstitutedStringWindowImpl;
        substitutedString.weightVector = weightVector.ToArray();

        for (int i = 0; i < substitution.SubstituteWhat.Length; i++)
        {
            substitutedString.weightVector[substitutedString.addItemPointer] = substitution.SymbolWeight;
            substitutedString.windowSum = substitutedString.windowSum -
                                          substitutedString.weightVector[substitutedString.substructItemPoiner] +
                                          substitutedString.weightVector[substitutedString.addItemPointer];

            substitutedString.substructItemPoiner++;
            substitutedString.addItemPointer++;

            if (substitutedString.windowSum > maxWeight)
                return false;

            if (substitutedString.addItemPointer == substitutedString.weightVector.Length)
                break;
        }

        substitutedString.SubstitutedPhrase = SubstitutedPhrase + substitution.SubstituteTo;

        return true;
    }
}
internal class SubstitutionManagerWindowImpl
{

    private readonly Dictionary<string, List<Substitution>> substitutionsDict;
    private readonly double maxWeight;
    private readonly int maxSubstitutionLength;
    private readonly int windowLength;

    public SubstitutionManagerWindowImpl(
        List<Substitution> substitutions,
        double maxWeight,
        int windowLength)
    {
        this.substitutionsDict = substitutions.GroupBy(x => x.SubstituteWhat)
            .ToDictionary(x => x.Key, x => x.ToList());
        this.maxWeight = maxWeight;
        this.windowLength = windowLength;
        maxSubstitutionLength = substitutions.Max(x => x.SubstituteWhat.Length);
    }


    private List<SubstitutedStringWindowImpl> GetAllSubstitutionsPrivate(
        SubstitutedStringWindowImpl stringToHandle, int symbolCount)
    {
        if (stringToHandle.OriginalPhrase.Length == symbolCount)
            return new List<SubstitutedStringWindowImpl> {stringToHandle};

        var result = new List<SubstitutedStringWindowImpl>();

        for (int i = 1;
            i <= Math.Min(maxSubstitutionLength, stringToHandle.OriginalPhrase.Length - symbolCount);
            i++)
        {
            var subphraseToSubstitute = stringToHandle.OriginalPhrase.Substring(symbolCount, i);

            List<Substitution> appropriateSubstitutions;
            if (!substitutionsDict.TryGetValue(subphraseToSubstitute, out appropriateSubstitutions))
                continue;

            foreach (var substitution in appropriateSubstitutions)
            {
                SubstitutedStringWindowImpl handledString;

                if (!stringToHandle.PerformSubstitution(substitution, maxWeight, out handledString))
                    continue;

                result.AddRange(GetAllSubstitutionsPrivate(handledString,
                    symbolCount + substitution.SubstituteWhat.Length));
            }
        }

        return result;
    }

    // this is the entry function
    public List<string> GetAllSubstitutions(string phrase)
    {
        var res = GetAllSubstitutionsPrivate(new SubstitutedStringWindowImpl(phrase,windowLength), 0);
        return res.Select(x => x.SubstitutedPhrase).ToList();

    }
}

但似乎它没有快速完成工作。有任何建议如何改进吗?

3 个答案:

答案 0 :(得分:2)

似乎并不需要能够用多项式延迟进行枚举。定义递归过程

all_substitutions(substitutions, first_half, last_n_weights, second_half)

其中first_half是输入的替换前缀,second_half是输入的未取代后缀,使得两半组成整个输入。 all_substitutions的正文应该测试second_half是否为空。如果是,请收益first_half并返回。否则,对于second_half开头的每个可行替换,使其适当递归(我们计算将单个替换为自身作为权重0的可行替换)。

如果这些方面的程序不够快,那么查看实际代码会很有帮助,因为进一步的改进将是数据结构的。

答案 1 :(得分:0)

我试图看看这个,并且被我似乎无法缓存结果的事实所困扰,这通常是一种非常有效的优化。经过一些调整后我发现了一种方法。

代码尽快替换序列,然后以递归方式替换右侧的剩余序列。它不打扰传递到替换先前替换成本的子序列,因为子序列将在稍后的时间重新分析而没有第一次替换,因此改变了先前的成本值。

然而,当父序列在子序列中接收到替换结果时,会考虑成本:它只保留那些满足其替换成本不会超出成本限制的事实。当然,就像在每个缓存机制中一样,你可以用内存来换取速度,但缓存在这里非常有用。 (理想情况下,缓存应该独立于CappedCostSubstitutionEnumerator;使用AOP可以在需要时注入)

我们首先定义SequenceSubstitution对象

public class Sequence<T>: List<T>
{
    public double Cost { get; set; }

    public override string ToString() // for the cache key
    {
        return string.Join("|", this.Select(t => t.ToString()));
    }

    public Sequence(IEnumerable<T> sequence)
    {
        this.AddRange(sequence);
        Cost = 0;
    }
}

public class Substitution<T>
{
    public IEnumerable<T> OriginalSequence { get; set; }
    public IEnumerable<T> SwappedSequence { get; set; }
    public double Cost { get; set; }
}

现在我们可以定义替换枚举器

public class CappedCostSubstitutionEnumerator<T>
{
    private static Dictionary<string, List<Sequence<T>>> Cache { get; set; }

    static CappedCostSubstitutionEnumerator()
    {
        Cache = new Dictionary<string, List<Sequence<T>>>();
    }

    public double AuthorizedCost { get; set; }
    public List<Substitution<T>> Substitutions { get; set; }

    public CappedCostSubstitutionEnumerator(double maxAuthorizedCost)
    {
        Substitutions = new List<Substitution<T>>();
        AuthorizedCost = maxAuthorizedCost;
    }

    public List<List<T>> EnumerateSequenceSubstitutions(List<T> sequence)
    {
        var values = EnumerateSequenceSubstitutions(new Sequence<T>(sequence));
        return values.Select(s => s as List<T>).ToList();
    }

    private List<Sequence<T>> EnumerateSequenceSubstitutions(Sequence<T> sourceSequence)
    {
        var sourceSequenceKey = sourceSequence.ToString();
        if (Cache.ContainsKey(sourceSequenceKey))
        {
            return Cache[sourceSequenceKey];
        }
        else
        {
            var sequenceSubstitutionsResults = new List<Sequence<T>> { sourceSequence };

            foreach (var substitution in Substitutions.Where(substitution => substitution.Cost <= AuthorizedCost))
            {
                SubstituteWhereOriginalSubstitutionSequenceIsFound(sourceSequence, substitution, sequenceSubstitutionsResults);
            }

            Cache.Add(sourceSequenceKey, sequenceSubstitutionsResults);

            return sequenceSubstitutionsResults;
        }
    }

    private void SubstituteWhereOriginalSubstitutionSequenceIsFound(Sequence<T> sourceSequence, Substitution<T> substitution,
        List<Sequence<T>> sequenceSubstitutionsResults)
    {
        var indexInSequence = 0;
        var originalSequenceLength = substitution.OriginalSequence.Count();
        var upperIndexInSequence = sourceSequence.Count() - originalSequenceLength;
        // we are going to look for the substitution pattern at each possible place in the source sequence
        while (indexInSequence <= upperIndexInSequence)
        {
            var evaluatedMatch = sourceSequence.Skip(indexInSequence).Take(originalSequenceLength);
            if (evaluatedMatch.SequenceEqual<T>(substitution.OriginalSequence))
            {
                var start = sourceSequence.Take(indexInSequence);
                var substituted = substitution.SwappedSequence;
                var remain = sourceSequence.Skip(indexInSequence + originalSequenceLength);
                // now we want the subsequences without taking cost into account
                var subSequences = EnumerateSequenceSubstitutions(new Sequence<T>(remain));
                // we filter the subsequences on the total cost here
                foreach (
                    var subSequence in
                        subSequences.Where(
                            subSeq => (subSeq.Cost + substitution.Cost) <= (AuthorizedCost - sourceSequence.Cost)))
                {
                    sequenceSubstitutionsResults.Add(
                        new Sequence<T>(start.Concat(substituted).Concat(subSequence))
                        {
                            Cost = substitution.Cost + subSequence.Cost
                        });
                }
            }
            indexInSequence++;
        }
    }
}

有些观点:

  • 我使用列表,但它可能不是最有效的数据持有者;事实上,我很确定内存碎片可能是一个问题。如果它导致问题,使用内存池可能是一个好主意
  • 一次走一个元素序列不是最有效的,有一些更好的方法来查找序列出现
  • 缓存确实有帮助,对于下面的示例,它达到了1796次,为53次插入

要使用它,请执行以下操作:

private static void Main(string[] args)
{
    var p = new CappedCostSubstitutionEnumerator<int>(0.5);
    p.Substitutions.Add(new Substitution<int>() {OriginalSequence = new int[] {1}, SwappedSequence = new int[] {3}, Cost = 0.2});
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3,4 }, SwappedSequence = new int[] { 4,3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5 }, SwappedSequence = new int[] { 2 }, Cost = 0.4 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 5,1 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2,3 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] { 2 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.1 });
    p.Substitutions.Add(new Substitution<int>() { OriginalSequence = new int[] {  4 }, SwappedSequence = new int[] { 4, 3 }, Cost = 0.3 });

    var results = p.EnumerateSequenceSubstitutions(new List<int>() { 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 1, 2, 3, 4, 5, 5 });
    // results contains 5390 values
}

答案 2 :(得分:0)

这是我的解决方案,以及一些测试。 在我的计算机上以不到一秒的速度计算30个令牌序列的替换。

它作为二叉树工作,并在上下文对象中缓存中间结果,因此所有分支共享缓存。 问题是将源分成两个子树并不容易,因为有各种令牌长度。它是基于每个可能的令牌长度在不同点重复拆分而解决的,因此每个令牌长度都可以在两个子树中获得。

涉及很多优化,包括通过“预算”广告位预过滤替换令牌(无法按每个预算计算)。

请注意,原始项目不包含在结果中作为递归函数的优化。 此外,我已经用小数替换了双倍以避免舍入错误,因为它从预算中减去了成本。 实际上,我建议用不同范围的整数替换成本。

using System;
using System.Collections.Generic;
using System.Linq;
using Microsoft.VisualStudio.TestTools.UnitTesting;

namespace SubstitutionsWithBudget
{
    [TestClass]
    public class SubstitutionTest
    {
        private static readonly List<Substitution> Substitutions = new List<Substitution>
            {
                new Substitution("1", "3", 0.2m),
                new Substitution("234", "43", 0.3m),
                new Substitution("5", "2", 0.4m)
            };

        private readonly SubstitutionsDefinition substitutionsDefinition = new SubstitutionsDefinition(Substitutions);

        [TestMethod]
        public void OriginalQuestion()
        {
            string source = "12345";

            var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, 0.6m);

            foreach (var variant in variants)
            {
                Console.WriteLine(variant);
            }

            Assert.AreEqual(5, variants.Count());
        }

        [TestMethod]
        public void MultiplicityTest()
        {
            const int multiplicity = 6;
            string source = string.Join("", Enumerable.Repeat("12345", multiplicity));

            var variants = EnumerateSubstitutions(new Context(), substitutionsDefinition, source, multiplicity * 0.6m).ToList();

            foreach (var variant in variants.Take(10))
            {
                Console.WriteLine(variant);
            }
        }

        [TestMethod]
        public void SomeUsefulApplication()
        {
            var substitutions = new List<Substitution>
            {
                new Substitution("monkey", "elephant", 2m),
                new Substitution("monkey", "shark", 3m),
                new Substitution("banana", "apple", 1m),
                new Substitution("feed", "kill", 4m),
            };
            var resultingSubstitutions = EnumerateSubstitutions(
                new Context(),
                new SubstitutionsDefinition(substitutions),
                "feed monkey with banana",
                4)
                .OrderBy(s => s.Item2).ToList();
            foreach (var substitution in resultingSubstitutions)
            {
                Console.WriteLine(substitution);
            }

            Assert.IsTrue(resultingSubstitutions.Any(
                s => s.Item1 == "feed elephant with banana"));
            Assert.IsFalse(resultingSubstitutions.Any(
                s => s.Item1 == "kill shark with banana"));
        }

        IEnumerable<Tuple<string, decimal>> EnumerateSubstitutions(Context context, SubstitutionsDefinition substitutionsDefinition, string source, decimal maxCost)
        {
            if (source.Length == 0)
            {
                yield break;
            }
            var possibleSubstitutions = substitutionsDefinition.GetSubstitutions(source, maxCost).ToList();

            // find substitutions of whole string
            foreach (var possibleSubstitution in possibleSubstitutions)
            {
                yield return Tuple.Create(possibleSubstitution.Destination, possibleSubstitution.Weight);
            }

            // Variable split boundary to accomodate tokens of different length
            var middle = source.Length / 2;
            var window = substitutionsDefinition.MaxLength - 1;

            if (window > source.Length - 2)
            {
                window = source.Length - 2;
            }
            var min = middle - window / 2;

            var returned = new HashSet<Tuple<string, decimal>> { Tuple.Create(source, 0m) };

            for (var i = 0; i <= window; i++)
            {
                var divideAt = min + i;
                var left = source.Substring(0, divideAt);
                var right = source.Substring(divideAt, source.Length - divideAt);
                var q =
                    from leftSubstitution in context.GetCachedResult(Tuple.Create(left, maxCost),
                        () => EnumerateSubstitutions(context, substitutionsDefinition, left, maxCost)).Concat(new[] { Tuple.Create(left, 0m) })
                    let leftCost = leftSubstitution.Item2
                    from rightSubstitution in context.GetCachedResult(Tuple.Create(right, maxCost - leftCost),
                    () => EnumerateSubstitutions(context, substitutionsDefinition, right, maxCost - leftCost)).Concat(new[] { Tuple.Create(right, 0m) })
                    where leftCost + rightSubstitution.Item2 <= maxCost
                    select new { leftSubstitution, rightSubstitution };
                q = q.ToList();

                foreach (var item in q.Select(pair => Tuple.Create(pair.leftSubstitution.Item1 + pair.rightSubstitution.Item1,
                    pair.leftSubstitution.Item2 + pair.rightSubstitution.Item2)).Where(item => !returned.Contains(item)))
                {
                    yield return item;
                    returned.Add(item);
                }
            }
        }
    }
    public struct Substitution
    {
        public readonly string Souce;
        public readonly string Destination;
        public readonly decimal Weight;

        public Substitution(string souce, string destination, decimal weight)
        {
            Souce = souce;
            Destination = destination;
            Weight = weight;
        }
    }

    public class Context
    {
        private readonly Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>> cache = new Dictionary<Tuple<string, decimal>, List<Tuple<string, decimal>>>();

        public IEnumerable<Tuple<string, decimal>> GetCachedResult(Tuple<string, decimal> key, Func<IEnumerable<Tuple<string, decimal>>> create)
        {
            List<Tuple<string, decimal>> result;
            cache.TryGetValue(key, out result);
            if (result != null) return result;
            result = create().ToList();
            cache.Add(key, result);
            return result;
        }

        public void AddToCache(Tuple<string, decimal> key, List<Tuple<string, decimal>> result)
        {
            cache.Add(key, result);
        }
    }

    public class SubstitutionsDefinition
    {
        private readonly decimal maxCost;
        private const int Granularity = 10;

        /// <summary>
        /// Holds substitutions lookups based on budget slots.
        /// A slot with higher budget also holds values of all lower-budget slots.
        /// </summary>
        private readonly ILookup<int, ILookup<string, Substitution>> byCost;

        private readonly int maxLength;

        private readonly ILookup<string, Substitution> simple;

        private bool simpleMode;

        public int MaxLength { get { return maxLength; } }

        // Really helpful if there are a lot of substitutions
        public IEnumerable<Substitution> GetSubstitutions(string token, decimal budget)
        {
            return simpleMode
                ? GetSubstitutionsSmallSet(token, budget)
                : GetSubstitutionsLargeSet(token, budget);
        }

        private IEnumerable<Substitution> GetSubstitutionsLargeSet(string token, decimal budget)
        {
            return byCost[GetSlot(budget)].SelectMany(i => i[token]).Where(s => s.Weight <= budget);
        }

        public IEnumerable<Substitution> GetSubstitutionsSmallSet(string token, decimal budget)
        {
            return simple[token].Where(s => s.Weight <= budget);
        }

        public SubstitutionsDefinition(IEnumerable<Substitution> substitutions)
        {
            var list = substitutions.ToList();
            simpleMode = list.Count < 20; // Need to be found experimentally.
            simple = list.ToLookup(i => i.Souce);
            maxCost = list.Max(s => s.Weight);
            maxLength = list.Max(s => s.Souce.Length);
            var byLength = list.ToLookup(s => s.Souce.Length);
            var q =
                from length in list.Select(i => i.Souce.Length).Distinct()
                from budgetSlot in Enumerable.Range(0, Granularity + 1)
                from item in byLength[length]
                where item.Weight <= GetCost(budgetSlot)
                group item by budgetSlot;
            byCost = q.ToLookup(i => i.Key, i => i.ToLookup(j => j.Souce));
        }

        private decimal GetCost(int slot)
        {
            var d = maxCost * slot;
            return d / Granularity;
        }

        private int GetSlot(decimal weight)
        {
            var maxWeight = Math.Min(weight, maxCost);
            return (int)(Granularity * maxWeight / maxCost);
        }
    }
}