我的代码看起来发生了死锁,但我弄不清原因。代码是一个简单的生产者/消费者模型,外加一些额外的错误处理和执行处理。生产者向ConcurrentQueue中填充条目,消费者则循环读取队列并处理其内容,一次处理一个条目。
特别之处在于:队列中包含的是需要执行的异步操作,并且需要等待其结果。等待结果这一点使得某种程度的错误处理(例如重试)成为可能。这些异步操作是把消息中继到集线器(hub)的SignalR调用。集线器的监听方完全依赖这些消息,这就是为什么这个队列需要比通常情况更健壮。
编辑:以下是你们要求我提供的更完整的代码示例。这个示例并不会死锁,请参见问题底部的说明。
/// <summary>
/// Console harness for <see cref="ActionPump"/>: enqueues the requested number of actions, roughly 10% of
/// which take 3000 ms (longer than the pump's default wait time of 1000 ms), and waits for the pump to drain.
/// </summary>
public class Program
{
    /// <summary>
    /// Asks for the number of actions to enqueue, feeds them to a pump created with default settings and
    /// breaks into the debugger once the pump reports that it is done.
    /// </summary>
    /// <param name="args">Command line arguments; not used.</param>
    public static void Main(string[] args)
    {
        Console.Write( "Run how many tries?: " );

        // TryParse instead of Parse: ReadLine returns null when stdin is closed and the user may type
        // something that is not a number; neither should crash the harness with an exception.
        int tries;
        if ( !int.TryParse( Console.ReadLine(), out tries ) )
        {
            Console.WriteLine( "Expected a whole number of tries." );
            return;
        }

        ActionPump pump = new ActionPump();
        Random r = new Random();
        for ( int i = 0; i < tries; i++ )
        {
            // NextDouble() is in [0, 1), so the sum is in [0.9, 1.9) and truncates to 0 only when
            // NextDouble() < 0.1. About 10% of the actions therefore get failOrNot == 0 (the slow case).
            int failOrNot = (int)( r.NextDouble() + 0.9 );
            Console.WriteLine( "{0}: {1}", i, failOrNot );
            pump.EnqueueAction( new ActionWrapper( () => RandomlyTimeout( failOrNot ) ) );
        }

        // Poll until the pump has stopped and its queue is empty.
        while ( !pump.IsDone )
        {
            Thread.Sleep( 10 );
        }

        // NOTE: Inspect the pump's private _rejects field here to see if any ActionWrappers still have a
        // Task and if the Task.Status is WaitingForActivation.
        Debugger.Break();
    }

    /// <summary>
    /// Returns a delay task that stands in for the real asynchronous call.
    /// </summary>
    /// <param name="failOrNot">0 selects the slow 3000 ms delay; any other value selects the 10 ms delay.</param>
    /// <returns>The delay task.</returns>
    private static Task RandomlyTimeout(int failOrNot)
    {
        // When "failing", wait long enough to fake a timeout.
        return Task.Delay(failOrNot == 0 ? 3000 : 10);
    }
}
/// <summary>
/// Wraps an asynchronous action so that it can be executed, waited on and retried.
/// Records whether the action has succeeded and how many times it has failed.
/// </summary>
public class ActionWrapper
{
    // Last id handed out. Shared by every wrapper, so it is only changed through Interlocked.
    private static long NextId;

    // 0 means "no id assigned yet"; assigned ids start at 1.
    private long _id;

    private readonly Func<Task> _action;

    // The task returned by the most recent ExecuteInternal call. Unlike the Task property (the raw task
    // of the wrapped action) it only completes after HasSucceeded/FailureCount have been updated.
    private Task _pendingExecution;

    /// <summary>The task statuses that mean a task has not finished yet.</summary>
    protected readonly TaskStatus[] ValidTaskStates = { TaskStatus.Created, TaskStatus.WaitingForActivation, TaskStatus.WaitingToRun, TaskStatus.Running, TaskStatus.WaitingForChildrenToComplete, };

    /// <summary>
    /// Identifier of this wrapper, assigned on first access and stable afterwards.
    /// </summary>
    public long Id
    {
        get
        {
            long assigned = Interlocked.Read(ref _id);
            if (assigned != 0)
            {
                return assigned;
            }

            // The id is read from the caller's thread and from continuation threads (log statements), so
            // both the shared counter and the one-time assignment have to be atomic. If two threads race,
            // the first CompareExchange wins and the loser's candidate id is simply skipped.
            long candidate = Interlocked.Increment(ref NextId);
            long previous = Interlocked.CompareExchange(ref _id, candidate, 0);
            return previous == 0 ? candidate : previous;
        }
    }

    /// <summary>
    /// The task returned by the wrapped action while an execution is in flight. ExecuteInternal sets it
    /// before awaiting and clears it (possibly on another thread) when the await finishes.
    /// </summary>
    protected Task Task { get; set; }

    /// <summary>Number of failed or timed-out executions recorded so far.</summary>
    public int FailureCount { get; protected internal set; }

    /// <summary>True once an execution of the action has completed without throwing.</summary>
    public bool HasSucceeded { get; set; }

    /// <summary>Creates a wrapper around <paramref name="action"/>.</summary>
    /// <param name="action">Factory that starts the asynchronous operation and returns its task.</param>
    public ActionWrapper(Func<Task> action)
    {
        _action = action;
    }

    /// <summary>
    /// Inspects the <see cref="Task"/> property and decides whether the action may be started (again).
    /// </summary>
    /// <returns>
    /// <c>true</c> when no task is stored, or when the stored task had already finished and has been counted
    /// as a failure and cleared; <c>false</c> when the stored task has not finished yet.
    /// </returns>
    protected virtual bool VerifyTaskState()
    {
        // Work on a snapshot: the continuation in ExecuteInternal may set the property to null on another
        // thread between the null check and the member accesses below.
        Task current = Task;
        if (current == null)
        {
            Debug.WriteLine("The action with id {0} that has failed {1} times has not succeeded yet, advising new task creation.", Id, FailureCount);
            return true;
        }

        bool hasFinished = current.IsFaulted || current.IsCanceled || current.Exception != null || !ValidTaskStates.Any(x => x == current.Status);
        if (!hasFinished)
        {
            Debug.WriteLine("The action with id {0} that has failed {1} times is still running, awaiting same task.", Id, FailureCount);
            // If the task is still running and in a valid state, don't execute the action again. We don't want
            // to execute the action twice, because we don't know what it does.
            return false;
        }

        // This must have been an unregistered failure, because the ExecuteInternal method always clears
        // the Task property when it detects a failure. Since it hasn't, we can also assume the failure
        // counter hasn't been incremented.
        FailureCount++;
        HasSucceeded = false;

        // If we have an error to log, log it.
        Exception failure = current.Exception;
        if (failure != null)
        {
            Debug.WriteLine("Discovered failure to execute ActionWrapper, the action with id {0} has failed {1} times.{2}", Id, FailureCount, failure);
        }
        else
        {
            Debug.WriteLine("Discovered failure to execute ActionWrapper, Task has status '{0}'. The action with id {1} has failed {2} times.", current.Status, Id, FailureCount);
        }

        try
        {
            // Dispose of the finished task.
            current.Dispose();
            Debug.WriteLine("The existing task for action with id {0} that has failed {1} times has been disposed. Advising new task creation.", Id, FailureCount);
        }
        catch (Exception e)
        {
            // The exception is passed as a format argument; handing a pre-formatted message to
            // Console.WriteLine(string, object) would use it as the format string and never print e.
            Console.WriteLine("Failed to dispose of corrupt task for action {0}.{1}", Id, e);
        }
        finally
        {
            Task = null;
        }

        return true;
    }

    /// <summary>
    /// Starts the action, or hands back the task of the execution that is already in flight.
    /// </summary>
    /// <returns>
    /// The task of the in-flight or newly started execution, or an already completed task when the action
    /// has succeeded before. For executions started through <see cref="ExecuteInternal"/> the returned task
    /// completes only after <see cref="HasSucceeded"/> or <see cref="FailureCount"/> has been updated.
    /// </returns>
    public Task Execute()
    {
        // Check the in-flight execution first. Returning the wrapper task rather than the raw action task
        // means a waiter can never wake up before the outcome has been recorded, and never has the
        // action's exception rethrown at it by Wait.
        Task pending = _pendingExecution;
        if (pending != null && !pending.IsCompleted)
        {
            Debug.WriteLine("The action with id {0} that has failed {1} times has not succeeded yet, returning existing task.", Id, FailureCount);
            return pending;
        }

        // From here on the previous execution (if any) has completely finished, so HasSucceeded is final
        // and a succeeded action cannot be started a second time by accident.
        if (HasSucceeded)
        {
            Debug.WriteLine("The action with id {0} that has failed {1} times has already succeeded, returning empty task.", Id, FailureCount);
            return Task.FromResult(0);
        }

        if (!VerifyTaskState())
        {
            // A task is still stored on the Task property; read it once so a concurrent clear cannot
            // turn the return value into null.
            Task leftover = Task;
            if (leftover != null)
            {
                Debug.WriteLine("The action with id {0} that has failed {1} times has not succeeded yet, returning existing task.", Id, FailureCount);
                return leftover;
            }

            Debug.WriteLine("The action with id {0} that has failed {1} times has already succeeded, returning empty task.", Id, FailureCount);
            return Task.FromResult(0);
        }

        Debug.WriteLine("The action with id {0} that has failed {1} times has not been executed yet, returning new task.", Id, FailureCount);
        Task started = ExecuteInternal();
        _pendingExecution = started;
        return started;
    }

    /// <summary>
    /// Invokes the wrapped action and awaits its task. Sets <see cref="HasSucceeded"/> on success; any
    /// exception is caught, logged and counted in <see cref="FailureCount"/>.
    /// </summary>
    /// <returns>A task that completes once the outcome has been recorded and <see cref="Task"/> is cleared.</returns>
    protected virtual async Task ExecuteInternal()
    {
        try
        {
            // Clear the calling thread's synchronization context before the action is started.
            SynchronizationContext.SetSynchronizationContext(null);
            Task running = _action();
            Task = running;
            // Await the local, not the property, so nothing can swap the awaited task in between.
            await running.ConfigureAwait(false);
            HasSucceeded = true;
            Debug.WriteLine("The action with id {0} that has failed {1} times has succeeded.", Id, FailureCount);
        }
        catch (Exception e)
        {
            FailureCount++;
            HasSucceeded = false;
            Debug.WriteLine("Failed to execute ActionWrapper, the action with id {0} has failed {1} times.{2}", Id, FailureCount, e);
        }
        finally
        {
            Task = null;
        }
    }
}
/// <summary>
/// Executes queued <see cref="ActionWrapper"/> instances one at a time on a dedicated thread, waiting a
/// bounded time for each and re-queueing actions that failed or timed out until they reach the failure limit.
/// </summary>
public class ActionPump
{
    private readonly ConcurrentQueue<ActionWrapper> _actionQueue = new ConcurrentQueue<ActionWrapper>();

    // Actions that have not succeeded (yet), kept for inspection in the debugger. A set, so that an action
    // retried several times is stored once and a later success removes it completely.
    private readonly HashSet<ActionWrapper> _rejects = new HashSet<ActionWrapper>();

    // Private gate object; locking on "this" would let code outside the class take the same lock.
    private readonly object _gate = new object();

    private Thread _pumpThread;

    /// <summary>True while a pump thread is (about to be) processing the queue.</summary>
    public bool IsRunning { get; private set; }

    /// <summary>Failure count at which an action is dropped; 0 or less disables re-queueing.</summary>
    public int MaximumActionFailures { get; private set; }

    /// <summary>Time in milliseconds to wait for a single execution, passed to <c>Task.Wait(int)</c>.</summary>
    public int MaximumActionWaitTime { get; private set; }

    /// <summary>True when no pump thread is running and the queue is empty.</summary>
    public bool IsDone
    {
        get
        {
            // Same lock as start-up/shut-down, so the two values are read as one consistent snapshot.
            lock (_gate)
            {
                return !IsRunning && _actionQueue.IsEmpty;
            }
        }
    }

    /// <summary>Creates a pump.</summary>
    /// <param name="maximumActionFailures">Value for <see cref="MaximumActionFailures"/>.</param>
    /// <param name="maximumActionWaitTime">Value for <see cref="MaximumActionWaitTime"/>.</param>
    public ActionPump(int maximumActionFailures = 5, int maximumActionWaitTime = 1000)
    {
        MaximumActionFailures = maximumActionFailures;
        MaximumActionWaitTime = maximumActionWaitTime;
    }

    /// <summary>Adds <paramref name="action"/> to the queue and makes sure a pump thread is running.</summary>
    /// <param name="action">The action to execute.</param>
    public void EnqueueAction(ActionWrapper action)
    {
        _actionQueue.Enqueue(action);
        StartPump();
    }

    /// <summary>Starts the pump thread unless one is already running.</summary>
    public void StartPump()
    {
        lock (_gate)
        {
            if (IsRunning)
            {
                return;
            }

            IsRunning = true;
            try
            {
                _pumpThread = new Thread(Pump);
                _pumpThread.Start();
            }
            catch
            {
                // No thread was started, so do not keep claiming that the pump is running.
                IsRunning = false;
                throw;
            }
        }
    }

    /// <summary>
    /// Thread body: processes queued actions until the queue is empty, then marks the pump as stopped.
    /// </summary>
    private void Pump()
    {
        try
        {
            while (true)
            {
                ActionWrapper action;
                while (_actionQueue.TryDequeue(out action))
                {
                    /* Preparation code before running action. */
                    try
                    {
                        ProcessAction(action);
                    }
                    catch (Exception e)
                    {
                        // One misbehaving action must not end the pump while other actions are still
                        // queued: nothing would process them and IsDone would never become true.
                        Debug.WriteLine("Failed to pump action with id {0}.{1}", action.Id, e);
                        _rejects.Add(action);
                        /* Cleanup code */
                    }
                }

                // Decide to stop under the same lock StartPump uses. An item enqueued after the loop above
                // saw an empty queue is then either seen here (and processed by this thread) or its
                // StartPump call observes IsRunning == false and starts a new thread; it cannot be stranded.
                lock (_gate)
                {
                    if (_actionQueue.IsEmpty)
                    {
                        IsRunning = false;
                        return;
                    }
                }
            }
        }
        catch (Exception e)
        {
            // Unexpected failure of the pump itself: log it and mark the pump as stopped.
            Debug.WriteLine("Failed to pump action with id {0}.{1}", -1, e);
            lock (_gate)
            {
                IsRunning = false;
            }
        }
    }

    /// <summary>
    /// Executes one action, waits up to <see cref="MaximumActionWaitTime"/> for it and then re-queues it,
    /// drops it, or records its success.
    /// </summary>
    /// <param name="action">The action that was just taken from the queue.</param>
    private void ProcessAction(ActionWrapper action)
    {
        bool canRepeatedlyFail = MaximumActionFailures > 0;
        int originalFailureCount = action.FailureCount;

        action.Execute().Wait(MaximumActionWaitTime);

        // The action has timed out if it has not succeeded nor failed after waiting for the allotted time.
        bool hasTimedOut = !action.HasSucceeded && originalFailureCount == action.FailureCount;
        bool hasRepeatedlyFailed = canRepeatedlyFail && action.FailureCount >= MaximumActionFailures;

        if (hasTimedOut)
        {
            // A timeout counts as a failure. If it times out too many times, we'll discard it.
            action.FailureCount++;
            Debug.WriteLine("An action with id {0} that has failed {1} times (excl.) has timed out.", action.Id, originalFailureCount);
            if (canRepeatedlyFail && action.FailureCount < MaximumActionFailures)
            {
                Debug.WriteLine("The action with id {0} that has failed {1} times (excl.) is requeued to be reexamined later", action.Id, originalFailureCount);
                _actionQueue.Enqueue(action);
            }
        }
        else if (canRepeatedlyFail && !action.HasSucceeded && !hasRepeatedlyFailed)
        {
            Debug.WriteLine("The action with id {0} that has failed {1} times (excl.) is requeued to be reexamined later", action.Id, originalFailureCount);
            _actionQueue.Enqueue(action);
        }
        else if (!action.HasSucceeded || hasRepeatedlyFailed)
        {
            Debug.WriteLine("An action with id {0} has failed to execute too many times. Skipping execution and removing it from the queue.", action.Id);
        }

        if (!action.HasSucceeded)
        {
            Debug.WriteLine("Storing reject action with id {0} forever. Use in debugger to see if it completes sometime.",
                action.Id);
            _rejects.Add(action);
        }
        else
        {
            _rejects.Remove(action);
        }
    }
}
在这段代码中,我试图规避几种死锁场景:首先,我以为ConfigureAwait会有所帮助,因为如果await之后的代码恰好要回到同步执行Wait调用的那个线程上继续运行,我们就无法执行catch块或将HasSucceeded设置为true的代码——Wait调用会阻塞该线程,而被阻塞的线程又恰恰需要去执行HasSucceeded = true语句。
其次,我认为使用Task.Run或Task.Factory.StartNew启动泵,会让泵运行在ThreadPool线程上,或者至少是运行在Task系统可以使用的线程上。即使我使用了ConfigureAwait,await之后的代码仍然需要执行,也就需要一个线程。ConfigureAwait只是表明不必回到原来的线程,但这并不意味着它不会碰巧选中泵当前正在运行的那个ThreadPool线程来继续执行。通过启动一个手动管理的新线程,我希望能够排除这种边缘情况。
作为最后的尝试,为了让代码摆脱SynchronizationContext(它会让后续代码回到特定线程上继续执行),我尝试将其设置为null。不过在调试时我发现,运行时该上下文本来就是null。
运行时的实际情况是:我先收到一条日志消息,说明Execute正在执行;然后,10秒后又收到一条日志消息,说明操作失败了。这种情况只是偶尔发生,大约每50到100个进入泵的条目出现一次。我认为是ExecuteInternal方法的其余部分与前面的Wait调用发生了死锁。
但为什么会死锁呢?它并没有可用的SynchronizationContext——该上下文本来就是null。到底发生了什么?
PS:我知道Wait调用是同步的,本质上很容易造成死锁,但我无法控制该操作返回的Task。SignalR任务有时要等很久才会完成(如果它最终能完成的话)。在这种情况下,我希望能够让它超时,并通过再次调用该操作来重试。
编辑:起初我以为这个示例会死锁,因为(当输入15个或更多条目时)有一些被拒绝的任务的Task.Status是“WaitingForActivation”。事实证明,这个状态只是表示“我正在等待继续执行,和/或等待把值返回给正在等待我的代码”。在代码中加入pump.IsDone之后,我看到这些任务最终都被泵的重试代码清理掉了。
我之所以认为原始代码发生了死锁,是因为SignalR消息确实到达了UI,但发送这些消息的任务有时(看起来)会一直等待下去。这就是我当初加入重试代码的原因,但即使消息已经送达,这些SignalR任务仍然没有完成。我想我的问题更多在于SignalR,而不是任务死锁。
关于任务,我还有最后一个小问题:如果一个任务永远不结束,会发生什么?是否有代码会监视这种情况并通过超时终止这些任务,还是说它们会一直占用ThreadPool线程,直到应用程序结束?假如没有_rejects列表,我只是“泄漏”了那些重试x次后仍然超时的任务,它们最终会被清理掉,还是会造成内存泄漏?在我无法控制任务创建(因而无法在创建任务时利用内置功能添加超时或取消令牌)的情况下,怎样才能最好地处理这个问题?