mirror of
https://github.com/outbackdingo/firezone.git
synced 2026-01-27 18:18:55 +00:00
fix(connlib): discard timer once it fired (#7288)
Within `connlib`, we have many nested state machines. Many of them have internal timers by means of timestamps with which they indicate, when they'd like to be "woken" to perform time-related processing. For example, the `Allocation` state machine would indicate with a timestamp 5 minutes from the time an allocation is created that it needs to be woken again in order to send the refresh message to the relay. When we reset our network connections, we pretty much discard all state within connlib and together with that, all of these timers. Thus the `poll_timeout` function would return `None`, indicating that our state machines are not waiting for anything. Within the eventloop, the most outer state machine, i.e. `ClientState` is paired with an `Io` component that actually implements the timer by scheduling a wake-up aggregated as the earliest point of all state machines. In order to not fire the same timer multiple times in a row, we already intended to reset the timer once it fired. It turns out that this never worked and the timer still lingered around. When we call `reset`, `poll_timeout` - which feeds this timer - returns `None` and the timer doesn't get updated until it will finally return `Some` with an `Instant`. Because the previous timer didn't get cleared when it fired, this caused `connlib` to busy loop and prevent some(?) other parts of it from progressing, resulting in us never being able to reconnect to the portal. Yet, because the event loop itself was still operating, we could still resolve DNS queries and such. Resolves: #7254. --------- Co-authored-by: Jamil Bou Kheir <jamilbk@users.noreply.github.com>
This commit is contained in:
@@ -145,7 +145,7 @@ impl Io {
|
||||
if let Some(timeout) = self.timeout.as_mut() {
|
||||
if timeout.poll_unpin(cx).is_ready() {
|
||||
let deadline = timeout.deadline().into();
|
||||
self.timeout.as_mut().take(); // Clear the timeout.
|
||||
self.timeout = None; // Clear the timeout.
|
||||
|
||||
return Poll::Ready(Ok(Input::Timeout(deadline)));
|
||||
}
|
||||
@@ -421,6 +421,11 @@ fn is_max_wg_packet_size(d: &DatagramIn) -> bool {
|
||||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::future::poll_fn;
|
||||
|
||||
use domain::base::scan::ScannerError;
|
||||
use futures::task::noop_waker_ref;
|
||||
|
||||
use super::*;
|
||||
|
||||
#[test]
|
||||
@@ -430,4 +435,36 @@ mod tests {
|
||||
|
||||
assert_eq!(max_channel_size, 1_360_000); // 1.36MB is fine, we only have 2 of these channels, meaning less than 3MB additional buffer in total.
|
||||
}
|
||||
|
||||
#[tokio::test]
|
||||
async fn timer_is_reset_after_it_fires() {
|
||||
let now = Instant::now();
|
||||
|
||||
let mut io = Io::new(
|
||||
Arc::new(|_| Err(io::Error::custom("not implemented"))),
|
||||
Arc::new(|_| Err(io::Error::custom("not implemented"))),
|
||||
);
|
||||
|
||||
io.reset_timeout(now + Duration::from_secs(1));
|
||||
|
||||
let poll_fn = poll_fn(|cx| io.poll(cx, &mut [], &mut [], &EncryptBuffer::new()))
|
||||
.await
|
||||
.unwrap();
|
||||
|
||||
let Input::Timeout(timeout) = poll_fn else {
|
||||
panic!("Unexpected result");
|
||||
};
|
||||
|
||||
assert_eq!(timeout, now + Duration::from_secs(1));
|
||||
|
||||
let poll = io.poll(
|
||||
&mut Context::from_waker(noop_waker_ref()),
|
||||
&mut [],
|
||||
&mut [],
|
||||
&EncryptBuffer::new(),
|
||||
);
|
||||
|
||||
assert!(poll.is_pending());
|
||||
assert!(io.timeout.is_none());
|
||||
}
|
||||
}
|
||||
|
||||
@@ -18,6 +18,10 @@ export default function Android() {
|
||||
<ChangeItem pull="7265">
|
||||
Prevents re-connections to the portal from hanging for longer than 5s.
|
||||
</ChangeItem>
|
||||
<ChangeItem pull="7288">
|
||||
Fixes an issue where network roaming would cause Firezone to become
|
||||
unresponsive.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.3.6" date={new Date("2024-10-31")}>
|
||||
<ChangeItem>Handles DNS queries over TCP correctly.</ChangeItem>
|
||||
|
||||
@@ -19,6 +19,10 @@ export default function Apple() {
|
||||
<ChangeItem pull="7265">
|
||||
Prevents re-connections to the portal from hanging for longer than 5s.
|
||||
</ChangeItem>
|
||||
<ChangeItem pull="7288">
|
||||
Fixes an issue where network roaming would cause Firezone to become
|
||||
unresponsive.
|
||||
</ChangeItem>
|
||||
</Entry>
|
||||
<Entry version="1.3.7" date={new Date("2024-10-31")}>
|
||||
<ChangeItem>Handles DNS queries over TCP correctly.</ChangeItem>
|
||||
|
||||
@@ -14,7 +14,12 @@ export default function GUI({ title }: { title: string }) {
|
||||
return (
|
||||
<Entries href={href} arches={arches} title={title}>
|
||||
{/* When you cut a release, remove any solved issues from the "known issues" lists over in `client-apps`. This must not be done when the issue's PR merges. */}
|
||||
<Unreleased></Unreleased>
|
||||
<Unreleased>
|
||||
<ChangeItem pull="7288">
|
||||
Fixes an issue where network roaming would cause Firezone to become
|
||||
unresponsive.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.3.11" date={new Date("2024-11-05")}>
|
||||
<ChangeItem pull="7263">
|
||||
Mitigates a crash in case the maximum packet size is not respected.
|
||||
|
||||
@@ -18,9 +18,13 @@ export default function Headless() {
|
||||
<ChangeItem pull="7265">
|
||||
Prevents re-connections to the portal from hanging for longer than 5s.
|
||||
</ChangeItem>
|
||||
<ChangeItem pull="7288">
|
||||
Fixes an issue where network roaming would cause Firezone to become
|
||||
unresponsive.
|
||||
</ChangeItem>
|
||||
<ChangeItem pull="7287">
|
||||
Fixes an issue where subsequent SIGHUP signals after the first one were
|
||||
ignored.
|
||||
Fixes an issue where subsequent SIGHUP signals after the first one
|
||||
were ignored.
|
||||
</ChangeItem>
|
||||
</Unreleased>
|
||||
<Entry version="1.3.5" date={new Date("2024-10-31")}>
|
||||
|
||||
Reference in New Issue
Block a user