// cargo/util/network/retry.rs
//! Utilities for retrying a network operation.
//!
//! Some network errors are considered "spurious", meaning it is not a real
//! error (such as a 404 not found) and is likely a transient error (like a
//! bad network connection) that we can hope will resolve itself shortly. The
//! [`Retry`] type offers a way to repeatedly perform some kind of network
//! operation with a delay if it detects one of these possibly transient
//! errors.
//!
//! This supports errors from [`git2`], [`gix`], [`curl`], and
//! [`HttpNotSuccessful`] 5xx HTTP errors.
//!
//! The number of retries can be configured by the user via the `net.retry`
//! config option. This indicates the number of times to retry the operation
//! (default 3 times for a total of 4 attempts).
//!
//! There are hard-coded constants that indicate how long to sleep between
//! retries. The constants are tuned to balance a few factors, such as the
//! responsiveness to the user (we don't want cargo to hang for too long
//! retrying things), and accommodating things like Cloudfront's default
//! negative TTL of 10 seconds (if Cloudfront gets a 5xx error for whatever
//! reason it won't try to fetch again for 10 seconds).
//!
//! The timeout also implements a primitive form of random jitter. This is so
//! that if multiple requests fail at the same time that they don't all flood
//! the server at the same time when they are retried. This jitter still has
//! some clumping behavior, but should be good enough.
//!
//! [`Retry`] is the core type for implementing retry logic. The
//! [`Retry::try`] method can be called with a callback, and it will
//! indicate if it needs to be called again sometime in the future if there
//! was a possibly transient error. The caller is responsible for sleeping the
//! appropriate amount of time and then calling [`Retry::try`] again.
//!
//! [`with_retry`] is a convenience function that will create a [`Retry`] and
//! handle repeatedly running a callback until it succeeds, or it runs out of
//! retries.
//!
//! Some interesting resources about retries:
//! - <https://aws.amazon.com/blogs/architecture/exponential-backoff-and-jitter/>
//! - <https://en.wikipedia.org/wiki/Exponential_backoff>
//! - <https://developer.mozilla.org/en-US/docs/Web/HTTP/Headers/Retry-After>

use crate::util::errors::{GitCliError, HttpNotSuccessful};
use crate::util::network::http_async;
use crate::{CargoResult, GlobalContext};
use anyhow::Error;
use rand::Rng;
use std::cmp::min;
use std::time::Duration;

/// State for managing retrying a network operation.
///
/// See the [module-level documentation](self) for an overview of how the
/// retry limit and sleep schedule are chosen.
pub struct Retry<'a> {
    // Used for reading the `net.retry` config and emitting shell warnings.
    gctx: &'a GlobalContext,
    /// The number of failed attempts that have been done so far.
    ///
    /// Starts at 0, and increases by one each time an attempt fails.
    retries: u64,
    /// The maximum number of times the operation should be retried.
    ///
    /// 0 means it should never retry.
    max_retries: u64,
}

/// The result of attempting some operation via [`Retry::try`].
pub enum RetryResult<T> {
    /// The operation was successful.
    ///
    /// The wrapped value is the return value of the callback function.
    Success(T),
    /// The operation was an error, and it should not be tried again.
    Err(anyhow::Error),
    /// The operation failed, and should be tried again in the future.
    ///
    /// The wrapped value is the number of milliseconds to wait before trying
    /// again. The caller is responsible for waiting this long and then
    /// calling [`Retry::try`] again.
    Retry(u64),
}

/// Maximum amount of time a single retry can be delayed (milliseconds).
///
/// This also caps any server-provided `Retry-After` delay.
const MAX_RETRY_SLEEP_MS: u64 = 10 * 1000;
/// The minimum initial amount of time a retry will be delayed (milliseconds).
///
/// The actual amount of time will be a random value above this.
const INITIAL_RETRY_SLEEP_BASE_MS: u64 = 500;
/// The maximum amount of additional time the initial retry will take (milliseconds).
///
/// The initial delay will be [`INITIAL_RETRY_SLEEP_BASE_MS`] plus a random range
/// from 0 to this value.
const INITIAL_RETRY_JITTER_MS: u64 = 1000;

impl<'a> Retry<'a> {
    /// Creates a new retry tracker, reading the maximum number of retries
    /// from the `net.retry` config value (3 when unset).
    pub fn new(gctx: &'a GlobalContext) -> CargoResult<Retry<'a>> {
        Ok(Retry {
            gctx,
            retries: 0,
            max_retries: gctx.net_config()?.retry.unwrap_or(3) as u64,
        })
    }

    /// Calls the given callback, and returns a [`RetryResult`] which
    /// indicates whether or not this needs to be called again at some point
    /// in the future to retry the operation if it failed.
    pub fn r#try<T>(&mut self, f: impl FnOnce() -> CargoResult<T>) -> RetryResult<T> {
        match f() {
            // Only retry when the error looks transient and we still have
            // retries left.
            Err(ref e) if maybe_spurious(e) && self.retries < self.max_retries => {
                // Prefer the compact HTTP error display when available,
                // otherwise fall back to the root cause of the error chain.
                let err = e.downcast_ref::<HttpNotSuccessful>();
                let err_msg = err
                    .map(|http_err| http_err.display_short())
                    .unwrap_or_else(|| e.root_cause().to_string());
                let left_retries = self.max_retries - self.retries;
                let msg = format!(
                    "spurious network error ({} {} remaining): {err_msg}",
                    left_retries,
                    if left_retries != 1 { "tries" } else { "try" }
                );
                if let Err(e) = self.gctx.shell().warn(msg) {
                    return RetryResult::Err(e);
                }
                // Increment before computing the delay so the first retry
                // takes the jittered initial-sleep path in `next_sleep_ms`.
                self.retries += 1;
                // Honor a server-provided `Retry-After` hint when present;
                // otherwise use the built-in backoff schedule.
                let sleep = err
                    .and_then(|v| Self::parse_retry_after(v, &jiff::Timestamp::now()))
                    // Limit the Retry-After to a maximum value to avoid waiting too long.
                    .map(|retry_after| retry_after.min(MAX_RETRY_SLEEP_MS))
                    .unwrap_or_else(|| self.next_sleep_ms());
                RetryResult::Retry(sleep)
            }
            Err(e) => RetryResult::Err(e),
            Ok(r) => RetryResult::Success(r),
        }
    }

    /// Gets the next sleep duration in milliseconds.
    ///
    /// The first retry sleeps a random jittered amount; subsequent retries
    /// back off linearly, capped at [`MAX_RETRY_SLEEP_MS`].
    fn next_sleep_ms(&self) -> u64 {
        // Test hook: lets the testsuite force a deterministic delay.
        if let Ok(sleep) = self.gctx.get_env("__CARGO_TEST_FIXED_RETRY_SLEEP_MS") {
            return sleep.parse().expect("a u64");
        }

        if self.retries == 1 {
            let mut rng = rand::rng();
            INITIAL_RETRY_SLEEP_BASE_MS + rng.random_range(0..INITIAL_RETRY_JITTER_MS)
        } else {
            // 3.5s, 6.5s, 9.5s, … capped at `MAX_RETRY_SLEEP_MS`.
            min(
                ((self.retries - 1) * 3) * 1000 + INITIAL_RETRY_SLEEP_BASE_MS,
                MAX_RETRY_SLEEP_MS,
            )
        }
    }

    /// Parse the HTTP `Retry-After` header.
    /// Returns the number of milliseconds to wait before retrying according to the header.
    fn parse_retry_after(response: &HttpNotSuccessful, now: &jiff::Timestamp) -> Option<u64> {
        // Only applies to HTTP 429 (too many requests) and 503 (service unavailable).
        if !matches!(response.code, 429 | 503) {
            return None;
        }

        // Extract the Retry-After header value. Header names are matched
        // case-insensitively per the HTTP spec.
        let retry_after = response
            .headers
            .iter()
            .filter_map(|h| h.split_once(':'))
            .map(|(k, v)| (k.trim(), v.trim()))
            .find(|(k, _)| k.eq_ignore_ascii_case("retry-after"))?
            .1;

        // First option: Retry-After is a positive integer of seconds to wait.
        if let Ok(delay_secs) = retry_after.parse::<u32>() {
            return Some(delay_secs as u64 * 1000);
        }

        // Second option: Retry-After is a future HTTP date string that tells us when to retry.
        if let Ok(retry_time) = jiff::fmt::rfc2822::parse(retry_after) {
            let diff_ms = now
                .until(&retry_time)
                .unwrap()
                .total(jiff::Unit::Millisecond)
                .unwrap();
            if diff_ms > 0.0 {
                return Some(diff_ms as u64);
            }
        }
        // A date in the past or an unparsable value yields no delay hint.
        None
    }
}

188fn maybe_spurious(err: &Error) -> bool {
189    fn maybe_spurious_curl(curl_err: &curl::Error) -> bool {
190        curl_err.is_couldnt_connect()
191            || curl_err.is_couldnt_resolve_proxy()
192            || curl_err.is_couldnt_resolve_host()
193            || curl_err.is_operation_timedout()
194            || curl_err.is_recv_error()
195            || curl_err.is_send_error()
196            || curl_err.is_http2_error()
197            || curl_err.is_http2_stream_error()
198            || curl_err.is_ssl_connect_error()
199            || curl_err.is_partial_file()
200    }
201    if let Some(async_http_error) = err.downcast_ref::<http_async::Error>() {
202        match async_http_error {
203            http_async::Error::Easy(error) => return maybe_spurious_curl(error),
204            http_async::Error::TooSlow { .. } => return true,
205            http_async::Error::Multi(_) => {}
206            http_async::Error::BadHeader { .. } => {}
207        }
208    }
209    if let Some(git_err) = err.downcast_ref::<git2::Error>() {
210        match git_err.class() {
211            git2::ErrorClass::Net
212            | git2::ErrorClass::Os
213            | git2::ErrorClass::Zlib
214            | git2::ErrorClass::Http => return git_err.code() != git2::ErrorCode::Certificate,
215            _ => (),
216        }
217    }
218    if let Some(curl_err) = err.downcast_ref::<curl::Error>() {
219        if maybe_spurious_curl(curl_err) {
220            return true;
221        }
222    }
223    if let Some(not_200) = err.downcast_ref::<HttpNotSuccessful>() {
224        if 500 <= not_200.code && not_200.code < 600 || not_200.code == 429 {
225            return true;
226        }
227    }
228
229    use gix::protocol::transport::IsSpuriousError;
230
231    if let Some(err) = err.downcast_ref::<crate::sources::git::fetch::Error>() {
232        if err.is_spurious() {
233            return true;
234        }
235    }
236
237    if let Some(err) = err.downcast_ref::<GitCliError>() {
238        if err.is_spurious() {
239            return true;
240        }
241    }
242
243    false
244}
245
246/// Wrapper method for network call retry logic.
247///
248/// Retry counts provided by Config object `net.retry`. Config shell outputs
249/// a warning on per retry.
250///
251/// Closure must return a `CargoResult`.
252///
253/// # Examples
254///
255/// ```
256/// # use crate::cargo::util::{CargoResult, GlobalContext};
257/// # let download_something = || return Ok(());
258/// # let gctx = GlobalContext::default().unwrap();
259/// use cargo::util::network;
260/// let cargo_result = network::retry::with_retry(&gctx, || download_something());
261/// ```
262pub fn with_retry<T, F>(gctx: &GlobalContext, mut callback: F) -> CargoResult<T>
263where
264    F: FnMut() -> CargoResult<T>,
265{
266    let mut retry = Retry::new(gctx)?;
267    loop {
268        match retry.r#try(&mut callback) {
269            RetryResult::Success(r) => return Ok(r),
270            RetryResult::Err(e) => return Err(e),
271            RetryResult::Retry(sleep) => std::thread::sleep(Duration::from_millis(sleep)),
272        }
273    }
274}
275
#[test]
fn with_retry_repeats_the_call_then_works() {
    use cargo_util_terminal::Shell;

    // Error HTTP codes (5xx) are considered maybe_spurious and will prompt retry.
    let make_err = |code: u32| -> anyhow::Error {
        HttpNotSuccessful {
            code,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        }
        .into()
    };
    // Popped back-to-front: two spurious failures, then success.
    let mut results: Vec<CargoResult<()>> = vec![Ok(()), Err(make_err(501)), Err(make_err(502))];
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let result = with_retry(&gctx, || results.pop().unwrap());
    assert!(result.is_ok())
}

#[test]
fn with_retry_finds_nested_spurious_errors() {
    use cargo_util_terminal::Shell;

    // Error HTTP codes (5xx) are considered maybe_spurious and will prompt
    // retry even when buried under a context layer; plain string errors are
    // not spurious on their own.
    let wrapped = |code: u32, msg: &'static str| -> anyhow::Error {
        let inner = anyhow::Error::from(HttpNotSuccessful {
            code,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        });
        inner.context(msg)
    };
    let mut results: Vec<CargoResult<()>> = vec![
        Ok(()),
        Err(wrapped(501, "A non-spurious wrapping err")),
        Err(wrapped(502, "A second chained error")),
    ];
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let result = with_retry(&gctx, || results.pop().unwrap());
    assert!(result.is_ok())
}

#[test]
fn default_retry_schedule() {
    use cargo_util_terminal::Shell;

    // Every call fails with a retryable 500 response.
    let spurious = || -> CargoResult<()> {
        Err(anyhow::Error::from(HttpNotSuccessful {
            code: 500,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: Vec::new(),
        }))
    };
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let mut retry = Retry::new(&gctx).unwrap();

    // First retry: random jitter in [base, base + jitter).
    match retry.r#try(&spurious) {
        RetryResult::Retry(sleep) => {
            assert!(
                (INITIAL_RETRY_SLEEP_BASE_MS
                    ..INITIAL_RETRY_SLEEP_BASE_MS + INITIAL_RETRY_JITTER_MS)
                    .contains(&sleep)
            );
        }
        _ => panic!("unexpected non-retry"),
    }
    // Subsequent retries follow the fixed linear backoff.
    for expected in [3500, 6500] {
        match retry.r#try(&spurious) {
            RetryResult::Retry(sleep) => assert_eq!(sleep, expected),
            _ => panic!("unexpected non-retry"),
        }
    }
    // The default limit of 3 retries is now exhausted.
    match retry.r#try(&spurious) {
        RetryResult::Err(_) => {}
        _ => panic!("unexpected non-retry"),
    }
}

#[test]
fn curle_http2_stream_is_spurious() {
    // CURLE_HTTP2_STREAM maps to `is_http2_stream_error`, which is retryable.
    let err = curl::Error::new(curl_sys::CURLE_HTTP2_STREAM);
    assert!(maybe_spurious(&err.into()));
}

#[test]
fn retry_after_parsing() {
    use cargo_util_terminal::Shell;
    // Builds a response with the given status code and one header line.
    fn spurious(code: u32, header: &str) -> HttpNotSuccessful {
        HttpNotSuccessful {
            code,
            url: "Uri".to_string(),
            ip: None,
            body: Vec::new(),
            headers: vec![header.to_string()],
        }
    }

    // Start of year 2025.
    let now = jiff::Timestamp::new(1735689600, 0).unwrap();

    // Integer form: number of seconds to wait.
    assert_eq!(
        Retry::parse_retry_after(&spurious(429, "Retry-After: 10"), &now),
        Some(10_000)
    );
    // HTTP-date form; the header name is matched case-insensitively.
    assert_eq!(
        Retry::parse_retry_after(
            &spurious(429, "retry-after: Wed, 01 Jan 2025 00:00:10 GMT"),
            &now
        ),
        Some(10_000)
    );
    // No Retry-After header present.
    assert_eq!(
        Retry::parse_retry_after(&spurious(429, "Content-Type: text/html"), &now),
        None
    );
    // A date in the past yields no delay.
    assert_eq!(
        Retry::parse_retry_after(
            &spurious(429, "retry-after: Fri, 01 Jan 2000 00:00:00 GMT"),
            &now
        ),
        None
    );
    // Negative seconds fail the u32 parse.
    assert_eq!(
        Retry::parse_retry_after(&spurious(429, "retry-after: -1"), &now),
        None
    );
    // Only 429 and 503 responses honor Retry-After.
    assert_eq!(
        Retry::parse_retry_after(&spurious(400, "retry-after: 1"), &now),
        None
    );

    // End-to-end: the header value drives the sleep returned by `try`.
    let gctx = GlobalContext::default().unwrap();
    *gctx.shell() = Shell::from_write(Box::new(Vec::new()));
    let mut retry = Retry::new(&gctx).unwrap();
    match retry
        .r#try(|| -> CargoResult<()> { Err(anyhow::Error::from(spurious(429, "Retry-After: 7"))) })
    {
        RetryResult::Retry(sleep) => assert_eq!(sleep, 7_000),
        _ => panic!("unexpected non-retry"),
    }
}

#[test]
fn git_cli_error_spurious() {
    // The `spurious` flag passed to GitCliError decides retry eligibility.
    for (flag, expected) in [(false, false), (true, true)] {
        let error = GitCliError::new(Error::msg("test-git-cli-error"), flag);
        assert_eq!(maybe_spurious(&error.into()), expected);
    }
}