mirror of
https://github.com/optim-enterprises-bv/vault.git
synced 2025-11-02 11:38:02 +00:00
PKI - Fix order of chain building writes (#17772)
* Ensure correct write ordering in rebuildIssuersChains When troubleshooting a recent migration failure from 1.10->1.11, it was noted that some PKI mounts had bad chain construction despite having valid, chaining issuers. Due to the cluster's leadership trashing between nodes, the migration logic was re-executed several times, partially succeeding each time. While the legacy CA bundle migration logic was written with this in mind, one shortcoming in the chain building code lead us to truncate the ca_chain: by sorting the list of issuers after including non-written issuers (with random IDs), these issuers would occasionally be persisted prior to storage _prior_ to existing CAs with modified chains. The migration code carefully imported the active issuer prior to its parents. However, due to this bug, there was a chance that, if write to the pending parent succeeded but updating the active issuer didn't, the active issuer's ca_chain field would only contain the self-reference and not the parent's reference as well. Ultimately, a workaround of setting and subsequently unsetting a manual chain would force a chain regeneration. In this patch, we simply fix the write ordering: because we need to ensure a stable chain sorting, we leave the sort location in the same place, but delay writing the provided referenceCert to the last position. This is because the reference is meant to be the user-facing action: without transactional write capabilities, other chains may succeed, but if the last user-facing action fails, the user will hopefully retry the action. This will also correct migration, by ensuring the subsequent issuer import will be attempted again, triggering another chain build and only persisting this issuer when all other issuers have also been updated. Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Remigrate ca_chains to fix any missing issuers In the previous commit, we identified an issue that would occur on legacy issuer migration to the new storage format. This is easy enough to detect for any given mount (by an operator), but automating scanning and remediating all PKI mounts in large deployments might be difficult. Write a new storage migration version to regenerate all chains on upgrade, once. Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add changelog entry Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Add issue to PKI considerations documentation Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> * Correct %v -> %w in chain building errs Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com> Signed-off-by: Alexander Scheel <alex.scheel@hashicorp.com>
This commit is contained in:
@@ -43,7 +43,7 @@ func (sc *storageContext) rebuildIssuersChains(referenceCert *issuerEntry /* opt
|
||||
// To begin, we fetch all known issuers from disk.
|
||||
issuers, err := sc.listIssuers()
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to list issuers to build chain: %v", err)
|
||||
return fmt.Errorf("unable to list issuers to build chain: %w", err)
|
||||
}
|
||||
|
||||
// Fast path: no issuers means we can set the reference cert's value, if
|
||||
@@ -79,6 +79,28 @@ func (sc *storageContext) rebuildIssuersChains(referenceCert *issuerEntry /* opt
|
||||
// Now call a stable sorting algorithm here. We want to ensure the results
|
||||
// are the same across multiple calls to rebuildIssuersChains with the same
|
||||
// input data.
|
||||
//
|
||||
// Note: while we want to ensure referenceCert is written last (because it
|
||||
// is the user-facing action), we need to balance this with always having
|
||||
// a stable chain order, regardless of which certificate was chosen as the
|
||||
// reference cert. (E.g., for a given collection of unchanging certificates,
|
||||
// if we repeatedly set+unset a manual chain, triggering rebuilds, we should
|
||||
// always have the same chain after each unset). Thus, delay the write of
|
||||
// the referenceCert below when persisting -- but keep the sort AFTER the
|
||||
// referenceCert was added to the list, not before.
|
||||
//
|
||||
// (Otherwise, if this is called with one existing issuer and one new
|
||||
// reference cert, and the reference cert sorts before the existing
|
||||
// issuer, we will sort this list and have persisted the new issuer
|
||||
// first, and may fail on the subsequent write to the existing issuer.
|
||||
// Alternatively, if we don't sort the issuers in this order and there's
|
||||
// a parallel chain (where cert A is a child of both B and C, with
|
||||
// C.ID < B.ID and C was passed in as the yet unwritten referenceCert),
|
||||
// then we'll create a chain with order A -> B -> C on initial write (as
|
||||
// A and B come from disk) but A -> C -> B on subsequent writes (when all
|
||||
// certs come from disk). Thus the sort must be done after adding in the
|
||||
// referenceCert, thus sorting it consistently, but its write must be
|
||||
// singled out to occur last.)
|
||||
sort.SliceStable(issuers, func(i, j int) bool {
|
||||
return issuers[i] > issuers[j]
|
||||
})
|
||||
@@ -116,7 +138,7 @@ func (sc *storageContext) rebuildIssuersChains(referenceCert *issuerEntry /* opt
|
||||
// Otherwise, fetch it from disk.
|
||||
stored, err = sc.fetchIssuerById(identifier)
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to fetch issuer %v to build chain: %v", identifier, err)
|
||||
return fmt.Errorf("unable to fetch issuer %v to build chain: %w", identifier, err)
|
||||
}
|
||||
}
|
||||
|
||||
@@ -127,7 +149,7 @@ func (sc *storageContext) rebuildIssuersChains(referenceCert *issuerEntry /* opt
|
||||
issuerIdEntryMap[identifier] = stored
|
||||
cert, err := stored.GetCertificate()
|
||||
if err != nil {
|
||||
return fmt.Errorf("unable to parse issuer %v to certificate to build chain: %v", identifier, err)
|
||||
return fmt.Errorf("unable to parse issuer %v to certificate to build chain: %w", identifier, err)
|
||||
}
|
||||
|
||||
issuerIdCertMap[identifier] = cert
|
||||
@@ -417,13 +439,27 @@ func (sc *storageContext) rebuildIssuersChains(referenceCert *issuerEntry /* opt
|
||||
}
|
||||
|
||||
// Finally, write all issuers to disk.
|
||||
//
|
||||
// See the note above when sorting issuers for why we delay persisting
|
||||
// the referenceCert, if it was provided.
|
||||
for _, issuer := range issuers {
|
||||
entry := issuerIdEntryMap[issuer]
|
||||
|
||||
if referenceCert != nil && issuer == referenceCert.ID {
|
||||
continue
|
||||
}
|
||||
|
||||
err := sc.writeIssuer(entry)
|
||||
if err != nil {
|
||||
pretty := prettyIssuer(issuerIdEntryMap, issuer)
|
||||
return fmt.Errorf("failed to persist issuer (%v) chain to disk: %v", pretty, err)
|
||||
return fmt.Errorf("failed to persist issuer (%v) chain to disk: %w", pretty, err)
|
||||
}
|
||||
}
|
||||
if referenceCert != nil {
|
||||
err := sc.writeIssuer(issuerIdEntryMap[referenceCert.ID])
|
||||
if err != nil {
|
||||
pretty := prettyIssuer(issuerIdEntryMap, referenceCert.ID)
|
||||
return fmt.Errorf("failed to persist issuer (%v) chain to disk: %w", pretty, err)
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user