backport of commit 5f8e67d8cd (#20090)

Co-authored-by: Alexander Scheel <alex.scheel@hashicorp.com>
This commit is contained in:
hc-github-team-secure-vault-core
2023-04-11 14:31:06 -04:00
committed by GitHub
parent f980e0b3d9
commit 3d84957788
3 changed files with 283 additions and 45 deletions

View File

@@ -364,7 +364,28 @@ func (cb *crlBuilder) getPresentLocalDeltaWALForClearing(sc *storageContext) ([]
}
func (cb *crlBuilder) getPresentUnifiedDeltaWALForClearing(sc *storageContext) ([]string, error) {
return cb._getPresentDeltaWALForClearing(sc, unifiedDeltaWALPath)
walClusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix)
if err != nil {
return nil, fmt.Errorf("error fetching list of clusters with delta WAL entries: %w", err)
}
var allPaths []string
for index, cluster := range walClusters {
prefix := unifiedDeltaWALPrefix + cluster
clusterPaths, err := cb._getPresentDeltaWALForClearing(sc, prefix)
if err != nil {
return nil, fmt.Errorf("error fetching delta WAL entries for cluster (%v / %v): %w", index, cluster, err)
}
// Here, we don't want to include the unifiedDeltaWALPrefix because
// clearUnifiedDeltaWAL handles that for us. Instead, just include
// the cluster identifier.
for _, clusterPath := range clusterPaths {
allPaths = append(allPaths, cluster+clusterPath)
}
}
return allPaths, nil
}
func (cb *crlBuilder) _clearDeltaWAL(sc *storageContext, walSerials []string, path string) error {
@@ -511,49 +532,70 @@ func (cb *crlBuilder) _shouldRebuildUnifiedCRLs(sc *storageContext, override boo
return false, nil
}
// If we're overriding whether we should build Delta CRLs, always return
// true, even if storage errors might've happen.
if override {
return true, nil
}
// Fetch two storage entries to see if we actually need to do this
// rebuild, given we're within the window.
lastWALEntry, err := sc.Storage.Get(sc.Context, unifiedDeltaWALLastRevokedSerial)
if err != nil || !override && (lastWALEntry == nil || lastWALEntry.Value == nil) {
// If this entry does not exist, we don't need to rebuild the
// delta WAL due to the expiration assumption above. There must
// not have been any new revocations. Since err should be nil
// in this case, we can safely return it.
return false, err
}
lastBuildEntry, err := sc.Storage.Get(sc.Context, unifiedDeltaWALLastBuildSerial)
// rebuild, given we're within the window. We need to fetch these
// two entries per cluster.
clusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix)
if err != nil {
return false, err
return false, fmt.Errorf("failed to get the list of clusters having written Delta WALs: %w", err)
}
if !override && lastBuildEntry != nil && lastBuildEntry.Value != nil {
// If the last build entry doesn't exist, we still want to build a
// new delta WAL, since this could be our very first time doing so.
//
// If any cluster tells us to rebuild, we should rebuild.
shouldRebuild := false
for index, cluster := range clusters {
prefix := unifiedDeltaWALPrefix + cluster
clusterUnifiedLastRevokedWALEntry := prefix + deltaWALLastRevokedSerialName
clusterUnifiedLastBuiltWALEntry := prefix + deltaWALLastBuildSerialName
lastWALEntry, err := sc.Storage.Get(sc.Context, clusterUnifiedLastRevokedWALEntry)
if err != nil {
return false, fmt.Errorf("failed fetching last revoked WAL entry for cluster (%v / %v): %w", index, cluster, err)
}
if lastWALEntry == nil || lastWALEntry.Value == nil {
continue
}
lastBuildEntry, err := sc.Storage.Get(sc.Context, clusterUnifiedLastBuiltWALEntry)
if err != nil {
return false, fmt.Errorf("failed fetching last built CRL WAL entry for cluster (%v / %v): %w", index, cluster, err)
}
if lastBuildEntry == nil || lastBuildEntry.Value == nil {
// If the last build entry doesn't exist, we still want to build a
// new delta WAL, since this could be our very first time doing so.
shouldRebuild = true
break
}
// Otherwise, here, now that we know it exists, we want to check this
// value against the other value. Since we previously guarded the WAL
// entry being non-empty, we're good to decode everything within this
// guard.
var walInfo lastWALInfo
if err := lastWALEntry.DecodeJSON(&walInfo); err != nil {
return false, err
return false, fmt.Errorf("failed decoding last revoked WAL entry for cluster (%v / %v): %w", index, cluster, err)
}
var deltaInfo lastDeltaInfo
if err := lastBuildEntry.DecodeJSON(&deltaInfo); err != nil {
return false, err
return false, fmt.Errorf("failed decoding last built CRL WAL entry for cluster (%v / %v): %w", index, cluster, err)
}
// Here, everything decoded properly and we know that no new certs
// have been revoked since we built this last delta CRL. We can exit
// without rebuilding then.
if walInfo.Serial == deltaInfo.Serial {
return false, nil
if walInfo.Serial != deltaInfo.Serial {
shouldRebuild = true
break
}
}
return true, nil
// No errors occurred, so return the result.
return shouldRebuild, nil
}
func (cb *crlBuilder) rebuildDeltaCRLs(sc *storageContext, forceNew bool) error {
@@ -979,8 +1021,20 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
}
sc.Backend.incrementTotalRevokedCertificatesCount(certsCounted, revEntry.Key)
// From here on out, the certificate has been revoked locally. Any other
// persistence issues might still err, but any other failure messages
// should be added as warnings to the revocation.
resp := &logical.Response{
Data: map[string]interface{}{
"revocation_time": revInfo.RevocationTime,
"revocation_time_rfc3339": revInfo.RevocationTimeUTC.Format(time.RFC3339Nano),
"state": "revoked",
},
}
// If this flag is enabled after the fact, existing local entries will be published to
// the unified storage space through a periodic function.
failedWritingUnifiedCRL := false
if config.UnifiedCRL {
entry := &unifiedRevocationEntry{
SerialNumber: colonSerial,
@@ -996,6 +1050,9 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
sc.Backend.Logger().Error("Failed to write unified revocation entry, will re-attempt later",
"serial_number", colonSerial, "error", ignoreErr)
sc.Backend.unifiedTransferStatus.forceRun()
resp.AddWarning(fmt.Sprintf("Failed to write unified revocation entry, will re-attempt later: %v", err))
failedWritingUnifiedCRL = true
}
}
@@ -1014,26 +1071,20 @@ func revokeCert(sc *storageContext, config *crlConfig, cert *x509.Certificate) (
}
}
} else if config.EnableDelta {
if err := writeRevocationDeltaWALs(sc, config, hyphenSerial, colonSerial); err != nil {
if err := writeRevocationDeltaWALs(sc, config, resp, failedWritingUnifiedCRL, hyphenSerial, colonSerial); err != nil {
return nil, fmt.Errorf("failed to write WAL entries for Delta CRLs: %w", err)
}
}
return &logical.Response{
Data: map[string]interface{}{
"revocation_time": revInfo.RevocationTime,
"revocation_time_rfc3339": revInfo.RevocationTimeUTC.Format(time.RFC3339Nano),
"state": "revoked",
},
}, nil
return resp, nil
}
func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, hyphenSerial string, colonSerial string) error {
func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, resp *logical.Response, failedWritingUnifiedCRL bool, hyphenSerial string, colonSerial string) error {
if err := writeSpecificRevocationDeltaWALs(sc, hyphenSerial, colonSerial, localDeltaWALPath); err != nil {
return fmt.Errorf("failed to write local delta WAL entry: %w", err)
}
if config.UnifiedCRL {
if config.UnifiedCRL && !failedWritingUnifiedCRL {
// We only need to write cross-cluster unified Delta WAL entries when
// it is enabled; in particular, because we rebuild CRLs when enabling
// this flag, any revocations that happened prior to enabling unified
@@ -1043,13 +1094,21 @@ func writeRevocationDeltaWALs(sc *storageContext, config *crlConfig, hyphenSeria
// listing for the unified CRL rebuild, this revocation will not
// appear on either the main or the next delta CRL, but will need to
// wait for a subsequent complete CRL rebuild).
//
// Lastly, we don't attempt this if the unified CRL entry failed to
// write, as we need that entry before the delta WAL entry will make
// sense.
if ignoredErr := writeSpecificRevocationDeltaWALs(sc, hyphenSerial, colonSerial, unifiedDeltaWALPath); ignoredErr != nil {
// Just log the error if we fail to write across clusters, a separate background
// thread will reattempt it later on as we have the local write done.
sc.Backend.Logger().Error("Failed to write cross-cluster delta WAL entry, will re-attempt later",
"serial_number", colonSerial, "error", ignoredErr)
sc.Backend.unifiedTransferStatus.forceRun()
resp.AddWarning(fmt.Sprintf("Failed to write cross-cluster delta WAL entry, will re-attempt later: %v", ignoredErr))
}
} else if failedWritingUnifiedCRL {
resp.AddWarning("Skipping cross-cluster delta WAL entry as cross-cluster revocation failed to write; will re-attempt later.")
}
return nil
@@ -1272,7 +1331,7 @@ func buildAnyCRLs(sc *storageContext, forceNew bool, isDelta bool) error {
}
func getLastWALSerial(sc *storageContext, path string) (string, error) {
lastWALEntry, err := sc.Storage.Get(sc.Context, localDeltaWALLastRevokedSerial)
lastWALEntry, err := sc.Storage.Get(sc.Context, path)
if err != nil {
return "", err
}
@@ -1435,11 +1494,23 @@ func buildAnyUnifiedCRLs(
// (and potentially more) in it; when we're done writing the delta CRL,
// we'll write this serial as a sentinel to see if we need to rebuild it
// in the future.
var lastDeltaSerial string
//
// We need to do this per-cluster.
lastDeltaSerial := map[string]string{}
if isDelta {
lastDeltaSerial, err = getLastWALSerial(sc, unifiedDeltaWALLastRevokedSerial)
clusters, err := sc.Storage.List(sc.Context, unifiedDeltaWALPrefix)
if err != nil {
return nil, err
return nil, fmt.Errorf("error listing clusters for unified delta WAL building: %w", err)
}
for index, cluster := range clusters {
path := unifiedDeltaWALPrefix + cluster + deltaWALLastRevokedSerialName
serial, err := getLastWALSerial(sc, path)
if err != nil {
return nil, fmt.Errorf("error getting last written Delta WAL serial for cluster (%v / %v): %w", index, cluster, err)
}
lastDeltaSerial[cluster] = serial
}
}
@@ -1510,12 +1581,20 @@ func buildAnyUnifiedCRLs(
// for a while.
sc.Backend.crlBuilder.lastDeltaRebuildCheck = time.Now()
if len(lastDeltaSerial) > 0 {
// When we have a last delta serial, write out the relevant info
// so we can skip extra CRL rebuilds.
deltaInfo := lastDeltaInfo{Serial: lastDeltaSerial}
// Persist all of our known last revoked serial numbers here, as the
// last seen serial during build. This will allow us to detect if any
// new revocations have occurred, forcing us to rebuild the delta CRL.
for cluster, serial := range lastDeltaSerial {
if len(serial) == 0 {
continue
}
lastDeltaBuildEntry, err := logical.StorageEntryJSON(unifiedDeltaWALLastBuildSerial, deltaInfo)
// Make sure to use the cluster-specific path. Since we're on the
// active node of the primary cluster, we own this entry and can
// safely write it.
path := unifiedDeltaWALPrefix + cluster + deltaWALLastBuildSerialName
deltaInfo := lastDeltaInfo{Serial: serial}
lastDeltaBuildEntry, err := logical.StorageEntryJSON(path, deltaInfo)
if err != nil {
return nil, fmt.Errorf("error creating last delta CRL rebuild serial entry: %w", err)
}

View File

@@ -8,6 +8,7 @@ import (
"time"
"github.com/hashicorp/vault/sdk/helper/consts"
"github.com/hashicorp/vault/sdk/logical"
)
const (
@@ -87,7 +88,16 @@ func runUnifiedTransfer(sc *storageContext) {
if err != nil {
b.Logger().Error("an error occurred running unified transfer", "error", err.Error())
status.forceRerun.Store(true)
} else {
if config.EnableDelta {
err = doUnifiedTransferMissingDeltaWALSerials(sc, clusterId)
if err != nil {
b.Logger().Error("an error occurred running unified transfer", "error", err.Error())
status.forceRerun.Store(true)
}
}
}
status.lastRun = time.Now()
}
@@ -119,7 +129,7 @@ func doUnifiedTransferMissingLocalSerials(sc *storageContext, clusterId string)
err := readRevocationEntryAndTransfer(sc, serialNum)
if err != nil {
errCount++
sc.Backend.Logger().Debug("Failed transferring local revocation to unified space",
sc.Backend.Logger().Error("Failed transferring local revocation to unified space",
"serial", serialNum, "error", err)
}
}
@@ -132,6 +142,152 @@ func doUnifiedTransferMissingLocalSerials(sc *storageContext, clusterId string)
return nil
}
func doUnifiedTransferMissingDeltaWALSerials(sc *storageContext, clusterId string) error {
// We need to do a similar thing for Delta WAL entry certificates.
// When the delta WAL failed to write for one or more entries,
// we'll need to replicate these up to the primary cluster. When it
// has performed a new delta WAL build, it will empty storage and
// update to a last written WAL entry that exceeds what we've seen
// locally.
thisUnifiedWALEntryPath := unifiedDeltaWALPath + deltaWALLastRevokedSerialName
lastUnifiedWALEntry, err := getLastWALSerial(sc, thisUnifiedWALEntryPath)
if err != nil {
return fmt.Errorf("failed to fetch last cross-cluster unified revoked delta WAL serial number: %w", err)
}
lastLocalWALEntry, err := getLastWALSerial(sc, localDeltaWALLastRevokedSerial)
if err != nil {
return fmt.Errorf("failed to fetch last locally revoked delta WAL serial number: %w", err)
}
// We now need to transfer all the entries and then write the last WAL
// entry at the end. Start by listing all certificates; any missing
// certificates will be copied over and then the WAL entry will be
// updated once.
//
// We do not delete entries either locally or remotely, as either
// cluster could've rebuilt delta CRLs with out-of-sync information,
// removing some entries (and, we cannot differentiate between these
// two cases). On next full CRL rebuild (on either cluster), the state
// should get synchronized, and future delta CRLs after this function
// returns without issue will see the remaining entries.
//
// Lastly, we need to ensure we don't accidentally write any unified
// delta WAL entries that aren't present in the main cross-cluster
// revoked storage location. This would mean the above function failed
// to copy them for some reason, despite them presumably appearing
// locally.
_unifiedWALEntries, err := sc.Storage.List(sc.Context, unifiedDeltaWALPath)
if err != nil {
return fmt.Errorf("failed to list cross-cluster unified delta WAL storage: %w", err)
}
unifiedWALEntries := sliceToMapKey(_unifiedWALEntries)
_unifiedRevokedSerials, err := listClusterSpecificUnifiedRevokedCerts(sc, clusterId)
if err != nil {
return fmt.Errorf("failed to list cross-cluster revoked certificates: %w", err)
}
unifiedRevokedSerials := sliceToMapKey(_unifiedRevokedSerials)
localWALEntries, err := sc.Storage.List(sc.Context, localDeltaWALPath)
if err != nil {
return fmt.Errorf("failed to list local delta WAL storage: %w", err)
}
if lastUnifiedWALEntry == lastLocalWALEntry && len(_unifiedWALEntries) == len(localWALEntries) {
// Writing the last revoked WAL entry is the last thing that we do.
// Because these entries match (across clusters) and we have the same
// number of entries, assume we don't have anything to sync and exit
// early.
//
// We need both checks as, in the event of PBPWF failing and then
// returning while more revocations are happening, we could have
// been schedule to run, but then skip running (if only the first
// condition was checked) because a later revocation succeeded
// in writing a unified WAL entry, before we started replicating
// the rest back up.
//
// The downside of this approach is that, if the main cluster
// does a full rebuild in the mean time, we could re-sync more
// entries back up to the primary cluster that are already
// included in the complete CRL. Users can manually rebuild the
// full CRL (clearing these duplicate delta CRL entries) if this
// affects them.
return nil
}
errCount := 0
for index, serial := range localWALEntries {
if index%25 == 0 {
config, _ := sc.Backend.crlBuilder.getConfigWithUpdate(sc)
if config != nil && (!config.UnifiedCRL || !config.EnableDelta) {
return errors.New("unified or delta CRLs have been disabled after we started, stopping")
}
}
if serial == deltaWALLastBuildSerialName || serial == deltaWALLastRevokedSerialName {
// Skip our special serial numbers.
continue
}
_, isAlreadyPresent := unifiedWALEntries[serial]
if isAlreadyPresent {
// Serial exists on both local and unified cluster. We're
// presuming we don't need to read and re-write these entries
// and that only missing entries need to be updated.
continue
}
_, isRevokedCopied := unifiedRevokedSerials[serial]
if !isRevokedCopied {
// We need to wait here to copy over.
errCount += 1
sc.Backend.Logger().Debug("Delta WAL exists locally, but corresponding cross-cluster full revocation entry is missing; skipping", "serial", serial)
continue
}
// All good: read the local entry and write to the remote variant.
localPath := localDeltaWALPath + serial
unifiedPath := unifiedDeltaWALPath + serial
entry, err := sc.Storage.Get(sc.Context, localPath)
if err != nil || entry == nil {
errCount += 1
sc.Backend.Logger().Error("Failed reading local delta WAL entry to copy to cross-cluster", "serial", serial, "err", err)
continue
}
entry.Key = unifiedPath
err = sc.Storage.Put(sc.Context, entry)
if err != nil {
errCount += 1
sc.Backend.Logger().Error("Failed sync local delta WAL entry to cross-cluster unified delta WAL location", "serial", serial, "err", err)
continue
}
}
if errCount > 0 {
// See note above about why we don't fail here.
sc.Backend.Logger().Warn(fmt.Sprintf("Failed transfering %d local delta WAL serials to unified storage", errCount))
return nil
}
// Everything worked. Here, we can write over the delta WAL last revoked
// value. By using the earlier value, even if new revocations have
// occurred, we ensure any further missing entries can be handled in the
// next round.
lastRevSerial := lastWALInfo{Serial: lastLocalWALEntry}
lastWALEntry, err := logical.StorageEntryJSON(thisUnifiedWALEntryPath, lastRevSerial)
if err != nil {
return fmt.Errorf("unable to create cross-cluster unified last delta CRL WAL entry: %w", err)
}
if err = sc.Storage.Put(sc.Context, lastWALEntry); err != nil {
return fmt.Errorf("error saving cross-cluster unified last delta CRL WAL entry: %w", err)
}
return nil
}
func readRevocationEntryAndTransfer(sc *storageContext, serial string) error {
hyphenSerial := normalizeSerial(serial)
revInfo, err := sc.fetchRevocationInfo(hyphenSerial)

3
changelog/20058.txt Normal file
View File

@@ -0,0 +1,3 @@
```release-note:bug
secrets/pki: Fix building of unified delta CRLs and recovery during unified delta WAL write failures.
```