// Copyright (c) HashiCorp, Inc.
// SPDX-License-Identifier: BUSL-1.1

package vault

import (
	"context"
	"crypto/ecdsa"
	"crypto/elliptic"
	"crypto/rand"
	"crypto/x509"
	"crypto/x509/pkix"
	"encoding/json"
	"errors"
	"fmt"
	"math/big"
	mathrand "math/rand"
	"net"
	"net/http"
	"strings"
	"time"

	uuid "github.com/hashicorp/go-uuid"
	"github.com/hashicorp/vault/sdk/helper/jsonutil"
	"github.com/hashicorp/vault/sdk/logical"
	"github.com/hashicorp/vault/vault/cluster"
)

const (
	// Storage path where the local cluster name and identifier are stored
	coreLocalClusterInfoPath = "core/cluster/local/info"

	corePrivateKeyTypeP521    = "p521"
	corePrivateKeyTypeED25519 = "ed25519"

	// Internal so as not to log a trace message
	IntNoForwardingHeaderName = "X-Vault-Internal-No-Request-Forwarding"
)

var (
	ErrCannotForward          = errors.New("cannot forward request; no connection or address not known")
	ErrCannotForwardLocalOnly = errors.New("cannot forward local-only request")
)

type ClusterLeaderParams struct {
	LeaderUUID         string
	LeaderRedirectAddr string
	LeaderClusterAddr  string
}

// Structure representing the storage entry that holds cluster information
type Cluster struct {
	// Name of the cluster
	Name string `json:"name" structs:"name" mapstructure:"name"`

	// Identifier of the cluster
	ID string `json:"id" structs:"id" mapstructure:"id"`
}

// Cluster fetches the details of the local cluster. This method errors out
// when Vault is sealed.
func (c *Core) Cluster(ctx context.Context) (*Cluster, error) {
	var cluster Cluster

	// Fetch the storage entry. This call fails when Vault is sealed.
	entry, err := c.barrier.Get(ctx, coreLocalClusterInfoPath)
	if err != nil {
		return nil, err
	}
	if entry == nil {
		return &cluster, nil
	}

	// Decode the cluster information
	if err = jsonutil.DecodeJSON(entry.Value, &cluster); err != nil {
		return nil, fmt.Errorf("failed to decode cluster details: %w", err)
	}

	// Set in config file
	if c.clusterName != "" {
		cluster.Name = c.clusterName
	}

	return &cluster, nil
}

// This sets our local cluster cert and private key based on the advertisement.
// It also ensures the cert is in our local cluster cert pool.
func (c *Core) loadLocalClusterTLS(adv activeAdvertisement) (retErr error) {
	defer func() {
		if retErr != nil {
			c.localClusterCert.Store(([]byte)(nil))
			c.localClusterParsedCert.Store((*x509.Certificate)(nil))
			c.localClusterPrivateKey.Store((*ecdsa.PrivateKey)(nil))

			c.requestForwardingConnectionLock.Lock()
			c.clearForwardingClients()
			c.requestForwardingConnectionLock.Unlock()
		}
	}()

	switch {
	case adv.ClusterAddr == "":
		// Clustering disabled on the server, don't try to look for params
		return nil

	case adv.ClusterKeyParams == nil:
		c.logger.Error("no key params found loading local cluster TLS information")
		return fmt.Errorf("no local cluster key params found")

	case adv.ClusterKeyParams.X == nil, adv.ClusterKeyParams.Y == nil, adv.ClusterKeyParams.D == nil:
		c.logger.Error("failed to parse local cluster key due to missing params")
		return fmt.Errorf("failed to parse local cluster key")

	case adv.ClusterKeyParams.Type != corePrivateKeyTypeP521:
		c.logger.Error("unknown local cluster key type", "key_type", adv.ClusterKeyParams.Type)
		return fmt.Errorf("failed to find valid local cluster key type")

	case adv.ClusterCert == nil || len(adv.ClusterCert) == 0:
		c.logger.Error("no local cluster cert found")
		return fmt.Errorf("no local cluster cert found")
	}

	c.localClusterPrivateKey.Store(&ecdsa.PrivateKey{
		PublicKey: ecdsa.PublicKey{
			Curve: elliptic.P521(),
			X:     adv.ClusterKeyParams.X,
			Y:     adv.ClusterKeyParams.Y,
		},
		D: adv.ClusterKeyParams.D,
	})

	locCert := make([]byte, len(adv.ClusterCert))
	copy(locCert, adv.ClusterCert)
	c.localClusterCert.Store(locCert)

	cert, err := x509.ParseCertificate(adv.ClusterCert)
	if err != nil {
		c.logger.Error("failed parsing local cluster certificate", "error", err)
		return fmt.Errorf("error parsing local cluster certificate: %w", err)
	}

	c.localClusterParsedCert.Store(cert)

	return nil
}

// setupCluster creates storage entries for holding Vault cluster information.
// Entries will be created only if they are not already present. If clusterName
// is not supplied, this method will auto-generate it.
func (c *Core) setupCluster(ctx context.Context) error {
	// Prevent data races with the TLS parameters
	c.clusterParamsLock.Lock()
	defer c.clusterParamsLock.Unlock()

	// Check if storage index is already present or not
	cluster, err := c.Cluster(ctx)
	if err != nil {
		c.logger.Error("failed to get cluster details", "error", err)
		return err
	}

	var modified bool

	if cluster == nil {
		cluster = &Cluster{}
	}

	if cluster.Name == "" {
		// If cluster name is not supplied, generate one
		if c.clusterName == "" {
			c.logger.Debug("cluster name not found/set, generating new")
			clusterNameBytes, err := uuid.GenerateRandomBytes(4)
			if err != nil {
				c.logger.Error("failed to generate cluster name", "error", err)
				return err
			}

			c.clusterName = fmt.Sprintf("vault-cluster-%08x", clusterNameBytes)
		}

		cluster.Name = c.clusterName
		if c.logger.IsDebug() {
			c.logger.Debug("cluster name set", "name", cluster.Name)
		}
		modified = true
	}

	// This is the first point at which the stored (or newly generated)
	// cluster name is known.
	c.metricSink.SetDefaultClusterName(cluster.Name)

	if cluster.ID == "" {
		c.logger.Debug("cluster ID not found, generating new")
		// Generate a clusterID
		cluster.ID, err = uuid.GenerateUUID()
		if err != nil {
			c.logger.Error("failed to generate cluster identifier", "error", err)
			return err
		}
		if c.logger.IsDebug() {
			c.logger.Debug("cluster ID set", "id", cluster.ID)
		}
		modified = true
	}

	// If we're using HA, generate server-to-server parameters
	if c.ha != nil {
		// Create a private key
		if c.localClusterPrivateKey.Load().(*ecdsa.PrivateKey) == nil {
			c.logger.Debug("generating cluster private key")
			key, err := ecdsa.GenerateKey(elliptic.P521(), c.secureRandomReader)
			if err != nil {
				c.logger.Error("failed to generate local cluster key", "error", err)
				return err
			}

			c.localClusterPrivateKey.Store(key)
		}

		// Create a certificate
		if c.localClusterCert.Load().([]byte) == nil {
			host, err := uuid.GenerateUUID()
			if err != nil {
				return err
			}
			host = fmt.Sprintf("fw-%s", host)
			c.logger.Debug("generating local cluster certificate", "host", host)

			template := &x509.Certificate{
				Subject: pkix.Name{
					CommonName: host,
				},
				DNSNames: []string{host},
				ExtKeyUsage: []x509.ExtKeyUsage{
					x509.ExtKeyUsageServerAuth,
					x509.ExtKeyUsageClientAuth,
				},
				KeyUsage:     x509.KeyUsageDigitalSignature | x509.KeyUsageKeyEncipherment | x509.KeyUsageKeyAgreement | x509.KeyUsageCertSign,
				SerialNumber: big.NewInt(mathrand.Int63()),
				NotBefore:    time.Now().Add(-30 * time.Second),
				// 30 years of single-active uptime ought to be enough for anybody
				NotAfter:              time.Now().Add(262980 * time.Hour),
				BasicConstraintsValid: true,
				IsCA:                  true,
			}
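			// The template is used as both subject and issuer below, so the
			// resulting certificate is self-signed with the cluster private key.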
			certBytes, err := x509.CreateCertificate(rand.Reader, template, template, c.localClusterPrivateKey.Load().(*ecdsa.PrivateKey).Public(), c.localClusterPrivateKey.Load().(*ecdsa.PrivateKey))
			if err != nil {
				c.logger.Error("error generating self-signed cert", "error", err)
				return fmt.Errorf("unable to generate local cluster certificate: %w", err)
			}

			parsedCert, err := x509.ParseCertificate(certBytes)
			if err != nil {
				c.logger.Error("error parsing self-signed cert", "error", err)
				return fmt.Errorf("error parsing generated certificate: %w", err)
			}

			c.localClusterCert.Store(certBytes)
			c.localClusterParsedCert.Store(parsedCert)
		}
	}

	if modified {
		// Encode the cluster information as a JSON string
		rawCluster, err := json.Marshal(cluster)
		if err != nil {
			c.logger.Error("failed to encode cluster details", "error", err)
			return err
		}

		// Store it
		err = c.barrier.Put(ctx, &logical.StorageEntry{
			Key:   coreLocalClusterInfoPath,
			Value: rawCluster,
		})
		if err != nil {
			c.logger.Error("failed to store cluster details", "error", err)
			return err
		}
	}

	c.clusterID.Store(cluster.ID)

	return nil
}
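
// loadCluster reads the stored cluster details and caches the cluster ID on
// the Core; it does not modify storage.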
func (c *Core) loadCluster(ctx context.Context) error {
	cluster, err := c.Cluster(ctx)
	if err != nil {
		c.logger.Error("failed to get cluster details", "error", err)
		return err
	}

	c.clusterID.Store(cluster.ID)

	return nil
}

// startClusterListener starts cluster request listeners during unseal. It
// is assumed that the state lock is held while this is run. Right now this
// only starts cluster listeners. Once the listener is started, handlers and
// clients can be registered with it.
func (c *Core) startClusterListener(ctx context.Context) error {
	if c.ClusterAddr() == "" {
		c.logger.Info("clustering disabled, not starting listeners")
		return nil
	}

	if c.getClusterListener() != nil {
		c.logger.Warn("cluster listener is already started")
		return nil
	}

	if c.clusterListenerAddrs == nil || len(c.clusterListenerAddrs) == 0 {
		c.logger.Warn("clustering not disabled but no addresses to listen on")
		return fmt.Errorf("cluster addresses not found")
	}

	c.logger.Debug("starting cluster listeners")

	networkLayer := c.clusterNetworkLayer
	if networkLayer == nil {
		tcpLogger := c.logger.Named("cluster-listener.tcp")
		networkLayer = cluster.NewTCPLayer(c.clusterListenerAddrs, tcpLogger)
		c.AddLogger(tcpLogger)
	}

	listenerLogger := c.logger.Named("cluster-listener")
	c.clusterListener.Store(cluster.NewListener(networkLayer,
		c.clusterCipherSuites,
		listenerLogger,
		5*c.clusterHeartbeatInterval))
	c.AddLogger(listenerLogger)

	err := c.getClusterListener().Run(ctx)
	if err != nil {
		return err
	}

	if strings.HasSuffix(c.ClusterAddr(), ":0") {
		// If we listened on port 0, record the port the OS gave us.
		c.clusterAddr.Store(fmt.Sprintf("https://%s", c.getClusterListener().Addr()))
	}

	if len(c.ClusterAddr()) != 0 {
		if err := c.getClusterListener().SetAdvertiseAddr(c.ClusterAddr()); err != nil {
			return err
		}
	}

	return nil
}
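
// ClusterAddr returns the https cluster address this node advertises for
// server-to-server communication.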
func (c *Core) ClusterAddr() string {
	return c.clusterAddr.Load().(string)
}
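
// getClusterListener returns the active cluster listener, or nil if one has
// not been started.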
func (c *Core) getClusterListener() *cluster.Listener {
	cl := c.clusterListener.Load()
	if cl == nil {
		return nil
	}
	return cl.(*cluster.Listener)
}

// stopClusterListener stops any existing listeners during seal. It is
// assumed that the state lock is held while this is run.
func (c *Core) stopClusterListener() {
	clusterListener := c.getClusterListener()
	if clusterListener == nil {
		c.logger.Debug("clustering disabled, not stopping listeners")
		return
	}

	c.logger.Info("stopping cluster listeners")

	clusterListener.Stop()
	c.clusterListener.Store((*cluster.Listener)(nil))

	c.logger.Info("cluster listeners successfully shut down")
}
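
// SetClusterListenerAddrs records the addresses the cluster listener should
// bind to. When exactly one address is given and no cluster address has been
// set yet, that address is also used to derive the advertised cluster address.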
func (c *Core) SetClusterListenerAddrs(addrs []*net.TCPAddr) {
	c.clusterListenerAddrs = addrs
	if c.ClusterAddr() == "" && len(addrs) == 1 {
		c.clusterAddr.Store(fmt.Sprintf("https://%s", addrs[0].String()))
	}
}
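
// SetClusterHandler sets the HTTP handler used to serve requests arriving on
// the cluster port.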
func (c *Core) SetClusterHandler(handler http.Handler) {
	c.clusterHandler = handler
}
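
// ClusterID returns the cached cluster ID, populated by setupCluster or
// loadCluster.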
func (c *Core) ClusterID() string {
	return c.clusterID.Load()
}