From 0e26eed743d92504bc4b915bc93acd58a51c47a0 Mon Sep 17 00:00:00 2001 From: Siva Date: Wed, 3 Dec 2025 19:45:46 +0530 Subject: [PATCH 1/3] Docs: Etcd Mode Reconfiguration Procedures # docs update --- docs/using/etcd-reconfiguration.md | 116 +++++++++++++++++++++++++++++ server/internal/etcd/embedded.go | 78 +++++++++++++++++++ server/internal/etcd/interface.go | 2 + server/internal/etcd/provide.go | 64 ++++++++++++++-- server/internal/etcd/rbac.go | 42 ++++++++--- server/internal/etcd/remote.go | 62 +++++++++++++++ 6 files changed, 347 insertions(+), 17 deletions(-) create mode 100644 docs/using/etcd-reconfiguration.md diff --git a/docs/using/etcd-reconfiguration.md b/docs/using/etcd-reconfiguration.md new file mode 100644 index 00000000..e8597d54 --- /dev/null +++ b/docs/using/etcd-reconfiguration.md @@ -0,0 +1,116 @@ +# Etcd Mode Reconfiguration + + +This guide explains how to change a Control Plane host's etcd mode after cluster initialization. + + +## Overview + + +The Control Plane supports two etcd modes: + + +- **Server mode**: Runs an embedded etcd server and participates as a voting member +- **Client mode**: Connects to the etcd cluster as a client only + + +**Recommended topology:** +- 1-3 hosts: All should be etcd servers +- 4-7 hosts: 3 etcd servers, rest as clients +- 8+ hosts: 5 etcd servers, rest as clients + + +!!! warning "Maintain Odd Numbers" + Etcd requires an **odd number** of servers (3 or 5) for proper quorum. + + +## How It Works + + +Etcd mode reconfiguration is **fully automatic**: + + +1. Stop the container +2. Update `PGEDGE_ETCD_MODE` environment variable +3. Restart the container +4. The system automatically handles all cluster operations + + +**What happens automatically:** +- **Client→Server**: Discovers cluster, obtains credentials, joins as voting member +- **Server→Client**: Removes itself from membership, transitions to client mode + +## Procedures + + +### Promoting a Client to Server (Example - host-4) + + +```bash +# 1. Stop the container +docker stop control-plane-host-4 + + +# 2. Update docker-compose.yaml environment: +PGEDGE_ETCD_MODE: server # was: client + + +# 3. Restart +docker-compose up -d host-4 + + +# 4. Verify (check logs) +docker logs control-plane-host-4 +``` + + +### Demoting a Server to Client (Example - host-4) + + +!!! warning "Quorum Check" + Ensure at least 2 other healthy servers remain before demotion. + + +```bash +# 1. Stop the container +docker stop control-plane-host-4 + + +# 2. Update docker-compose.yaml environment: +PGEDGE_ETCD_MODE: client # was: server + + +# 3. Restart +docker-compose up -d host-4 + + +# 4. Verify (check logs) +docker logs control-plane-host-4 +``` + +### General Troubleshooting + + +Check cluster health: + + +```bash +docker exec control-plane-host-1 etcdctl member list +``` + + +All members should show `STATUS=started`. + + +## Best Practices + + +- **Change one host at a time** - Wait for completion before reconfiguring another +- **Monitor cluster health** - Verify all servers healthy before/after changes +- **Maintain odd numbers** - Always keep 3 or 5 etcd servers, never 2 or 4 + +## Summary + + +Etcd mode reconfiguration is fully automatic - just update the environment variable and restart. The Control Plane handles all cluster operations including credential provisioning, membership changes, and configuration updates without manual intervention. + diff --git a/server/internal/etcd/embedded.go b/server/internal/etcd/embedded.go index 85166a80..e3637633 100644 --- a/server/internal/etcd/embedded.go +++ b/server/internal/etcd/embedded.go @@ -53,6 +53,10 @@ func (e *EmbeddedEtcd) Start(ctx context.Context) error { e.mu.Lock() defer e.mu.Unlock() + if e.etcd != nil { + return nil // already started + } + initialized, err := e.IsInitialized() if err != nil { return err @@ -292,6 +296,7 @@ func (e *EmbeddedEtcd) Shutdown() error { } if e.etcd != nil { e.etcd.Close() + e.etcd = nil } return errors.Join(errs...) } @@ -443,6 +448,79 @@ func (e *EmbeddedEtcd) HealthCheck() common.ComponentStatus { } } +func (e *EmbeddedEtcd) ChangeMode(ctx context.Context, mode config.EtcdMode) (Etcd, error) { + if mode != config.EtcdModeClient { + return nil, fmt.Errorf("invalid mode transition from %s to %s", config.EtcdModeServer, mode) + } + + if err := e.Start(ctx); err != nil { + return nil, err + } + + cfg := e.cfg.Config() + + embeddedClient, err := e.GetClient() + if err != nil { + return nil, err + } + + // Get the full member list before removing this host + resp, err := embeddedClient.MemberList(ctx) + if err != nil { + return nil, fmt.Errorf("failed to list etcd members for server->client transition: %w", err) + } + + var endpoints []string + for _, m := range resp.Members { + // Skip this host's member; we are about to remove it. + if m.Name == cfg.HostID { + continue + } + endpoints = append(endpoints, m.ClientURLs...) + } + + if len(endpoints) == 0 { + return nil, fmt.Errorf("cannot demote etcd server on host %s: no remaining cluster members with client URLs", cfg.HostID) + } + + generated := e.cfg.GeneratedConfig() + generated.EtcdClient.Endpoints = endpoints + if err := e.cfg.UpdateGeneratedConfig(generated); err != nil { + return nil, fmt.Errorf("failed to update generated config with client endpoints: %w", err) + } + + if err := e.Shutdown(); err != nil { + return nil, err + } + + remote := NewRemoteEtcd(e.cfg, e.logger) + if err := remote.Start(ctx); err != nil { + return nil, fmt.Errorf("failed to start remote client: %w", err) + } + + remoteClient, err := remote.GetClient() + if err != nil { + return nil, fmt.Errorf("failed to get remote client: %w", err) + } + + if err := RemoveMember(ctx, remoteClient, cfg.HostID); err != nil { + return nil, fmt.Errorf("failed to remove embedded etcd from cluster: %w", err) + } + + if err := os.RemoveAll(e.etcdDir()); err != nil { + return nil, fmt.Errorf("failed to remove embedded etcd data dir: %w", err) + } + + generated.EtcdMode = config.EtcdModeClient + generated.EtcdServer = config.EtcdServer{} + generated.EtcdClient = cfg.EtcdClient + if err := e.cfg.UpdateGeneratedConfig(generated); err != nil { + return nil, fmt.Errorf("failed to clear out etcd server settings in generated config: %w", err) + } + + return remote, err +} + const maxLearnerStallTime = 5 * time.Minute type learnerProgress struct { diff --git a/server/internal/etcd/interface.go b/server/internal/etcd/interface.go index 6bf515b9..8a0896cc 100644 --- a/server/internal/etcd/interface.go +++ b/server/internal/etcd/interface.go @@ -6,6 +6,7 @@ import ( clientv3 "go.etcd.io/etcd/client/v3" "github.com/pgEdge/control-plane/server/internal/common" + "github.com/pgEdge/control-plane/server/internal/config" ) type ClusterMember struct { @@ -50,4 +51,5 @@ type Etcd interface { RemoveHost(ctx context.Context, hostID string) error JoinToken() (string, error) VerifyJoinToken(in string) error + ChangeMode(ctx context.Context, mode config.EtcdMode) (Etcd, error) } diff --git a/server/internal/etcd/provide.go b/server/internal/etcd/provide.go index 6ce96b73..a634d80d 100644 --- a/server/internal/etcd/provide.go +++ b/server/internal/etcd/provide.go @@ -1,7 +1,9 @@ package etcd import ( + "context" "fmt" + "time" "github.com/rs/zerolog" "github.com/samber/do" @@ -27,6 +29,18 @@ func provideClient(i *do.Injector) { }) } +// newEtcdForMode creates an Etcd instance based on the specified mode. +func newEtcdForMode(mode config.EtcdMode, cfg *config.Manager, logger zerolog.Logger) (Etcd, error) { + switch mode { + case config.EtcdModeServer: + return NewEmbeddedEtcd(cfg, logger), nil + case config.EtcdModeClient: + return NewRemoteEtcd(cfg, logger), nil + default: + return nil, fmt.Errorf("invalid etcd mode: %s", mode) + } +} + func provideEtcd(i *do.Injector) { do.Provide(i, func(i *do.Injector) (Etcd, error) { cfg, err := do.Invoke[*config.Manager](i) @@ -38,13 +52,51 @@ func provideEtcd(i *do.Injector) { return nil, err } - switch storageType := cfg.Config().EtcdMode; storageType { - case config.EtcdModeServer: - return NewEmbeddedEtcd(cfg, logger), nil - case config.EtcdModeClient: - return NewRemoteEtcd(cfg, logger), nil + appCfg := cfg.Config() + generated := cfg.GeneratedConfig() + + oldMode := generated.EtcdMode + newMode := appCfg.EtcdMode + + logger.Info(). + Str("old_mode", string(oldMode)). + Str("new_mode", string(newMode)). + Bool("old_mode_empty", oldMode == ""). + Bool("modes_equal", oldMode == newMode). + Msg("checking etcd mode for reconfiguration") + + // Mode has changed - perform reconfiguration. + ctx, cancel := context.WithTimeout(context.Background(), 60*time.Second) + defer cancel() + + switch { + case oldMode == "" || oldMode == newMode: + etcd, err := newEtcdForMode(newMode, cfg, logger) + if err != nil { + return nil, err + } + initialized, err := etcd.IsInitialized() + if err != nil { + return nil, err + } + if initialized { + generated.EtcdMode = appCfg.EtcdMode + generated.EtcdServer = appCfg.EtcdServer + generated.EtcdClient = appCfg.EtcdClient + if err := cfg.UpdateGeneratedConfig(generated); err != nil { + return nil, fmt.Errorf("failed to persist etcd configuration: %w", err) + } + } + + return etcd, nil + case oldMode == config.EtcdModeServer && newMode == config.EtcdModeClient: + embedded := NewEmbeddedEtcd(cfg, logger) + return embedded.ChangeMode(ctx, newMode) + case oldMode == config.EtcdModeClient && newMode == config.EtcdModeServer: + remote := NewRemoteEtcd(cfg, logger) + return remote.ChangeMode(ctx, newMode) default: - return nil, fmt.Errorf("invalid storage type: %s", storageType) + return nil, fmt.Errorf("unsupported etcd mode transition: %s -> %s", oldMode, newMode) } }) } diff --git a/server/internal/etcd/rbac.go b/server/internal/etcd/rbac.go index bb2b26ae..cdae3f65 100644 --- a/server/internal/etcd/rbac.go +++ b/server/internal/etcd/rbac.go @@ -51,18 +51,9 @@ func CreateHostCredentials( if opts.EmbeddedEtcdEnabled { // Create a cert for the peer server - serverPrincipal, err := certSvc.EtcdServer(ctx, - opts.HostID, - opts.Hostname, - []string{"localhost", opts.Hostname}, - []string{"127.0.0.1", opts.IPv4Address}, - ) - if err != nil { - return nil, fmt.Errorf("failed to create cert for etcd server: %w", err) + if err := addEtcdServerCredentials(ctx, opts.HostID, opts.Hostname, opts.IPv4Address, certSvc, creds); err != nil { + return nil, err } - - creds.ServerCert = serverPrincipal.CertPEM - creds.ServerKey = serverPrincipal.KeyPEM } return creds, nil @@ -353,9 +344,38 @@ func writeHostCredentials(creds *HostCredentials, cfg *config.Manager) error { generatedCfg := cfg.GeneratedConfig() generatedCfg.EtcdUsername = creds.Username generatedCfg.EtcdPassword = creds.Password + generatedCfg.EtcdMode = appCfg.EtcdMode + generatedCfg.EtcdClient = appCfg.EtcdClient + generatedCfg.EtcdServer = appCfg.EtcdServer + if err := cfg.UpdateGeneratedConfig(generatedCfg); err != nil { return fmt.Errorf("failed to update generated config: %w", err) } return nil } + +func addEtcdServerCredentials( + ctx context.Context, + hostID string, + hostname string, + ipv4Address string, + certSvc *certificates.Service, + creds *HostCredentials, +) error { + // Create a cert for the peer server + serverPrincipal, err := certSvc.EtcdServer(ctx, + hostID, + hostname, + []string{"localhost", hostname}, + []string{"127.0.0.1", ipv4Address}, + ) + if err != nil { + return fmt.Errorf("failed to create cert for etcd server: %w", err) + } + + creds.ServerCert = serverPrincipal.CertPEM + creds.ServerKey = serverPrincipal.KeyPEM + + return nil +} diff --git a/server/internal/etcd/remote.go b/server/internal/etcd/remote.go index bdd355b1..208eca2e 100644 --- a/server/internal/etcd/remote.go +++ b/server/internal/etcd/remote.go @@ -250,3 +250,65 @@ func (r *RemoteEtcd) updateEndpointsConfig(ctx context.Context, client *clientv3 return nil } + +func (r *RemoteEtcd) ChangeMode(ctx context.Context, mode config.EtcdMode) (Etcd, error) { + if mode != config.EtcdModeServer { + return nil, fmt.Errorf("invalid mode transition from %s to %s", config.EtcdModeClient, mode) + } + + if err := r.Start(ctx); err != nil { + return nil, err + } + + cfg := r.cfg.Config() + + clientPrincipal, err := r.certSvc.HostEtcdUser(ctx, cfg.HostID) + if err != nil { + return nil, fmt.Errorf("failed to get client principal: %w", err) + } + + creds := &HostCredentials{ + Username: cfg.EtcdUsername, + Password: cfg.EtcdPassword, + CaCert: r.certSvc.CACert(), + ClientCert: clientPrincipal.CertPEM, + ClientKey: clientPrincipal.KeyPEM, + } + + if err := addEtcdServerCredentials(ctx, cfg.HostID, cfg.Hostname, cfg.IPv4Address, r.certSvc, creds); err != nil { + return nil, err + } + + client, err := r.GetClient() + if err != nil { + return nil, err + } + + leader, err := GetClusterLeader(ctx, client) + if err != nil { + return nil, fmt.Errorf("failed to get cluster leader: %w", err) + } + + if err := r.Shutdown(); err != nil { + return nil, err + } + + embedded := NewEmbeddedEtcd(r.cfg, r.logger) + err = embedded.Join(ctx, JoinOptions{ + Leader: leader, + Credentials: creds, + }) + if err != nil { + return nil, fmt.Errorf("failed to join embedded etcd to cluster: %w", err) + } + + generated := r.cfg.GeneratedConfig() + generated.EtcdMode = config.EtcdModeServer + generated.EtcdClient = config.EtcdClient{} + generated.EtcdServer = cfg.EtcdServer + if err := r.cfg.UpdateGeneratedConfig(generated); err != nil { + return nil, fmt.Errorf("failed to clear out etcd client settings in generated config: %w", err) + } + + return embedded, nil +} From a37979dd6dabbdfcb1fc92e21144e3b5cc6fa529 Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 9 Jan 2026 20:21:45 +0530 Subject: [PATCH 2/3] addressing review comments --- docs/using/etcd-reconfiguration.md | 116 ----------------------------- 1 file changed, 116 deletions(-) delete mode 100644 docs/using/etcd-reconfiguration.md diff --git a/docs/using/etcd-reconfiguration.md b/docs/using/etcd-reconfiguration.md deleted file mode 100644 index e8597d54..00000000 --- a/docs/using/etcd-reconfiguration.md +++ /dev/null @@ -1,116 +0,0 @@ -# Etcd Mode Reconfiguration - - -This guide explains how to change a Control Plane host's etcd mode after cluster initialization. - - -## Overview - - -The Control Plane supports two etcd modes: - - -- **Server mode**: Runs an embedded etcd server and participates as a voting member -- **Client mode**: Connects to the etcd cluster as a client only - - -**Recommended topology:** -- 1-3 hosts: All should be etcd servers -- 4-7 hosts: 3 etcd servers, rest as clients -- 8+ hosts: 5 etcd servers, rest as clients - - -!!! warning "Maintain Odd Numbers" - Etcd requires an **odd number** of servers (3 or 5) for proper quorum. - - -## How It Works - - -Etcd mode reconfiguration is **fully automatic**: - - -1. Stop the container -2. Update `PGEDGE_ETCD_MODE` environment variable -3. Restart the container -4. The system automatically handles all cluster operations - - -**What happens automatically:** -- **Client→Server**: Discovers cluster, obtains credentials, joins as voting member -- **Server→Client**: Removes itself from membership, transitions to client mode - -## Procedures - - -### Promoting a Client to Server (Example - host-4) - - -```bash -# 1. Stop the container -docker stop control-plane-host-4 - - -# 2. Update docker-compose.yaml environment: -PGEDGE_ETCD_MODE: server # was: client - - -# 3. Restart -docker-compose up -d host-4 - - -# 4. Verify (check logs) -docker logs control-plane-host-4 -``` - - -### Demoting a Server to Client (Example - host-4) - - -!!! warning "Quorum Check" - Ensure at least 2 other healthy servers remain before demotion. - - -```bash -# 1. Stop the container -docker stop control-plane-host-4 - - -# 2. Update docker-compose.yaml environment: -PGEDGE_ETCD_MODE: client # was: server - - -# 3. Restart -docker-compose up -d host-4 - - -# 4. Verify (check logs) -docker logs control-plane-host-4 -``` - -### General Troubleshooting - - -Check cluster health: - - -```bash -docker exec control-plane-host-1 etcdctl member list -``` - - -All members should show `STATUS=started`. - - -## Best Practices - - -- **Change one host at a time** - Wait for completion before reconfiguring another -- **Monitor cluster health** - Verify all servers healthy before/after changes -- **Maintain odd numbers** - Always keep 3 or 5 etcd servers, never 2 or 4 - -## Summary - - -Etcd mode reconfiguration is fully automatic - just update the environment variable and restart. The Control Plane handles all cluster operations including credential provisioning, membership changes, and configuration updates without manual intervention. - From 4ba07dd4a54f24bdc9a42213abcd51ededf08cea Mon Sep 17 00:00:00 2001 From: Siva Date: Fri, 9 Jan 2026 22:36:06 +0530 Subject: [PATCH 3/3] added change log --- changes/unreleased/Added-20260109-223536.yaml | 3 +++ 1 file changed, 3 insertions(+) create mode 100644 changes/unreleased/Added-20260109-223536.yaml diff --git a/changes/unreleased/Added-20260109-223536.yaml b/changes/unreleased/Added-20260109-223536.yaml new file mode 100644 index 00000000..c38ab972 --- /dev/null +++ b/changes/unreleased/Added-20260109-223536.yaml @@ -0,0 +1,3 @@ +kind: Added +body: Enabled automatic etcd client ↔ server mode reconfiguration +time: 2026-01-09T22:35:36.887346+05:30