etcd 数据备份、数据恢复、数据压缩与碎片整理

本文介绍：etcd 数据备份、使用备份数据重建数据库、数据压缩以及碎片整理。

#!/usr/bin/python3
# encoding: utf-8
#filename: etcd-backups-restore-compress-defragmentation.py
#author: gaohaixiang
#writetime:202401161055

"""
脚本功能：
etcd 数据备份，使用备份数据进行数据库重构，数据压缩，碎片整理

数据压缩及碎片整理的原因：
etcd数据写入频繁，导致版本不断叠加，从而导致数据库不断变大 
需要对其进行压缩，进行碎片整理，从而减小etcd数据库的大小

etcd默认的数据存储大小为2G，当超过这个存储大小时，可能会限制数据写入，
或者报错 mvcc: database space exceeded 并触发 NOSPACE 告警。除了进行数据压缩和碎片整理外，还可以调整启动参数：
etcd启动时添加参数 --quota-backend-bytes ，用于指定etcd的存储配额，
单位为字节(B)，例如 10737418240 为10G

etcd启动示例:
/opt/etcd/etcd --quota-backend-bytes=10737418240 --auth-token jwt --config-file=/opt/etcd/nodefile.yml

注意：
备份恢复是选择最新的一个备份进行数据恢复，不是指定固定的备份来进行操作

"""

import json
import subprocess
import datetime
import os
import stat
import getpass

def run_command(command):
    """Run ``command`` through the shell and capture its output.

    Args:
        command: Shell command line to execute.

    Returns:
        A ``(returncode, stdout, stderr)`` tuple. Both streams are the raw
        bytes captured from the child process.
    """
    completed = subprocess.run(
        command,
        shell=True,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
    )
    return completed.returncode, completed.stdout, completed.stderr

def check_directory_access(directory):
    """Make sure ``directory`` exists and is readable and writable.

    A missing directory is created, including parents. When the directory
    was just created, or the current user lacks read or write access to it,
    its mode is set to ``rwxrwxrwx``.

    Args:
        directory: Path of the directory to prepare.

    Returns:
        True when the directory exists and is readable and writable by the
        current user afterwards. False when it could not be created, its
        permissions could not be changed, or it is still not accessible.
    """
    # NOTE(review): the world-writable mode is kept from the original script;
    # consider a tighter mode for directories that hold etcd snapshots.
    open_mode = stat.S_IRWXU | stat.S_IRWXG | stat.S_IRWXO
    needed = os.R_OK | os.W_OK

    try:
        created = not os.path.isdir(directory)
        if created:
            os.makedirs(directory, exist_ok=True)
        if created or not os.access(directory, needed):
            os.chmod(directory, open_mode)
    except OSError as exc:
        # Report the failure through the return value so that callers'
        # ``if not check_directory_access(...)`` guards actually take effect.
        print(f"Cannot prepare directory {directory}. Error: {exc}")
        return False

    # Re-check after the chmod instead of assuming it was enough.
    if not os.access(directory, needed):
        print(f"Directory {directory} is not readable and writable by the current user")
        return False
    return True

def backup_etcd(endpoints, backup_dir):
    """Save an etcd snapshot into ``backup_dir``.

    The snapshot is written as ``etcd_backup_<YYYYmmddHHMMSS>.db`` by
    running ``etcdctl snapshot save`` through ``run_command``.

    Args:
        endpoints: Value passed to ``etcdctl --endpoints``.
        backup_dir: Directory that receives the snapshot file.

    Returns:
        True when the directory check passes and etcdctl exits with status 0,
        otherwise False.
    """
    if not check_directory_access(backup_dir):
        return False

    timestamp = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    # os.path.join avoids a doubled separator when backup_dir ends with "/".
    backup_file = os.path.join(backup_dir, f"etcd_backup_{timestamp}.db")

    backup_command = f"ETCDCTL_API=3 etcdctl --endpoints={endpoints} snapshot save {backup_file}"
    ret, _stdout, stderr = run_command(backup_command)
    if ret != 0:
        # stderr arrives as bytes; decode it so the message is readable text
        # rather than a b'...' repr with escaped newlines.
        error_text = stderr.decode("utf-8", errors="replace").strip()
        print(f"Failed to backup etcd. Error: {error_text}")
        return False

    print(f"Etcd backup completed successfully. Backup file is {backup_file}")
    return True

def restore_etcd(backup_dir, restore_dir, name, initial_cluster, initial_advertise_peer_urls):
    """Restore the newest snapshot found in ``backup_dir`` into ``restore_dir``.

    The newest snapshot is always used; a specific backup cannot be chosen.

    Args:
        backup_dir: Directory containing ``etcd_backup_<timestamp>.db`` files.
        restore_dir: Value passed to ``etcdctl snapshot restore --data-dir``.
        name: Value passed to ``--name``.
        initial_cluster: Value passed to ``--initial-cluster``.
        initial_advertise_peer_urls: Value passed to
            ``--initial-advertise-peer-urls``.

    Returns:
        True when etcdctl exits with status 0. False when a directory check
        fails, no snapshot is found, or etcdctl reports an error.
    """
    # NOTE(review): check_directory_access creates restore_dir when it is
    # missing. Some etcdctl releases may refuse to restore into a data dir
    # that already exists -- confirm against the deployed etcd version.
    if not check_directory_access(backup_dir) or not check_directory_access(restore_dir):
        return False

    # Only consider files named the way backup_etcd names its snapshots
    # (etcd_backup_<timestamp>.db). Any other file in the directory, such as
    # an unrelated file or a partially written snapshot, must not be restored.
    snapshots = [
        entry
        for entry in os.listdir(backup_dir)
        if entry.startswith("etcd_backup_")
        and entry.endswith(".db")
        and os.path.isfile(os.path.join(backup_dir, entry))
    ]
    if not snapshots:
        print("No backup files found")
        return False

    # The timestamp in the name is fixed-width, so the lexicographically
    # largest name belongs to the newest snapshot.
    backup_file = os.path.join(backup_dir, max(snapshots))

    restore_command = f"ETCDCTL_API=3 etcdctl snapshot restore {backup_file} --name {name} --data-dir {restore_dir} --initial-cluster {initial_cluster} --initial-advertise-peer-urls {initial_advertise_peer_urls}"
    ret, _stdout, stderr = run_command(restore_command)
    if ret != 0:
        # stderr arrives as bytes; decode it so the message is readable text.
        error_text = stderr.decode("utf-8", errors="replace").strip()
        print(f"Failed to restore etcd. Error: {error_text}")
        return False

    print(f"Etcd restore completed successfully. Restored data is in {restore_dir}")
    return True

def compact_and_defrag(endpoints):
    """Compact etcd's key history, then defragment the given endpoints.

    The current revision is read from ``etcdctl endpoint status`` and passed
    to ``etcdctl compact``; afterwards ``etcdctl defrag`` is run against
    ``endpoints``. Progress and failures are reported with ``print``.

    Args:
        endpoints: Value passed to ``etcdctl --endpoints``.

    Returns:
        None. The function returns early after printing an error when any
        step fails.
    """
    status_command = f"ETCDCTL_API=3 etcdctl --endpoints={endpoints} endpoint status --write-out=json"
    ret, stdout, stderr = run_command(status_command)
    if ret != 0:
        error_text = stderr.decode("utf-8", errors="replace").strip()
        print(f"Failed to get etcd status. Error: {error_text}")
        return

    # ``compact`` expects a key-value store revision, which the status output
    # reports under Status.header.revision. Status.raftIndex is the Raft log
    # index, a different counter, and must not be used here.
    try:
        revisions = [
            int(entry['Status']['header']['revision'])
            for entry in json.loads(stdout)
        ]
    except (ValueError, KeyError, TypeError) as exc:
        print(f"Failed to parse etcd status. Error: {exc!r}")
        return
    if not revisions:
        print("Failed to get etcd status. Error: no endpoint status returned")
        return
    revision = max(revisions)

    # Compact once: repeating the same compaction for every status entry
    # would only re-request a revision that has already been compacted.
    compact_command = f"ETCDCTL_API=3 etcdctl --endpoints={endpoints} compact {revision}"
    ret, _stdout, stderr = run_command(compact_command)
    if ret != 0:
        error_text = stderr.decode("utf-8", errors="replace").strip()
        print(f"Failed to compact etcd. Error: {error_text}")
        return

    # A single defrag call is issued with the full --endpoints value, so it
    # does not need to be repeated per status entry either.
    defrag_command = f"ETCDCTL_API=3 etcdctl --endpoints={endpoints} defrag"
    ret, _stdout, stderr = run_command(defrag_command)
    if ret != 0:
        error_text = stderr.decode("utf-8", errors="replace").strip()
        print(f"Failed to defrag etcd. Error: {error_text}")
        return

    print("Etcd compact and defrag completed successfully")


def etcd_start(timenow,restore_dir):
    """Write a config file for the restored data and restart etcd with it.

    Steps:
      1. Write ``/data/etcd/nodefile<timenow>.yml`` pointing ``data-dir`` at
         ``restore_dir``.
      2. Send SIGKILL (``kill -9``) to every process matched by the
         ``ps | grep etcd | grep config-file`` pipeline.
      3. Launch ``etcd --config-file=<that file>`` in the background,
         appending its stdout to ``/data/etcd/etcd<timenow>.log``.

    Args:
        timenow: Timestamp string used in the config and log file names.
        restore_dir: Data directory written into the config file.

    Returns:
        None. Failures are reported with ``print``.
    """
    # NOTE(review): the client URLs use port 2380 and the peer URLs use port
    # 2379 here, while main() reaches the client API on 2379 -- confirm the
    # two ports are not swapped before relying on this config.
    etcdfiledata = """
    name: node1
    data-dir: %s
    listen-client-urls: 'http://192.168.73.10:2380'
    advertise-client-urls: 'http://192.168.73.10:2380'
    listen-peer-urls: 'http://192.168.73.10:2379'
    initial-advertise-peer-urls: 'http://192.168.73.10:2379'
    initial-cluster: node1=http://192.168.73.10:2379
    initial-cluster-token: etcd-cluster-1
    initial-cluster-state: new
    """ % restore_dir

    etcdfile = "/data/etcd/nodefile%s.yml" % timenow
    # The context manager closes the file even if the write raises.
    with open(etcdfile, "w", encoding="utf-8") as config_file:
        config_file.write(etcdfiledata)

    # Stop the running etcd.
    etcdKillCommand = "ps -ef |grep etcd|grep 'config-file'|grep -v grep|awk '{print $2}'|xargs kill -9"
    ret, _stdout, stderr = run_command(etcdKillCommand)
    if ret != 0:
        # Treated as non-fatal so the start below still happens; presumably
        # this also triggers when no etcd process was running.
        error_text = stderr.decode("utf-8", errors="replace").strip()
        print(f"Failed to stop etcd. Error: {error_text}")

    # Start etcd detached from this script.
    etcdStartCommand = "setsid nohup etcd --config-file=%s >> /data/etcd/etcd%s.log &" % (etcdfile,timenow)
    launcher = subprocess.Popen(etcdStartCommand, shell=True)
    # The command ends with "&", so this waits for the launcher shell only.
    # A zero status does not prove that etcd itself came up.
    exit_code = launcher.wait()
    if exit_code != 0:
        print(f"Failed to start etcd. Launcher shell exited with status {exit_code}")

def main():
    """Configure the target etcd node and take a snapshot backup.

    The compaction and restore workflows are kept as commented-out
    alternatives that can be enabled by hand.
    """
    # Connection and naming settings for the etcd node being managed.
    endpoints = "http://192.168.73.10:2379"  # address of your etcd node
    name = "node1"  # name of your etcd node
    initial_cluster = "node1=http://192.168.73.10:2379"  # your initial cluster layout
    initial_advertise_peer_urls = "http://192.168.73.10:2379"  # your initial advertised peer URL

    # Filesystem locations; the restore directory is stamped with the current time.
    backup_dir = "/data/etcd/etcddatabak/"  # your backup directory
    timenow = datetime.datetime.now().strftime('%Y%m%d%H%M%S')
    restore_dir = f"/data/etcd/etcddata{timenow}"  # your restore directory

    # Alternative workflow: back up first, then compact and defragment.
    # if backup_etcd(endpoints, backup_dir):
    #     compact_and_defrag(endpoints)

    # Take the backup.
    backup_etcd(endpoints, backup_dir)

    # Alternative workflow: restore from the newest backup, then start etcd.
    # restore_etcd(backup_dir, restore_dir, name, initial_cluster, initial_advertise_peer_urls)
    # etcd_start(timenow, restore_dir)


# Run the workflow only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()