logo
Tags

Yingyu's Magic World

分析t_tray表格冗余问题

问题描述: t_tray表格里有冗余tray信息

任务:怎么删除tray里面过期的盘,zk怎么读取的

select * from t_tray;
  • 为什么他会有这么多t_tray之前offline的记录:应该是因为重新部署了环境,盘的uuid变了,他的监听是基于相同的uuid tray的,所以他应该无法更新到原先的那条记录了

  • uuid具体是什么

readDiskUuid 的逻辑是 “读取磁盘初始化时预写入的 UUID”,这个 UUID 可能是随机生成的(但生成动作发生在磁盘初始化阶段),而 readDiskUuid 仅负责读取和验证,确保磁盘是系统认可的托盘,并获取其唯一标识(tray_uuid)

根据jconductor/src/com/netbric/s5/cluster/ClusterManager.java

String trayOnZk = zkBaseDir + "/stores/"+store_id+"/trays";
...
List<String> trays = zk.getChildren(trayOnZk, null);
for(String t : trays)
{

    Tray tr = new Tray();
    tr.uuid = t;
    ...
}

可以看到zk里面的uuid是以trays里面的每个元素组成的,查看getChildren()方法

*** 目前没有看到代码删除的地方,尝试直接删除

数据库参考

MariaDB [s5]> select * from t_tray;
+--------------------------------------+---------------------------------+---------+---------------+-------------
| uuid                                 | device                          | status  | raw_capacity  | object_size 
+--------------------------------------+---------------------------------+---------+---------------+-------------
| 166f6fa1-f7c7-441a-aa05-55ff51b294db | trtype:PCIE traddr:0000.03.00.0 | OFFLINE |  500107862016 |    67108864 
| 1a4b1558-a886-4077-887f-5d226892d850 | /dev/nvme1n1                    | OFFLINE |  500107862016 |    67108864 
| 211558c6-c024-4bb9-9a0c-398f0959dbf7 | /dev/nvme4n1                    | OFFLINE | 2048408248320 |    67108864 
| 258d6c2a-ea1f-4f3f-8bc6-1958f1176b16 | trtype:PCIE traddr:0000.e1.00.0 | OFFLINE | 8001563222016 |    67108864 
| 280b716b-27b9-4606-b2a0-2a5e0faecad5 | trtype:PCIE traddr:0000.03.00.0 | OFFLINE | 1000204886016 |    67108864 
| 2aa3c7d4-d2e9-4aee-ae4a-6b3322a67ff8 | /dev/nvme4n1                    | OFFLINE | 8001563222016 |    67108864 
| 3481ebe8-13c4-40d4-9525-76acf8a354a8 | trtype:PCIE traddr:0000.63.00.0 | OFFLINE | 2048408248320 |    67108864 
| 3f40a7d1-054e-4fc1-9c2a-d31a2eccd2f2 | /dev/nvme0n1                    | OFFLINE | 2048408248320 |    67108864 
| 494d92b1-ae7f-482a-b7ad-9c41e269a65e | /dev/nvme2n1                    | OFFLINE | 2048408248320 |    67108864 
| 4ad5638f-0155-4820-a2a6-5ef697813ccc | trtype:PCIE traddr:0000.63.00.0 | OFFLINE | 2048408248320 |    67108864 
| 4b23f3e7-32fd-4a96-8949-4440a6f1dec1 | /dev/nvme1n1                    | OFFLINE | 1000204886016 |    67108864 
| 602b426e-e830-48b2-af78-255bc4c6ddae | /dev/nvme3n1                    | OFFLINE | 8001563222016 |    67108864 
| 670c0812-d96d-48d8-b25d-620661e07a28 | /dev/nvme2n1                    | OFFLINE | 2048408248320 |    67108864 
| 7054366f-c8f1-404f-925e-09bb04288df1 | /dev/nvme1n1                    | OK      | 8001563222016 |    67108864 
| 79969998-4739-4f8f-871f-2b76e1973bb0 | /dev/nvme0n1                    | OFFLINE |  500107862016 |    67108864 
| 88c73032-9b6b-493c-b826-2fccb4e245dd | /dev/nvme0n1                    | OK      | 2048408248320 |    67108864 
| 9dcae78a-355c-4551-8b19-759d900fee41 | /dev/nvme1n1                    | OFFLINE | 1000204886016 |    67108864 
| a8c2d665-b8c0-449a-96d5-8d10f4acc899 | trtype:PCIE traddr:0000.e2.00.0 | OFFLINE | 8001563222016 |    67108864 
| ae294cb4-ba8e-453a-9b1d-04ab03b4f693 | /dev/nvme3n1                    | OFFLINE | 8001563222016 |    67108864 
| ae6582a5-fd11-446c-a8de-22eb7a8a6540 | /dev/nvme1n1                    | OK      |  500107862016 |    67108864 
| af96a884-5f39-4d83-9684-7e318c2b08d7 | /dev/nvme1n1                    | OFFLINE | 1000204886016 |    67108864 
| b4f726d1-fffb-444a-89f9-814314acc680 | /dev/nvme1n1                    | OK      | 1000204886016 |    67108864 
| b843d7cc-86e9-42bd-b5a7-5bcb1585977d | /dev/nvme2n1                    | OFFLINE | 2048408248320 |    67108864 
| c6e9de59-8eeb-4941-9183-a34f6c6091a7 | trtype:PCIE traddr:0000.03.00.0 | OFFLINE | 1000204886016 |    67108864 
| cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c | /dev/nvme4n1                    | OK      | 2048408248320 |    67108864 
| d32bea16-d3a5-4105-842e-4367f2346e8b | trtype:PCIE traddr:0000.e1.00.0 | OFFLINE | 8001563222016 |    67108864 
| d3ab9c87-6ab9-45ca-8edc-79dcc2434c1a | /dev/nvme0n1                    | OFFLINE |  500107862016 |    67108864 
| d46569d5-a82a-47a3-b34d-608c8afb5e06 | /dev/nvme3n1                    | OK      | 8001563222016 |    67108864 
| da55e8e4-2fe7-4a86-8830-06628d66ac52 | trtype:PCIE traddr:0000.03.00.0 | OFFLINE |  500107862016 |    67108864 
| ddc05e25-ced0-4a34-82eb-e99bc54201bc | /dev/nvme1n1                    | OFFLINE | 8001563222016 |    67108864 
| dfb6fdfd-0f3b-45da-bff2-6011fde996ce | /dev/nvme4n2                    | OFFLINE | 8001563222016 |    67108864 
| eeedb93c-a1ce-4a09-acad-5c348510485a | /dev/nvme2n1                    | OFFLINE | 2048408248320 |    67108864 
| f1eaa584-1fb5-4842-ba5a-a4bbeb5381a1 | trtype:PCIE traddr:0000.04.00.0 | OFFLINE | 2048408248320 |    67108864 
| f7b27f08-0098-4b17-8db6-ffbe14c186d1 | /dev/nvme4n2                    | OFFLINE | 8001563222016 |    67108864 
+--------------------------------------+---------------------------------+---------+---------------+-------------
34 rows in set (0.000 sec)

保留7054366f-c8f1-404f-925e-09bb04288df1, cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c, d46569d5-a82a-47a3-b34d-608c8afb5e06 等状态为OK的盘对应的记录

ls /pureflash/cluster1/stores/1/trays
[211558c6-c024-4bb9-9a0c-398f0959dbf7, 258d6c2a-ea1f-4f3f-8bc6-1958f1176b16, 2aa3c7d4-d2e9-4aee-ae4a-6b3322a67ff8, 3481ebe8-13c4-40d4-9525-76acf8a354a8, 494d92b1-ae7f-482a-b7ad-9c41e269a65e, 4ad5638f-0155-4820-a2a6-5ef697813ccc, 602b426e-e830-48b2-af78-255bc4c6ddae, 670c0812-d96d-48d8-b25d-620661e07a28, 7054366f-c8f1-404f-925e-09bb04288df1, a8c2d665-b8c0-449a-96d5-8d10f4acc899, ae294cb4-ba8e-453a-9b1d-04ab03b4f693, b843d7cc-86e9-42bd-b5a7-5bcb1585977d, cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c, d32bea16-d3a5-4105-842e-4367f2346e8b, d46569d5-a82a-47a3-b34d-608c8afb5e06, ddc05e25-ced0-4a34-82eb-e99bc54201bc, dfb6fdfd-0f3b-45da-bff2-6011fde996ce, eeedb93c-a1ce-4a09-acad-5c348510485a, f7b27f08-0098-4b17-8db6-ffbe14c186d1]

  • 删除3481ebe8-13c4-40d4-9525-76acf8a354a8
    deleteall /pureflash/cluster1/stores/1/trays/3481ebe8-13c4-40d4-9525-76acf8a354a8
    
  • 如果删除重复/目录错误会报错
    Node does not exist: /pureflash/cluster1/stores/1/trays/3481ebe8-13c4-40d4-9525-76acf8a354a8
    
  • 数据库手动删除
    DELETE FROM t_tray 
    WHERE uuid = '211558c6-c024-4bb9-9a0c-398f0959dbf7' 
    AND store_id = 1;
    
  • 统一删除
    DELETE FROM t_tray WHERE status = 'OFFLINE';
    

删除过后数据库中计算空间错误的v_tray_alloc_size都没有了,说明是之前部署的时候分配的空间,在节点故障的时候没有清除。

冗余就是在例如/pureflash/cluster1/stores/1/trays/这个下面,有多个devname相同的条目

[zk: localhost:2181(CONNECTED) 25]  ls /pureflash/cluster1/stores/1/trays 
[7054366f-c8f1-404f-925e-09bb04288df1, b843d7cc-86e9-42bd-b5a7-5bcb1585977d, cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c, d32bea16-d3a5-4105-842e-4367f2346e8b, d46569d5-a82a-47a3-b34d-608c8afb5e06, ddc05e25-ced0-4a34-82eb-e99bc54201bc, dfb6fdfd-0f3b-45da-bff2-6011fde996ce, eeedb93c-a1ce-4a09-acad-5c348510485a, f7b27f08-0098-4b17-8db6-ffbe14c186d1]
[zk: localhost:2181(CONNECTED) 1] get /pureflash/cluster1/stores/1/trays/7054366f-c8f1-404f-925e-09bb04288df1/devname
/dev/nvme1n1
[zk: localhost:2181(CONNECTED) 2] get /pureflash/cluster1/stores/1/trays/b843d7cc-86e9-42bd-b5a7-5bcb1585977d/devname
/dev/nvme2n1
[zk: localhost:2181(CONNECTED) 3] get /pureflash/cluster1/stores/1/trays/cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c/devname
/dev/nvme4n1
[zk: localhost:2181(CONNECTED) 4] get /pureflash/cluster1/stores/1/trays/d32bea16-d3a5-4105-842e-4367f2346e8b/devname
trtype:PCIE traddr:0000.e1.00.0
[zk: localhost:2181(CONNECTED) 5] get /pureflash/cluster1/stores/1/trays/d46569d5-a82a-47a3-b34d-608c8afb5e06/devname
/dev/nvme3n1
[zk: localhost:2181(CONNECTED) 6] get /pureflash/cluster1/stores/1/trays/d46569d5-a82a-47a3-b34d-608c8afb5e06/devname
/dev/nvme3n1
[zk: localhost:2181(CONNECTED) 7] get /pureflash/cluster1/stores/1/trays/ddc05e25-ced0-4a34-82eb-e99bc54201bc/devname
/dev/nvme1n1
[zk: localhost:2181(CONNECTED) 8] get /pureflash/cluster1/stores/1/trays/dfb6fdfd-0f3b-45da-bff2-6011fde996ce/devname
/dev/nvme4n2
[zk: localhost:2181(CONNECTED) 9] get /pureflash/cluster1/stores/1/trays/eeedb93c-a1ce-4a09-acad-5c348510485a/devname
/dev/nvme2n1
[zk: localhost:2181(CONNECTED) 10] get /pureflash/cluster1/stores/1/trays/f7b27f08-0098-4b17-8db6-ffbe14c186d1/devname
/dev/nvme4n2

对于如何出现这些过期的盘猜测

怀疑因为节点掉了重新挂载,每次的uuid都不同(是由zk读取的,仅写入没有删除旧的)

(25.10.23)原因: 节点掉线后盘掉线了,但是没有清除zk里面的tray信息,导致之前旧的信息一直存在。这样即使清除了MariaDB数据库里的数据,重新部署pfstore时还是会读取zk里的原有数据。需要在pfstore里面加入清除旧数据的代码。

pfconductor会修改zk里面的内容吗

pfconductor修改zk数据的地方:ClusterManager,但是只能读取和更新状态,增加watcher检测并更新,无法删除zk里的数据。

可以在每次pfstore启动的时候把stores/store_id/trays下所有的内容都删除了

pfstore修改zk相关内容代码

注册tray的地方在pfstore PureFlash/pfs/src/pf_cluster.cpp里的register_tray方法:

/**
 * Register a tray (data disk) of a store into ZooKeeper.
 *
 * Creates the persistent node stores/<store_id>/trays/<uuid> together with its
 * devname / capacity / object_size child nodes, then marks the tray state "OK".
 *
 * @param store_id  id of the store node this tray belongs to
 * @param uuid      tray uuid (read back from the disk head, see readDiskUuid notes above)
 * @param devname   device name, e.g. "/dev/nvme1n1" or an SPDK traddr string
 * @param capacity  raw capacity of the tray, in bytes
 * @param obj_size  object size used on this tray, in bytes
 * @return 0 (ZOK) on success, or the ZooKeeper error code of the first failed create.
 *         NOTE(review): on a mid-way failure the nodes already created are left
 *         behind in ZK -- one possible source of the stale-tray redundancy
 *         analyzed in this document.
 */
int register_tray(int store_id, const uuid_t uuid, const char* devname, int64_t capacity, int64_t obj_size)
{
	char zk_node_name[128];
	char value_buf[128];
	char uuid_str[64];
	int rc;
	uuid_unparse(uuid, uuid_str);
	snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s", store_id, uuid_str);
	if ((rc = app_context.zk_client.create_node(zk_node_name, false, NULL)) != ZOK)
		return rc;

	snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s/devname", store_id, uuid_str);
	if ((rc = app_context.zk_client.create_node(zk_node_name, false, devname)) != ZOK)
		return rc;

	// Use %lld with an explicit cast: int64_t is not guaranteed to be `long`,
	// so the original "%ld" format is not portable.
	snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s/capacity", store_id, uuid_str);
	snprintf(value_buf, sizeof(value_buf), "%lld", (long long)capacity);
	if ((rc = app_context.zk_client.create_node(zk_node_name, false, value_buf)) != ZOK)
		return rc;
	snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s/object_size", store_id, uuid_str);
	snprintf(value_buf, sizeof(value_buf), "%lld", (long long)obj_size);
	if ((rc = app_context.zk_client.create_node(zk_node_name, false, value_buf)) != ZOK)
		return rc;
	// NOTE(review): return value of set_tray_state is ignored here -- confirm
	// whether a failed state update should also fail the registration.
	set_tray_state(store_id, uuid, "OK", true);
	return 0;
}

[修改1] 直接删除节点所有tray信息

  • PureFlash/pfs/src/pf_main.cpp: 251023-image2

[修改1] 启动前清理共享盘旧数据; 删除所有tray节点数据,清理旧数据.

增加delete_old_shard_disk思路:

  • PureFlash/common/src/pf_zk_client.cpp:

在zk_client里面加一个get_data_shared_disk_uuid的函数,否则无法枚举之前注册在shared_disks下的uuid

251023-image3

/**
 * List the uuids of all shared disks registered in ZooKeeper.
 *
 * Reads the children of /pureflash/<cluster_name>/shared_disks; each child
 * node name is itself a shared-disk uuid.
 *
 * @return the uuid list; empty if the path has no children or on ZK error
 *         (the error is logged, not propagated to the caller).
 */
std::vector<std::string> PfZkClient::get_data_shared_disk_uuid(void)
{
	std::vector<std::string> uuid_list;
	int rc = 0;
	char zk_path[256];

	// Parent path of all shared-disk nodes: /pureflash/<cluster>/shared_disks
	snprintf(zk_path, sizeof(zk_path), "/pureflash/%s/shared_disks", cluster_name.c_str());

	struct String_vector children = { 0 };
	rc = zoo_get_children(zkhandle, zk_path, 0, &children);
	if (rc != ZOK) {
		S5LOG_ERROR("Failed to get children on path:%s, rc:%d", zk_path, rc);
		return uuid_list;
	}
	// Release the C string vector on every return path below.
	DeferCall _a([&children]() { deallocate_String_vector(&children); });
	if (children.count <= 0) {
		S5LOG_INFO("No children under path:%s", zk_path);
		return uuid_list;
	}

	// Each child node name is a shared-disk uuid; copy them into the result.
	uuid_list.reserve(children.count);
	for (int i = 0; i < children.count; ++i)
		uuid_list.emplace_back(children.data[i]);

	// Cast to int: %d must not receive a size_t (was a format mismatch).
	S5LOG_INFO("Got %d shared disk UUIDs from ZK", (int)uuid_list.size());
	return uuid_list;
}

  • PureFlash/common/include/pf_zk_client.h:增加相关头文件
// ZooKeeper client wrapper used by pfstore/pfconductor to publish and read
// cluster state under /pureflash/<cluster_name>/.
class PfZkClient {
public:
	PfZkClient(){ zkhandle = NULL; }
	~PfZkClient();
	// Connect to ZK at zk_ip with the given session timeout, scoped to cluster_name.
	int init(const char* zk_ip, int zk_timeout, const char* cluster_name);
	// Create a node (ephemeral or persistent) with optional data.
	int create_node(const std::string& node_path, bool is_ephemeral, const char* node_data);
	// Delete a single ZK node.
	int delete_node(const std::string& node_path);
	// Block until the lock node at lock_path is acquired for myid.
	int wait_lock(const std::string& lock_path, const char* myid);
	// Read the data-port value published for store_id / port_idx.
	std::string get_data_port(int store_id, int port_idx);
	std::vector<std::string> get_data_shared_disk_uuid(void);  // new: list all shared-disk uuids under shared_disks/
	// Watch ownership changes of a shared disk; on_new_owner fires with the new owner id.
	int watch_disk_owner(const char* disk_uuid, std::function<void(const char*)> on_new_owner);
	//members:
	zhandle_t *zkhandle;        // ZooKeeper session handle, NULL until init()
	std::string cluster_name;   // cluster name, prefixes every ZK path
};

#endif //PUREFLASH_S5_ZK_CLIENT_H
  • PureFlash/pfs/src/pf_cluster.cpp 251023-image4
    /**
     * Remove this store's stale ownership nodes under every shared disk in ZK.
     *
     * For each uuid under /pureflash/<cluster>/shared_disks, deletes the child
     * node shared_disks/<uuid>/<store_id> left over from a previous run, so a
     * redeployed pfstore does not inherit stale shared-disk records.
     *
     * @param store_id  id of this store node
     * @return 0 if every delete succeeded; otherwise the error code of the
     *         FIRST failed delete (remaining disks are still attempted --
     *         the original version returned only the last rc, silently
     *         dropping earlier failures).
     */
    int delete_old_shard_disk(int store_id)
    {
        // 128 bytes: "shared_disks/" + 36-char uuid + "/" + store_id digits
        // (the original 64-byte buffer was tight).
        char zk_node_name[128];
        int first_err = 0;

        // All shared-disk uuids currently registered in ZK.
        std::vector<std::string> all_uuids = app_context.zk_client.get_data_shared_disk_uuid();

        for (const std::string& uuid : all_uuids) {
            S5LOG_INFO("Shared disk UUID: %s", uuid.c_str());
            snprintf(zk_node_name, sizeof(zk_node_name), "shared_disks/%s/%d", uuid.c_str(), store_id);
            int rc = app_context.zk_client.delete_node(zk_node_name);
            if (rc != 0 && first_err == 0)
                first_err = rc;   // remember the first failure, keep cleaning the rest
        }

        return first_err;
    }
    
  • PureFlash/pfs/src/pf_main.cpp:主函数

在遍历tray之前: 251023-image6

// delete existing trays & shard_disk
/* 启动前清理共享盘旧数据; 删除所有tray节点数据,清理旧数据 */

int rc_delete;
rc_delete = delete_old_shard_disk(store_id);

更新后的pfstore代码需要重启pfstore服务才能生效

  • 编译命令
    cd /home/flyslice/yangxiao/cocalele/PureFlash
    mkdir build_deb; cd build_deb && cmake -GNinja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_MAKE_PROGRAM=/usr/bin/ninja .. && ninja
    
  • 重启服务
    source /home/flyslice/yangxiao/cocalele/PureFlash/build_deb/env.sh
    nohup pfs -c /etc/pureflash/pfs.conf &
    
  • 需要先停止conductor服务并且重启
    source /home/flyslice/yangxiao/cocalele/jconductor/env-pfc.sh
    nohup pfc -c /etc/pureflash/pfc.conf &