Analysis of the t_tray table redundancy problem
Problem description: the t_tray table contains redundant tray entries.
Tasks: figure out how to delete stale disks from the tray records, and how the data is read from zk.
select * from t_tray;
-
Why are there so many old OFFLINE records in t_tray: most likely because the environment was redeployed and the disk uuids changed; the watch is keyed on the tray with the same uuid, so it presumably can no longer update the original records.
-
What exactly is the uuid?
The logic of readDiskUuid is "read the UUID that was pre-written when the disk was initialized". The UUID may be randomly generated (but the generation happens at disk-initialization time); readDiskUuid itself only reads and validates it, confirming that the disk is a tray recognized by the system, and returns its unique identifier (tray_uuid).
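Purely as an illustration of that read-and-validate flow (this is not the actual readDiskUuid implementation: the device offset, label length, and return conventions below are all assumptions), a minimal C++ sketch might look like this:

// Illustration only: assumes the tray UUID was pre-written as a 36-character
// text label at a fixed offset on the device during initialization; the real
// PureFlash on-disk layout may differ.
#include <fcntl.h>
#include <unistd.h>
#include <uuid/uuid.h>

static int read_disk_uuid(const char* devname, uuid_t out_uuid)
{
    const off_t UUID_OFFSET = 4096;   // assumed location of the label
    char buf[37] = {0};
    int fd = open(devname, O_RDONLY);
    if (fd < 0)
        return -1;
    ssize_t n = pread(fd, buf, 36, UUID_OFFSET);
    close(fd);
    if (n != 36)
        return -1;                    // no readable label on this disk
    // validation: uuid_parse() fails unless the label is a well-formed UUID,
    // i.e. the disk is not a tray the system recognizes
    if (uuid_parse(buf, out_uuid) != 0)
        return -1;
    return 0;                         // out_uuid now holds the tray_uuid
}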
From jconductor/src/com/netbric/s5/cluster/ClusterManager.java:
String trayOnZk = zkBaseDir + "/stores/" + store_id + "/trays";
...
List<String> trays = zk.getChildren(trayOnZk, null);
for (String t : trays)
{
    Tray tr = new Tray();
    tr.uuid = t;
    ...
}
As can be seen, the uuids in zk are simply the individual child entries under trays; see the getChildren() method. A sketch of the same listing via the ZooKeeper C API follows.
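For reference, the same listing can be done with the ZooKeeper C client, which is what the pfstore side links against. This is only a sketch, assuming an already-connected zhandle_t and the full trays path are supplied by the caller:

#include <zookeeper/zookeeper.h>
#include <string>
#include <vector>

// Sketch: list the tray uuids (child node names) under a store's trays path,
// e.g. "/pureflash/cluster1/stores/1/trays".
static std::vector<std::string> list_tray_uuids(zhandle_t* zkhandle, const char* trays_path)
{
    std::vector<std::string> uuids;
    struct String_vector children = {0};
    if (zoo_get_children(zkhandle, trays_path, 0, &children) != ZOK)
        return uuids;                          // empty on error
    for (int i = 0; i < children.count; ++i)
        uuids.emplace_back(children.data[i]);  // child name == tray uuid
    deallocate_String_vector(&children);
    return uuids;
}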
*** No place in the code has been found so far that deletes these entries; try deleting them directly.
Database reference:
MariaDB [s5]> select * from t_tray;
+--------------------------------------+---------------------------------+---------+---------------+-------------
| uuid | device | status | raw_capacity | object_size
+--------------------------------------+---------------------------------+---------+---------------+-------------
| 166f6fa1-f7c7-441a-aa05-55ff51b294db | trtype:PCIE traddr:0000.03.00.0 | OFFLINE | 500107862016 | 67108864
| 1a4b1558-a886-4077-887f-5d226892d850 | /dev/nvme1n1 | OFFLINE | 500107862016 | 67108864
| 211558c6-c024-4bb9-9a0c-398f0959dbf7 | /dev/nvme4n1 | OFFLINE | 2048408248320 | 67108864
| 258d6c2a-ea1f-4f3f-8bc6-1958f1176b16 | trtype:PCIE traddr:0000.e1.00.0 | OFFLINE | 8001563222016 | 67108864
| 280b716b-27b9-4606-b2a0-2a5e0faecad5 | trtype:PCIE traddr:0000.03.00.0 | OFFLINE | 1000204886016 | 67108864
| 2aa3c7d4-d2e9-4aee-ae4a-6b3322a67ff8 | /dev/nvme4n1 | OFFLINE | 8001563222016 | 67108864
| 3481ebe8-13c4-40d4-9525-76acf8a354a8 | trtype:PCIE traddr:0000.63.00.0 | OFFLINE | 2048408248320 | 67108864
| 3f40a7d1-054e-4fc1-9c2a-d31a2eccd2f2 | /dev/nvme0n1 | OFFLINE | 2048408248320 | 67108864
| 494d92b1-ae7f-482a-b7ad-9c41e269a65e | /dev/nvme2n1 | OFFLINE | 2048408248320 | 67108864
| 4ad5638f-0155-4820-a2a6-5ef697813ccc | trtype:PCIE traddr:0000.63.00.0 | OFFLINE | 2048408248320 | 67108864
| 4b23f3e7-32fd-4a96-8949-4440a6f1dec1 | /dev/nvme1n1 | OFFLINE | 1000204886016 | 67108864
| 602b426e-e830-48b2-af78-255bc4c6ddae | /dev/nvme3n1 | OFFLINE | 8001563222016 | 67108864
| 670c0812-d96d-48d8-b25d-620661e07a28 | /dev/nvme2n1 | OFFLINE | 2048408248320 | 67108864
| 7054366f-c8f1-404f-925e-09bb04288df1 | /dev/nvme1n1 | OK | 8001563222016 | 67108864
| 79969998-4739-4f8f-871f-2b76e1973bb0 | /dev/nvme0n1 | OFFLINE | 500107862016 | 67108864
| 88c73032-9b6b-493c-b826-2fccb4e245dd | /dev/nvme0n1 | OK | 2048408248320 | 67108864
| 9dcae78a-355c-4551-8b19-759d900fee41 | /dev/nvme1n1 | OFFLINE | 1000204886016 | 67108864
| a8c2d665-b8c0-449a-96d5-8d10f4acc899 | trtype:PCIE traddr:0000.e2.00.0 | OFFLINE | 8001563222016 | 67108864
| ae294cb4-ba8e-453a-9b1d-04ab03b4f693 | /dev/nvme3n1 | OFFLINE | 8001563222016 | 67108864
| ae6582a5-fd11-446c-a8de-22eb7a8a6540 | /dev/nvme1n1 | OK | 500107862016 | 67108864
| af96a884-5f39-4d83-9684-7e318c2b08d7 | /dev/nvme1n1 | OFFLINE | 1000204886016 | 67108864
| b4f726d1-fffb-444a-89f9-814314acc680 | /dev/nvme1n1 | OK | 1000204886016 | 67108864
| b843d7cc-86e9-42bd-b5a7-5bcb1585977d | /dev/nvme2n1 | OFFLINE | 2048408248320 | 67108864
| c6e9de59-8eeb-4941-9183-a34f6c6091a7 | trtype:PCIE traddr:0000.03.00.0 | OFFLINE | 1000204886016 | 67108864
| cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c | /dev/nvme4n1 | OK | 2048408248320 | 67108864
| d32bea16-d3a5-4105-842e-4367f2346e8b | trtype:PCIE traddr:0000.e1.00.0 | OFFLINE | 8001563222016 | 67108864
| d3ab9c87-6ab9-45ca-8edc-79dcc2434c1a | /dev/nvme0n1 | OFFLINE | 500107862016 | 67108864
| d46569d5-a82a-47a3-b34d-608c8afb5e06 | /dev/nvme3n1 | OK | 8001563222016 | 67108864
| da55e8e4-2fe7-4a86-8830-06628d66ac52 | trtype:PCIE traddr:0000.03.00.0 | OFFLINE | 500107862016 | 67108864
| ddc05e25-ced0-4a34-82eb-e99bc54201bc | /dev/nvme1n1 | OFFLINE | 8001563222016 | 67108864
| dfb6fdfd-0f3b-45da-bff2-6011fde996ce | /dev/nvme4n2 | OFFLINE | 8001563222016 | 67108864
| eeedb93c-a1ce-4a09-acad-5c348510485a | /dev/nvme2n1 | OFFLINE | 2048408248320 | 67108864
| f1eaa584-1fb5-4842-ba5a-a4bbeb5381a1 | trtype:PCIE traddr:0000.04.00.0 | OFFLINE | 2048408248320 | 67108864
| f7b27f08-0098-4b17-8db6-ffbe14c186d1 | /dev/nvme4n2 | OFFLINE | 8001563222016 | 67108864
+--------------------------------------+---------------------------------+---------+---------------+-------------
34 rows in set (0.000 sec)
Keep 7054366f-c8f1-404f-925e-09bb04288df1, cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c, d46569d5-a82a-47a3-b34d-608c8afb5e06, ...
ls /pureflash/cluster1/stores/1/trays
[211558c6-c024-4bb9-9a0c-398f0959dbf7, 258d6c2a-ea1f-4f3f-8bc6-1958f1176b16, 2aa3c7d4-d2e9-4aee-ae4a-6b3322a67ff8, 3481ebe8-13c4-40d4-9525-76acf8a354a8, 494d92b1-ae7f-482a-b7ad-9c41e269a65e, 4ad5638f-0155-4820-a2a6-5ef697813ccc, 602b426e-e830-48b2-af78-255bc4c6ddae, 670c0812-d96d-48d8-b25d-620661e07a28, 7054366f-c8f1-404f-925e-09bb04288df1, a8c2d665-b8c0-449a-96d5-8d10f4acc899, ae294cb4-ba8e-453a-9b1d-04ab03b4f693, b843d7cc-86e9-42bd-b5a7-5bcb1585977d, cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c, d32bea16-d3a5-4105-842e-4367f2346e8b, d46569d5-a82a-47a3-b34d-608c8afb5e06, ddc05e25-ced0-4a34-82eb-e99bc54201bc, dfb6fdfd-0f3b-45da-bff2-6011fde996ce, eeedb93c-a1ce-4a09-acad-5c348510485a, f7b27f08-0098-4b17-8db6-ffbe14c186d1]
- Delete one entry, e.g. 3481ebe8-13c4-40d4-9525-76acf8a354a8:
deleteall /pureflash/cluster1/stores/1/trays/3481ebe8-13c4-40d4-9525-76acf8a354a8
- If the deletion is repeated or the path is wrong, zkCli reports an error:
Node does not exist: /pureflash/cluster1/stores/1/trays/3481ebe8-13c4-40d4-9525-76acf8a354a8
- Delete manually from the database:
DELETE FROM t_tray WHERE uuid = '211558c6-c024-4bb9-9a0c-398f0959dbf7' AND store_id = 1;
- Delete all OFFLINE entries in one go:
DELETE FROM t_tray WHERE status = 'OFFLINE';
After these deletions, the entries with incorrect v_tray_alloc_size space accounting in the database are all gone, which suggests this space was allocated during an earlier deployment and was never cleaned up when the node failed.
The redundancy is that under e.g. /pureflash/cluster1/stores/1/trays/ there are multiple entries with the same devname:
[zk: localhost:2181(CONNECTED) 25] ls /pureflash/cluster1/stores/1/trays
[7054366f-c8f1-404f-925e-09bb04288df1, b843d7cc-86e9-42bd-b5a7-5bcb1585977d, cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c, d32bea16-d3a5-4105-842e-4367f2346e8b, d46569d5-a82a-47a3-b34d-608c8afb5e06, ddc05e25-ced0-4a34-82eb-e99bc54201bc, dfb6fdfd-0f3b-45da-bff2-6011fde996ce, eeedb93c-a1ce-4a09-acad-5c348510485a, f7b27f08-0098-4b17-8db6-ffbe14c186d1]
[zk: localhost:2181(CONNECTED) 1] get /pureflash/cluster1/stores/1/trays/7054366f-c8f1-404f-925e-09bb04288df1/devname
/dev/nvme1n1
[zk: localhost:2181(CONNECTED) 2] get /pureflash/cluster1/stores/1/trays/b843d7cc-86e9-42bd-b5a7-5bcb1585977d/devname
/dev/nvme2n1
[zk: localhost:2181(CONNECTED) 3] get /pureflash/cluster1/stores/1/trays/cd7d26e6-9d99-4ea9-b31d-bc57b2a6c43c/devname
/dev/nvme4n1
[zk: localhost:2181(CONNECTED) 4] get /pureflash/cluster1/stores/1/trays/d32bea16-d3a5-4105-842e-4367f2346e8b/devname
trtype:PCIE traddr:0000.e1.00.0
[zk: localhost:2181(CONNECTED) 5] get /pureflash/cluster1/stores/1/trays/d46569d5-a82a-47a3-b34d-608c8afb5e06/devname
/dev/nvme3n1
[zk: localhost:2181(CONNECTED) 7] get /pureflash/cluster1/stores/1/trays/ddc05e25-ced0-4a34-82eb-e99bc54201bc/devname
/dev/nvme1n1
[zk: localhost:2181(CONNECTED) 8] get /pureflash/cluster1/stores/1/trays/dfb6fdfd-0f3b-45da-bff2-6011fde996ce/devname
/dev/nvme4n2
[zk: localhost:2181(CONNECTED) 9] get /pureflash/cluster1/stores/1/trays/eeedb93c-a1ce-4a09-acad-5c348510485a/devname
/dev/nvme2n1
[zk: localhost:2181(CONNECTED) 10] get /pureflash/cluster1/stores/1/trays/f7b27f08-0098-4b17-8db6-ffbe14c186d1/devname
/dev/nvme4n2
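To spot such duplicates without scanning the dump by eye, the tray nodes can be grouped by their devname value. The following is only a sketch against the ZooKeeper C client; the connected zhandle_t and the trays path are assumed to be provided by the caller:

#include <zookeeper/zookeeper.h>
#include <cstdio>
#include <map>
#include <string>
#include <vector>

// Sketch: read each tray's devname child and report devnames that occur more
// than once under a path like "/pureflash/cluster1/stores/1/trays".
static void report_duplicate_devnames(zhandle_t* zkhandle, const std::string& trays_path)
{
    struct String_vector children = {0};
    if (zoo_get_children(zkhandle, trays_path.c_str(), 0, &children) != ZOK)
        return;
    std::map<std::string, std::vector<std::string>> by_devname;  // devname -> tray uuids
    for (int i = 0; i < children.count; ++i) {
        std::string node = trays_path + "/" + children.data[i] + "/devname";
        char buf[256];
        int len = sizeof(buf);
        if (zoo_get(zkhandle, node.c_str(), 0, buf, &len, NULL) == ZOK && len > 0)
            by_devname[std::string(buf, len)].push_back(children.data[i]);
    }
    deallocate_String_vector(&children);
    for (const auto& kv : by_devname)
        if (kv.second.size() > 1)
            printf("devname %s has %zu tray entries\n", kv.first.c_str(), kv.second.size());
}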
Guesses about how these stale disks came to exist
Suspicion: the node dropped and the disks were re-mounted, and the uuid is different each time (the info is read from zk; new entries are only written, old ones are never deleted).
(25.10.23) Cause: when a node goes offline its disks go offline too, but the tray information in zk is never cleaned up, so the old entries stay around. As a result, even after clearing the data in MariaDB, redeploying pfstore still reads the stale data back from zk. Code that cleans up the old data needs to be added to pfstore.
Does pfconductor modify the contents of zk?
Where pfconductor touches zk data: ClusterManager, but it can only read and update state (it adds watchers to detect changes and update); it does not delete data from zk.
One option is to delete everything under stores/store_id/trays every time pfstore starts, as sketched below.
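A minimal sketch of that startup cleanup, written as a fragment meant to sit next to register_tray in pf_cluster.cpp (shown below) so that app_context is in scope. Assumptions: PfZkClient::delete_node() takes a path relative to /pureflash/<cluster> (as create_node() does in register_tray) and does not delete recursively, so each tray's child nodes are removed before the tray node itself; the helper name delete_all_trays is hypothetical:

// Hypothetical helper: wipe all tray nodes under stores/<store_id>/trays at
// pfstore startup, so stale entries from earlier deployments disappear.
static int delete_all_trays(int store_id)
{
    char rel_path[128];
    char full_path[256];
    PfZkClient& zk = app_context.zk_client;
    snprintf(full_path, sizeof(full_path), "/pureflash/%s/stores/%d/trays",
             zk.cluster_name.c_str(), store_id);
    struct String_vector trays = {0};
    int rc = zoo_get_children(zk.zkhandle, full_path, 0, &trays);
    if (rc == ZNONODE)
        return 0;                       // nothing to clean up
    if (rc != ZOK)
        return rc;
    for (int i = 0; i < trays.count; ++i) {
        // delete the tray's child nodes (devname, capacity, ...) first
        char tray_full[384];
        snprintf(tray_full, sizeof(tray_full), "%s/%s", full_path, trays.data[i]);
        struct String_vector subs = {0};
        if (zoo_get_children(zk.zkhandle, tray_full, 0, &subs) == ZOK) {
            for (int j = 0; j < subs.count; ++j) {
                snprintf(rel_path, sizeof(rel_path), "stores/%d/trays/%s/%s",
                         store_id, trays.data[i], subs.data[j]);
                zk.delete_node(rel_path);
            }
            deallocate_String_vector(&subs);
        }
        // then the tray node itself
        snprintf(rel_path, sizeof(rel_path), "stores/%d/trays/%s", store_id, trays.data[i]);
        zk.delete_node(rel_path);
    }
    deallocate_String_vector(&trays);
    return 0;
}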
Code where pfstore modifies the relevant zk content
Trays are registered on the pfstore side,
in the register_tray method in PureFlash/pfs/src/pf_cluster.cpp:
int register_tray(int store_id, const uuid_t uuid, const char* devname, int64_t capacity, int64_t obj_size)
{
    char zk_node_name[128];
    char value_buf[128];
    char uuid_str[64];
    int rc;
    uuid_unparse(uuid, uuid_str);
    // create stores/<store_id>/trays/<uuid>
    snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s", store_id, uuid_str);
    if ((rc = app_context.zk_client.create_node(zk_node_name, false, NULL)) != ZOK)
        return rc;
    // .../devname
    snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s/devname", store_id, uuid_str);
    if ((rc = app_context.zk_client.create_node(zk_node_name, false, devname)) != ZOK)
        return rc;
    // .../capacity
    snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s/capacity", store_id, uuid_str);
    snprintf(value_buf, sizeof(value_buf), "%ld", capacity);
    if ((rc = app_context.zk_client.create_node(zk_node_name, false, value_buf)) != ZOK)
        return rc;
    // .../object_size
    snprintf(zk_node_name, sizeof(zk_node_name), "stores/%d/trays/%s/object_size", store_id, uuid_str);
    snprintf(value_buf, sizeof(value_buf), "%ld", obj_size);
    if ((rc = app_context.zk_client.create_node(zk_node_name, false, value_buf)) != ZOK)
        return rc;
    // finally mark the tray state as OK
    set_tray_state(store_id, uuid, "OK", true);
    return 0;
}
[Modification 1] Directly delete all of the node's tray information.
PureFlash/pfs/src/pf_main.cpp:
[Modification 1] Clean up stale shared-disk data before startup; delete all tray node data to clear out the old entries.
Idea for the added delete_old_shard_disk:
PureFlash/common/src/pf_zk_client.cpp:
Add a get_data_shared_disk_uuid function to zk_client; otherwise there seems to be no way to obtain the previous uuids.

std::vector<std::string> PfZkClient::get_data_shared_disk_uuid(void)
{
    std::vector<std::string> uuid_list; // vector holding all UUIDs
    int rc = 0;
    char zk_path[256];
    // build the parent path of the shared-disk UUIDs: /pureflash/<cluster>/shared_disks
    snprintf(zk_path, sizeof(zk_path), "/pureflash/%s/shared_disks", cluster_name.c_str());
    struct String_vector children = { 0 };
    rc = zoo_get_children(zkhandle, zk_path, 0, &children);
    if (rc != ZOK) {
        S5LOG_ERROR("Failed to get children on path:%s, rc:%d", zk_path, rc);
        return uuid_list;
    }
    DeferCall _a([&children]() { deallocate_String_vector(&children); });
    if (children.count <= 0) {
        S5LOG_INFO("No children under path:%s", zk_path);
        return uuid_list;
    }
    // append every child node (a UUID) to the vector
    for (int i = 0; i < children.count; ++i) {
        // the child node name is the UUID itself; convert to std::string and append
        uuid_list.emplace_back(children.data[i]);
    }
    S5LOG_INFO("Got %d shared disk UUIDs from ZK", (int)uuid_list.size());
    return uuid_list;
}
PureFlash/common/include/pf_zk_client.h: add the corresponding declaration
class PfZkClient {
public:
    PfZkClient() { zkhandle = NULL; }
    ~PfZkClient();
    int init(const char* zk_ip, int zk_timeout, const char* cluster_name);
    int create_node(const std::string& node_path, bool is_ephemeral, const char* node_data);
    int delete_node(const std::string& node_path);
    int wait_lock(const std::string& lock_path, const char* myid);
    std::string get_data_port(int store_id, int port_idx);
    std::vector<std::string> get_data_shared_disk_uuid(void); // newly added
    int watch_disk_owner(const char* disk_uuid, std::function<void(const char*)> on_new_owner);
    //members:
    zhandle_t *zkhandle;
    std::string cluster_name;
};
#endif //PUREFLASH_S5_ZK_CLIENT_H
PureFlash/pfs/src/pf_cluster.cpp:
int delete_old_shard_disk(int store_id)
{
    char zk_node_name[64];
    int rc = 0;
    // get all shared-disk UUIDs
    std::vector<std::string> all_uuids = app_context.zk_client.get_data_shared_disk_uuid();
    // iterate over them
    for (const std::string& uuid : all_uuids) {
        S5LOG_INFO("Shared disk UUID: %s", uuid.c_str());
        // follow-up action: delete this store_id's node under the UUID
        snprintf(zk_node_name, sizeof(zk_node_name), "shared_disks/%s/%d", uuid.c_str(), store_id);
        rc = app_context.zk_client.delete_node(zk_node_name);
    }
    return rc;
}
PureFlash/pfs/src/pf_main.cpp: in the main function, before iterating over the trays, add:

// delete existing trays & shard_disk
/* before startup, clean up the stale shared-disk data; delete all tray node data to clear out old entries */
int rc_delete;
rc_delete = delete_old_shard_disk(store_id);
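If the startup cleanup should also remove the stale tray nodes themselves (the option noted earlier), the hypothetical delete_all_trays sketch from above could be called at the same point; this placement is an assumption, not code from the repository:

rc_delete = delete_all_trays(store_id);  // hypothetical helper sketched earlier: wipes stores/<store_id>/trays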
Updating pfserver takes effect only after restarting pfstore.
- Build commands
cd /home/flyslice/yangxiao/cocalele/PureFlash
mkdir build_deb; cd build_deb && cmake -GNinja -DCMAKE_BUILD_TYPE=Debug -DCMAKE_MAKE_PROGRAM=/usr/bin/ninja .. && ninja
- Restart the services (the conductor service needs to be stopped first and restarted as well)
source /home/flyslice/yangxiao/cocalele/PureFlash/build_deb/env.sh
nohup pfs -c /etc/pureflash/pfs.conf &
source /home/flyslice/yangxiao/cocalele/jconductor/env-pfc.sh
nohup pfc -c /etc/pureflash/pfc.conf &