[CELEBORN-313] Add rest endpoint to show master group info

### What changes were proposed in this pull request?

<img width="1347" alt="image" src="https://github.com/apache/incubator-celeborn/assets/3898450/43d10bff-6878-4591-9461-889494d797f9">

### Why are the changes needed?

### Does this PR introduce _any_ user-facing change?
No.

### How was this patch tested?

```bash
./bin/celeborn-ratis sh -Draft.rpc.type=NETTY  group info   -peers clb-1:9872,clb-2:9873,clb-3:9874
```

```
group id: c5196f6d-2c34-3ed3-8b8a-47bede733167
leader info: 1(clb-1:9872)

[server {
  id: "1"
  address: "clb-1:9872"
  clientAddress: "clb-1:9097"
  startupRole: FOLLOWER
}
commitIndex: 316
, server {
  id: "2"
  address: "clb-2:9873"
  clientAddress: "clb-2:9098"
  startupRole: FOLLOWER
}
commitIndex: 316
, server {
  id: "3"
  address: "clb-3:9874"
  clientAddress: "clb-3:9099"
  startupRole: FOLLOWER
}
commitIndex: 316
]
```

```bash
curl http://clb-3:9983/masterGroupInfo
```

```
====================== Master Group INFO ==============================
group id: c5196f6d-2c34-3ed3-8b8a-47bede733167
leader info: 1(clb-1:9872)

[server {
  id: "3"
  address: "clb-3:9874"
  clientAddress: "clb-3:9099"
  startupRole: FOLLOWER
}
commitIndex: 316
, server {
  id: "1"
  address: "clb-1:9872"
  clientAddress: "clb-1:9097"
  startupRole: FOLLOWER
}
commitIndex: 316
, server {
  id: "2"
  address: "clb-2:9873"
  clientAddress: "clb-2:9098"
  startupRole: FOLLOWER
}
commitIndex: 316
]
```

Closes #1946 from cxzl25/CELEBORN-313.

Authored-by: sychen <sychen@ctrip.com>
Signed-off-by: Cheng Pan <chengpan@apache.org>
This commit is contained in:
sychen 2023-09-28 20:08:31 +08:00 committed by Cheng Pan
parent e4a60d15e4
commit 5310bcaf6b
No known key found for this signature in database
GPG Key ID: 8001952629BCC75D
5 changed files with 49 additions and 1 deletions

View File

@ -291,6 +291,7 @@ API path listed as below:
|-----------------------|-------------------------------------------------------------------------------------------------------------|
| /metrics/prometheus | List the metrics data in prometheus format of the master. |
| /conf | List the conf setting of the master. |
| /masterGroupInfo | List master group information of the service. It will list the LEADER and FOLLOWER information of all masters. |
| /workerInfo | List worker information of the service. It will list the information of all registered workers. |
| /lostWorkers | List all lost workers of the master. |
| /excludedWorkers | List all excluded workers of the master. |

View File

@ -483,7 +483,7 @@ public class HARaftServer {
}
}
private GroupInfoReply getGroupInfo() throws IOException {
public GroupInfoReply getGroupInfo() throws IOException {
GroupInfoRequest groupInfoRequest =
new GroupInfoRequest(clientId, raftPeerId, RAFT_GROUP_ID, nextCallId());
return server.getGroupInfo(groupInfoRequest);

View File

@ -28,6 +28,8 @@ import scala.collection.mutable
import scala.util.Random
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.ratis.proto.RaftProtos
import org.apache.ratis.proto.RaftProtos.RaftPeerRole
import org.apache.celeborn.common.CelebornConf
import org.apache.celeborn.common.client.MasterClient
@ -840,6 +842,13 @@ private[celeborn] class Master(
}.asJava
}
/**
 * Renders the master group report as plain text: a fixed banner line followed by
 * whatever `getMasterGroupInfoInternal()` produces.
 */
override def getMasterGroupInfo: String = {
  val banner = "====================== Master Group INFO ==============================\n"
  banner + getMasterGroupInfoInternal()
}
override def getWorkerInfo: String = {
val sb = new StringBuilder
sb.append("====================== Workers Info in Master =========================\n")
@ -940,6 +949,40 @@ private[celeborn] class Master(
isActive
}
/**
 * Builds the body of the master group report.
 *
 * When HA is enabled the result contains the Raft group id, the leader as seen by
 * this master (or "leader not found"), and the commit infos reported by the Ratis
 * server. When HA is disabled the fixed text "HA is not enabled" is returned.
 *
 * Any exception thrown while querying the Ratis server for the group info is
 * propagated to the caller.
 */
private def getMasterGroupInfoInternal(): String = {
  if (conf.haEnabled) {
    val sb = new StringBuilder
    val groupInfo = statusSystem.asInstanceOf[HAMasterMetaManager].getRatisServer.getGroupInfo
    sb.append(s"group id: ${groupInfo.getGroup.getGroupId.getUuid}\n")

    // Resolves the leader peer from this node's role info, if one is known.
    // Protobuf getters never return null (an unset message field yields its default
    // instance), so presence is checked with hasFollowerInfo and an empty peer id
    // is treated as "no leader known" instead of comparing against null.
    def getLeader(roleInfo: RaftProtos.RoleInfoProto): Option[RaftProtos.RaftPeerProto] = {
      Option(roleInfo).flatMap { info =>
        if (info.getRole == RaftPeerRole.LEADER) {
          // This node is the leader itself.
          Option(info.getSelf)
        } else if (info.hasFollowerInfo) {
          // A follower reports the leader it is currently following.
          Option(info.getFollowerInfo.getLeaderInfo.getId).filterNot(_.getId.isEmpty)
        } else {
          // Neither leader nor follower details are present in the role info.
          None
        }
      }
    }

    getLeader(groupInfo.getRoleInfoProto) match {
      case Some(leader) =>
        sb.append(s"leader info: ${leader.getId.toStringUtf8}(${leader.getAddress})\n\n")
      case None =>
        sb.append("leader not found\n")
    }
    sb.append(groupInfo.getCommitInfos)
    sb.append("\n")
    sb.toString()
  } else {
    "HA is not enabled"
  }
}
override def initialize(): Unit = {
super.initialize()
logInfo("Master started.")

View File

@ -41,6 +41,8 @@ abstract class HttpService extends Service with Logging {
sb.toString()
}
/**
 * Returns the master group report as text.
 *
 * The default implementation is unsupported and always throws
 * `UnsupportedOperationException`; a service that can report master group
 * information overrides it.
 */
def getMasterGroupInfo: String = {
  throw new UnsupportedOperationException()
}
/** Returns the worker information report of this service as text; concrete services must implement it. */
def getWorkerInfo: String
/**
 * Returns the lost-workers report as text.
 *
 * The default implementation is unsupported and always throws
 * `UnsupportedOperationException`; a service that tracks lost workers
 * (presumably the master -- confirm against the implementations) overrides it.
 */
def getLostWorkers: String = {
  throw new UnsupportedOperationException()
}

View File

@ -65,6 +65,8 @@ class HttpRequestHandler(
path match {
case "/conf" =>
service.getConf
case "/masterGroupInfo" if service.serviceName == Service.MASTER =>
service.getMasterGroupInfo
case "/workerInfo" =>
service.getWorkerInfo
case "/lostWorkers" if service.serviceName == Service.MASTER =>