[CELEBORN-313] Add rest endpoint to show master group info
### What changes were proposed in this pull request? <img width="1347" alt="image" src="https://github.com/apache/incubator-celeborn/assets/3898450/43d10bff-6878-4591-9461-889494d797f9"> ### Why are the changes needed? ### Does this PR introduce _any_ user-facing change? No. ### How was this patch tested? ```bash ./bin/celeborn-ratis sh -Draft.rpc.type=NETTY group info -peers clb-1:9872,clb-2:9873,clb-3:9874 ``` ``` group id: c5196f6d-2c34-3ed3-8b8a-47bede733167 leader info: 1(clb-1:9872) [server { id: "1" address: "clb-1:9872" clientAddress: "clb-1:9097" startupRole: FOLLOWER } commitIndex: 316 , server { id: "2" address: "clb-2:9873" clientAddress: "clb-2:9098" startupRole: FOLLOWER } commitIndex: 316 , server { id: "3" address: "clb-3:9874" clientAddress: "clb-3:9099" startupRole: FOLLOWER } commitIndex: 316 ] ``` ```bash curl http://clb-3:9983/masterGroupInfo ``` ``` ====================== Master Group INFO ============================== group id: c5196f6d-2c34-3ed3-8b8a-47bede733167 leader info: 1(clb-1:9872) [server { id: "3" address: "clb-3:9874" clientAddress: "clb-3:9099" startupRole: FOLLOWER } commitIndex: 316 , server { id: "1" address: "clb-1:9872" clientAddress: "clb-1:9097" startupRole: FOLLOWER } commitIndex: 316 , server { id: "2" address: "clb-2:9873" clientAddress: "clb-2:9098" startupRole: FOLLOWER } commitIndex: 316 ] ``` Closes #1946 from cxzl25/CELEBORN-313. Authored-by: sychen <sychen@ctrip.com> Signed-off-by: Cheng Pan <chengpan@apache.org>
This commit is contained in:
parent
e4a60d15e4
commit
5310bcaf6b
@ -291,6 +291,7 @@ API path listed as below:
|
||||
|-----------------------|-------------------------------------------------------------------------------------------------------------|
|
||||
| /metrics/prometheus | List the metrics data in prometheus format of the master. |
|
||||
| /conf | List the conf setting of the master. |
|
||||
| /masterGroupInfo | List master group information of the service. It will list all master's LEADER, FOLLOWER information. |
|
||||
| /workerInfo | List worker information of the service. It will list all registered workers 's information. |
|
||||
| /lostWorkers | List all lost workers of the master. |
|
||||
| /excludedWorkers | List all excluded workers of the master. |
|
||||
|
||||
@ -483,7 +483,7 @@ public class HARaftServer {
|
||||
}
|
||||
}
|
||||
|
||||
private GroupInfoReply getGroupInfo() throws IOException {
|
||||
public GroupInfoReply getGroupInfo() throws IOException {
|
||||
GroupInfoRequest groupInfoRequest =
|
||||
new GroupInfoRequest(clientId, raftPeerId, RAFT_GROUP_ID, nextCallId());
|
||||
return server.getGroupInfo(groupInfoRequest);
|
||||
|
||||
@ -28,6 +28,8 @@ import scala.collection.mutable
|
||||
import scala.util.Random
|
||||
|
||||
import org.apache.hadoop.fs.{FileSystem, Path}
|
||||
import org.apache.ratis.proto.RaftProtos
|
||||
import org.apache.ratis.proto.RaftProtos.RaftPeerRole
|
||||
|
||||
import org.apache.celeborn.common.CelebornConf
|
||||
import org.apache.celeborn.common.client.MasterClient
|
||||
@ -840,6 +842,13 @@ private[celeborn] class Master(
|
||||
}.asJava
|
||||
}
|
||||
|
||||
override def getMasterGroupInfo: String = {
|
||||
val sb = new StringBuilder
|
||||
sb.append("====================== Master Group INFO ==============================\n")
|
||||
sb.append(getMasterGroupInfoInternal())
|
||||
sb.toString()
|
||||
}
|
||||
|
||||
override def getWorkerInfo: String = {
|
||||
val sb = new StringBuilder
|
||||
sb.append("====================== Workers Info in Master =========================\n")
|
||||
@ -940,6 +949,40 @@ private[celeborn] class Master(
|
||||
isActive
|
||||
}
|
||||
|
||||
private def getMasterGroupInfoInternal(): String = {
|
||||
if (conf.haEnabled) {
|
||||
val sb = new StringBuilder
|
||||
val groupInfo = statusSystem.asInstanceOf[HAMasterMetaManager].getRatisServer.getGroupInfo
|
||||
sb.append(s"group id: ${groupInfo.getGroup.getGroupId.getUuid}\n")
|
||||
|
||||
def getLeader(roleInfo: RaftProtos.RoleInfoProto): RaftProtos.RaftPeerProto = {
|
||||
if (roleInfo == null) {
|
||||
return null
|
||||
}
|
||||
if (roleInfo.getRole == RaftPeerRole.LEADER) {
|
||||
return roleInfo.getSelf
|
||||
}
|
||||
val followerInfo = roleInfo.getFollowerInfo
|
||||
if (followerInfo == null) {
|
||||
return null
|
||||
}
|
||||
followerInfo.getLeaderInfo.getId
|
||||
}
|
||||
|
||||
val leader = getLeader(groupInfo.getRoleInfoProto)
|
||||
if (leader == null) {
|
||||
sb.append("leader not found\n")
|
||||
} else {
|
||||
sb.append(s"leader info: ${leader.getId.toStringUtf8}(${leader.getAddress})\n\n")
|
||||
}
|
||||
sb.append(groupInfo.getCommitInfos)
|
||||
sb.append("\n")
|
||||
sb.toString()
|
||||
} else {
|
||||
"HA is not enabled"
|
||||
}
|
||||
}
|
||||
|
||||
override def initialize(): Unit = {
|
||||
super.initialize()
|
||||
logInfo("Master started.")
|
||||
|
||||
@ -41,6 +41,8 @@ abstract class HttpService extends Service with Logging {
|
||||
sb.toString()
|
||||
}
|
||||
|
||||
def getMasterGroupInfo: String = throw new UnsupportedOperationException()
|
||||
|
||||
def getWorkerInfo: String
|
||||
|
||||
def getLostWorkers: String = throw new UnsupportedOperationException()
|
||||
|
||||
@ -65,6 +65,8 @@ class HttpRequestHandler(
|
||||
path match {
|
||||
case "/conf" =>
|
||||
service.getConf
|
||||
case "/masterGroupInfo" if service.serviceName == Service.MASTER =>
|
||||
service.getMasterGroupInfo
|
||||
case "/workerInfo" =>
|
||||
service.getWorkerInfo
|
||||
case "/lostWorkers" if service.serviceName == Service.MASTER =>
|
||||
|
||||
Loading…
Reference in New Issue
Block a user