FAQ Search Today's Posts Mark Forums Read
» Video Reviews

» Linux Archive

Linux-archive is a website aiming to archive linux email lists and to make them easily accessible for linux users/developers.


» Sponsor

» Partners

» Sponsor

Go Back   Linux Archive > Redhat > Cluster Development

 
 
LinkBack Thread Tools
 
Old 11-10-2010, 01:39 PM
Lon Hohberger
 
Default qdiskd: (STABLE31) Don't write evictions if allow_kill is off

Previously, qdisk master would write an eviction notice to disk
for a hung qdisk node even if allow_kill was off, causing the
other node to reboot.

This patch causes the qdisk master to write S_NONE as the state
of hung nodes on-disk when allow_kill is off instead of S_EVICT.

So, when the node wakes up, it will read the S_NONE state and
take action based on that state instead of reading S_EVICT and
rebooting.

Because there is so much internal qdiskd state which would need
to be fixed on a node which is in this state (including rejoining
the qdisk membership), the only clean method to continue
operations is to restart qdiskd.

Resolves: rhbz#602731

Signed-off-by: Lon Hohberger <lhh@redhat.com>
---
cman/qdisk/main.c | 80 +++++++++++++++++++++++++++++++++++++++++------------
1 files changed, 62 insertions(+), 18 deletions(-)

diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 6a9b821..f0b7a5f 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -128,17 +128,36 @@ node_info_init(node_info_t *ni, int max)
}


+static void
+reincarnate(void)
+{
+ char buf[PATH_MAX];
+ char cmd[PATH_MAX];
+
+ logt_print(LOG_CRIT, "Attempting to restart\n");
+
+ snprintf(buf, sizeof(buf), "/proc/%d/exe", getpid());
+ if (readlink(buf, cmd, sizeof(cmd)) < 0)
+ goto out_die;
+
+ execlp(cmd, cmd, NULL);
+out_die:
+ logt_print(LOG_CRIT, "Unable to restart; dying.\n");
+ exit(-1);
+}
+
+
/**
Check to see if someone tried to evict us but we were out to lunch.
Rare case; usually other nodes would put up the 'Undead' message and
re-evict us.
*/
-static void
+static int
check_self(qd_ctx *ctx, status_block_t *sb)
{
if (!sb->ps_updatenode ||
(sb->ps_updatenode == ctx->qc_my_id)) {
- return;
+ return 0;
}

/* I did not update this??! */
@@ -146,10 +165,16 @@ check_self(qd_ctx *ctx, status_block_t *sb)
case S_EVICT:
/* Someone told us to die. */
reboot(RB_AUTOBOOT);
+ case S_NONE:
+ return -1;
default:
- logt_print(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
- raise(SIGSTOP);
+ break;
}
+
+ logt_print(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
+ raise(SIGSTOP);
+
+ return -1;
}


@@ -179,9 +204,11 @@ read_node_blocks(qd_ctx *ctx, node_info_t *ni, int max)
swab_status_block_t(sb);

if (sb->ps_nodeid == ctx->qc_my_id) {
- check_self(ctx, sb);
+ if (check_self(ctx, sb) < 0)
+ reincarnate();
continue;
}
+
/* message. */
memcpy(&(ni[x].ni_last_msg), &(ni[x].ni_msg),
sizeof(ni[x].ni_last_msg));
@@ -297,17 +324,26 @@ check_transitions(qd_ctx *ctx, node_info_t *ni, int max, memb_mask_t mask)
Write eviction notice if we're the master.
*/
if (ctx->qc_status == S_MASTER) {
- logt_print(LOG_NOTICE,
- "Writing eviction notice for node %d\n",
- ni[x].ni_status.ps_nodeid);
- qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
- S_EVICT, NULL, NULL, NULL);
+
if (ctx->qc_flags & RF_ALLOW_KILL) {
+ logt_print(LOG_NOTICE,
+ "Writing eviction notice for node %d\n",
+ ni[x].ni_status.ps_nodeid);
+ qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+ S_EVICT, NULL, NULL, NULL);
logt_print(LOG_DEBUG, "Telling CMAN to "
"kill the node\n");
cman_kill_node(ctx->qc_cman_admin,
ni[x].ni_status.ps_nodeid);
+ } else {
+ logt_print(LOG_NOTICE,
+ "Node %d should be evicted, but "
+ "allow_kill is off\n",
+ ni[x].ni_status.ps_nodeid);
+ qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+ S_NONE, NULL, NULL, NULL);
}
+
}

/* Clear our master mask for the node after eviction */
@@ -332,20 +368,28 @@ check_transitions(qd_ctx *ctx, node_info_t *ni, int max, memb_mask_t mask)
logt_print(LOG_CRIT, "Node %d is undead.\n",
ni[x].ni_status.ps_nodeid);

- logt_print(LOG_ALERT,
- "Writing eviction notice (again) for node %d\n",
- ni[x].ni_status.ps_nodeid);
- qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
- S_EVICT, NULL, NULL, NULL);
- ni[x].ni_status.ps_state = S_EVICT;
-
- /* XXX Need to fence it again */
if (ctx->qc_flags & RF_ALLOW_KILL) {
+ logt_print(LOG_ALERT,
+ "Writing eviction notice (again) for node %d\n",
+ ni[x].ni_status.ps_nodeid);
+ qd_write_status(ctx, ni[x].ni_status.ps_nodeid,
+ S_EVICT, NULL, NULL, NULL);
+ ni[x].ni_status.ps_state = S_EVICT;
+
+ /* XXX Need to fence it again */
logt_print(LOG_DEBUG, "Telling CMAN to "
"kill the node\n");
cman_kill_node(ctx->qc_cman_admin,
ni[x].ni_status.ps_nodeid);
+ } else {
+ /* administrator doesn't care */
+ logt_print(LOG_DEBUG,
+ "Ignoring zombie node %d since "
+ "allow_kill is off\n",
+ ni[x].ni_status.ps_nodeid);
+ ni[x].ni_evil_incarnation = 0;
}
+
continue;
}

--
1.7.2.3
 

Thread Tools




All times are GMT. The time now is 09:54 PM.

VBulletin, Copyright ©2000 - 2014, Jelsoft Enterprises Ltd.
Content Relevant URLs by vBSEO ©2007, Crawlability, Inc.
Copyright 2007 - 2008, www.linux-archive.org