qdiskd: (STABLE31) Don't write evictions if allow_kill is off
Previously, the qdisk master would write an eviction notice to disk
for a hung qdisk node even if allow_kill was off, causing the
hung node to reboot as soon as it woke up and read the notice.
This patch causes the qdisk master to write S_NONE as the state
of hung nodes on-disk when allow_kill is off instead of S_EVICT.
So, when the node wakes up, it will read the S_NONE state and
take action based on that state instead of reading S_EVICT and
rebooting.
Because there is so much internal qdiskd state which would need
to be fixed on a node which is in this state (including rejoining
the qdisk membership), the only clean method to continue
operations is to restart qdiskd.
diff --git a/cman/qdisk/main.c b/cman/qdisk/main.c
index 6a9b821..f0b7a5f 100644
--- a/cman/qdisk/main.c
+++ b/cman/qdisk/main.c
@@ -128,17 +128,36 @@ node_info_init(node_info_t *ni, int max)
}
+/**
+  Restart qdiskd by exec'ing the binary that /proc/<pid>/exe points to.
+  The new image is started with argv[0] only.  Never returns: if the
+  link cannot be read or the exec fails, log the failure and exit(-1).
+ */
+static void
+reincarnate(void)
+{
+	char buf[PATH_MAX];
+	char cmd[PATH_MAX];
+	ssize_t len;
+
+	logt_print(LOG_CRIT, "Attempting to restart\n");
+
+	snprintf(buf, sizeof(buf), "/proc/%d/exe", getpid());
+
+	/*
+	 * readlink() does not NUL-terminate its output, so keep one byte
+	 * in reserve and terminate the path ourselves before exec'ing it.
+	 */
+	len = readlink(buf, cmd, sizeof(cmd) - 1);
+	if (len < 0)
+		goto out_die;
+	cmd[len] = '\0';
+
+	/* exec only returns on failure; fall through to the error path */
+	execlp(cmd, cmd, (char *)NULL);
+out_die:
+	logt_print(LOG_CRIT, "Unable to restart; dying.\n");
+	exit(-1);
+}
+
+
/**
Check to see if someone tried to evict us but we were out to lunch.
Rare case; usually other nodes would put up the 'Undead' message and
re-evict us.
*/
-static void
+static int
check_self(qd_ctx *ctx, status_block_t *sb)
{
if (!sb->ps_updatenode ||
(sb->ps_updatenode == ctx->qc_my_id)) {
- return;
+ return 0;
}
/* I did not update this??! */
@@ -146,10 +165,16 @@ check_self(qd_ctx *ctx, status_block_t *sb)
case S_EVICT:
/* Someone told us to die. */
reboot(RB_AUTOBOOT);
+ case S_NONE:
+ return -1;
default:
- logt_print(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
- raise(SIGSTOP);
+ break;
}
+
+ logt_print(LOG_EMERG, "Unhandled state: %d\n", sb->ps_state);
+ raise(SIGSTOP);
+
+ return -1;
}