[stgt] [PATCH] new timer-based work scheduling

FUJITA Tomonori fujita.tomonori at lab.ntt.co.jp
Mon Jan 17 11:54:49 CET 2011


On Mon, 17 Jan 2011 11:04:17 +0900
FUJITA Tomonori <fujita.tomonori at lab.ntt.co.jp> wrote:

> On Sun, 16 Jan 2011 18:24:59 +0200
> Alexander Nezhinsky <alexandern at Voltaire.COM> wrote:
> 
> > On 01/15/2011 01:14 PM, FUJITA Tomonori wrote:
> > >> Re-implementing the time-based work scheduler. This patch implements
> > >> a timer-based scheme.
> > 
> > > > You need this patch for a more precise timer, right? In other words, the
> > > > problem that you are trying to fix is that the current timer is too
> > > > unpredictable.
> > 
> > Yes, the jiffies mechanism relied on the timeout values passed to 
> > > epoll_wait(), but the events could pop up before the timeout has expired.
> > So under load, the intervals between jiffies counts are practically zero.
> > On the other hand, event handlers can take indefinite times, but this
> > would be undetected by the jiffies count.
> > 
> > In the new scheme, the timer never "underflows" - at least the requested
> > > interval is always guaranteed. Other event handlers can still delay
> > the actual work scheduling, but after they return all overdue work units
> > are submitted for execution immediately.
> > 
> > The underlying timer fires every 250ms, setting the expected accuracy of
> > scheduling. I guess that all event handlers practically complete well
> > within this interval, so we can fairly count on delays bounded by, 
> > let's say, ~300ms.
> 
> Even if the timer fires every 250ms, there is no guarantee that the
> file descriptor is handled immediately, right?
> 
> Adding the better timer is fine by me. Can we use timerfd instead of
> using another thread? timerfd is exactly what we want here.
> 
> Do we still need to care about kernels that are too old to support
> timerfd?

Here's an example of how to use timerfd.

We need to keep the old jiffies code for old kernels and non-Linux
systems, but timerfd-capable kernels can enjoy timerfd.


diff --git a/usr/tgtd.c b/usr/tgtd.c
index 2fd4959..13ff65c 100644
--- a/usr/tgtd.c
+++ b/usr/tgtd.c
@@ -337,7 +337,7 @@ static void event_loop(void)
 
 retry:
 	sched_remains = tgt_exec_scheduled();
-	timeout = sched_remains ? 0 : TGTD_TICK_PERIOD * 1000;
+	timeout = sched_remains ? 0 : -1;
 
 	nevent = epoll_wait(ep_fd, events, ARRAY_SIZE(events), timeout);
 	if (nevent < 0) {
@@ -350,8 +350,7 @@ retry:
 			tev = (struct event_data *) events[i].data.ptr;
 			tev->handler(tev->fd, events[i].events, tev->data);
 		}
-	} else
-		schedule();
+	}
 
 	if (system_active)
 		goto retry;
@@ -517,12 +516,18 @@ int main(int argc, char **argv)
 		}
 	}
 
+	err = work_timer_start();
+	if (err)
+		exit(1);
+
 	bs_init();
 
 	event_loop();
 
 	lld_exit();
 
+	work_timer_stop();
+
 	ipc_exit();
 
 	log_close();
diff --git a/usr/work.c b/usr/work.c
index 3080a59..2373e78 100644
--- a/usr/work.c
+++ b/usr/work.c
@@ -1,8 +1,9 @@
 /*
- * bogus scheduler
+ * work scheduler, loosely timer-based
  *
  * Copyright (C) 2006-2007 FUJITA Tomonori <tomof at acm.org>
  * Copyright (C) 2006-2007 Mike Christie <michaelc at cs.wisc.edu>
+ * Copyright (C) 2011 Alexander Nezhinsky <alexandern at voltaire.com>
  *
  * This program is free software; you can redistribute it and/or
  * modify it under the terms of the GNU General Public License as
@@ -21,27 +22,107 @@
  */
 #include <stdlib.h>
 #include <stdint.h>
+#include <sys/epoll.h>
+#include <sys/timerfd.h>
 
 #include "list.h"
 #include "util.h"
 #include "log.h"
 #include "work.h"
+#include "tgtd.h"
+
+#define WORK_TIMER_INT_MSEC     250
+#define WORK_TIMER_INT_NSEC     (WORK_TIMER_INT_MSEC * 1000 * 1000)
+
+static int timer_fd = -1;
+static unsigned int current_time;
 
-static unsigned int jiffies;
 static LIST_HEAD(active_work_list);
 static LIST_HEAD(inactive_work_list);
 
+static void execute_work(void);
+
+static void work_timer_evt_handler(int fd, int events, void *data)
+{
+	unsigned long long s;
+	static int first;
+	int ret;
+
+	if (!first) {
+		first++;
+		return;
+	}
+
+	ret = read(timer_fd, &s, sizeof(s));
+	if (ret < 0) {
+		if (ret == -EAGAIN)
+			return;
+		eprintf("Failed to read from pipe, %m\n");
+		return;
+	}
+
+	current_time += s * WORK_TIMER_INT_MSEC;
+
+	execute_work();
+}
+
+int work_timer_start(void)
+{
+	struct itimerspec new, old;
+	int ret;
+
+	if (timer_fd != -1)
+		return 0;
+
+	timer_fd = timerfd_create(CLOCK_REALTIME, TFD_NONBLOCK);
+	if (timer_fd < 0) {
+		eprintf("the system doesn't support timerfd");
+		goto timer_err;
+	}
+
+	new.it_value.tv_sec = 0;
+	new.it_value.tv_nsec = 1;
+
+	new.it_interval.tv_sec = 0;
+	new.it_interval.tv_nsec = WORK_TIMER_INT_NSEC;
+
+	ret = timerfd_settime(timer_fd, TFD_TIMER_ABSTIME, &new, &old);
+	if (ret < 0) {
+		eprintf("the system doesn't support timerfd");
+		close(timer_fd);
+		goto timer_err;
+	}
+
+	ret = tgt_event_add(timer_fd, EPOLLIN,
+			    work_timer_evt_handler, NULL);
+	if (ret) {
+		eprintf("failed to add timer event, fd:%d\n", timer_fd);
+		goto timer_err;
+	}
+
+	return 0;
+timer_err:
+	work_timer_stop();
+	return -1;
+}
+
+int work_timer_stop(void)
+{
+	if (timer_fd == -1)
+		return 0;
+
+	tgt_event_del(timer_fd);
+	close(timer_fd);
+
+	return 0;
+}
+
 void add_work(struct tgt_work *work, unsigned int second)
 {
-	unsigned int when;
 	struct tgt_work *ent;
 
 	if (second) {
-		when = second / TGTD_TICK_PERIOD;
-		if (!when)
-			when = 1;
-
-		work->when = when + jiffies;
+		work->when = current_time + second * 1000;
 
 		list_for_each_entry(ent, &inactive_work_list, entry) {
 			if (before(work->when, ent->when))
@@ -49,8 +130,10 @@ void add_work(struct tgt_work *work, unsigned int second)
 		}
 
 		list_add_tail(&work->entry, &ent->entry);
-	} else
+	} else {
 		list_add_tail(&work->entry, &active_work_list);
+		execute_work();
+	}
 }
 
 void del_work(struct tgt_work *work)
@@ -58,20 +141,16 @@ void del_work(struct tgt_work *work)
 	list_del_init(&work->entry);
 }
 
-/*
- * this function is called only when the system is idle. So this
- * scheduler is pretty bogus. Your job would be delayed unexpectedly.
- */
-void schedule(void)
+static void execute_work()
 {
 	struct tgt_work *work, *n;
 
 	list_for_each_entry_safe(work, n, &inactive_work_list, entry) {
-		if (after(jiffies, work->when)) {
-			list_del(&work->entry);
-			list_add_tail(&work->entry, &active_work_list);
-		} else
+		if (before(current_time, work->when))
 			break;
+
+		list_del(&work->entry);
+		list_add_tail(&work->entry, &active_work_list);
 	}
 
 	while (!list_empty(&active_work_list)) {
@@ -80,6 +159,4 @@ void schedule(void)
 		list_del_init(&work->entry);
 		work->func(work->data);
 	}
-
-	jiffies++;
 }
diff --git a/usr/work.h b/usr/work.h
index 3d5e75e..7b1876a 100644
--- a/usr/work.h
+++ b/usr/work.h
@@ -1,8 +1,6 @@
 #ifndef __SCHED_H
 #define __SCHED_H
 
-#define TGTD_TICK_PERIOD 2
-
 struct tgt_work {
 	struct list_head entry;
 	void (*func)(void *);
@@ -10,7 +8,9 @@ struct tgt_work {
 	unsigned int when;
 };
 
-extern void schedule(void);
+extern int work_timer_start(void);
+extern int work_timer_stop(void);
+
 extern void add_work(struct tgt_work *work, unsigned int second);
 extern void del_work(struct tgt_work *work);
 
--
To unsubscribe from this list: send the line "unsubscribe stgt" in
the body of a message to majordomo at vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html



More information about the stgt mailing list