@Pacifist117
Created September 21, 2013 15:32
Makefile | 4 +-
arch/arm/Kconfig | 8 +
arch/arm/include/asm/timex.h | 2 +
arch/arm/include/asm/unistd.h | 3 +
arch/arm/kernel/calls.S | 12 +
arch/arm/kernel/smp.c | 4 +
arch/arm/mach-realview/include/mach/timex.h | 27 +
arch/x86/Kconfig | 8 +
arch/x86/include/asm/entry_arch.h | 1 +
arch/x86/include/asm/feather_trace.h | 17 +
arch/x86/include/asm/feather_trace_32.h | 115 ++
arch/x86/include/asm/feather_trace_64.h | 124 ++
arch/x86/include/asm/hw_irq.h | 3 +
arch/x86/include/asm/irq_vectors.h | 7 +
arch/x86/include/asm/processor.h | 4 +
arch/x86/include/asm/unistd_32.h | 6 +-
arch/x86/include/asm/unistd_64.h | 4 +
arch/x86/kernel/Makefile | 2 +
arch/x86/kernel/cpu/intel_cacheinfo.c | 17 +
arch/x86/kernel/entry_64.S | 2 +
arch/x86/kernel/ft_event.c | 118 ++
arch/x86/kernel/irqinit.c | 3 +
arch/x86/kernel/smp.c | 29 +
arch/x86/kernel/syscall_table_32.S | 12 +
fs/exec.c | 13 +-
fs/inode.c | 2 +
include/linux/fs.h | 21 +-
include/linux/hardirq.h | 4 +
include/linux/hrtimer.h | 32 +
include/linux/sched.h | 19 +-
include/linux/smp.h | 5 +
include/linux/tick.h | 5 +
include/litmus/affinity.h | 80 ++
include/litmus/bheap.h | 77 ++
include/litmus/binheap.h | 206 ++++
include/litmus/budget.h | 35 +
include/litmus/clustered.h | 44 +
include/litmus/debug_trace.h | 37 +
include/litmus/edf_common.h | 25 +
include/litmus/fdso.h | 77 ++
include/litmus/feather_buffer.h | 94 ++
include/litmus/feather_trace.h | 65 +
include/litmus/fp_common.h | 105 ++
include/litmus/fpmath.h | 147 +++
include/litmus/ftdev.h | 55 +
include/litmus/jobs.h | 9 +
include/litmus/litmus.h | 300 +++++
include/litmus/litmus_proc.h | 25 +
include/litmus/locking.h | 28 +
include/litmus/preempt.h | 164 +++
include/litmus/rt_domain.h | 182 +++
include/litmus/rt_param.h | 253 ++++
include/litmus/sched_plugin.h | 113 ++
include/litmus/sched_trace.h | 259 ++++
include/litmus/srp.h | 28 +
include/litmus/trace.h | 145 +++
include/litmus/trace_irq.h | 14 +
include/litmus/unistd_32.h | 21 +
include/litmus/unistd_64.h | 33 +
include/litmus/wait.h | 57 +
include/trace/events/litmus.h | 231 ++++
kernel/exit.c | 4 +
kernel/fork.c | 7 +
kernel/hrtimer.c | 95 ++
kernel/printk.c | 14 +-
kernel/sched.c | 152 +++-
kernel/sched_fair.c | 3 +
kernel/sched_rt.c | 15 +-
kernel/softirq.c | 3 +
kernel/time/tick-sched.c | 47 +
litmus/Kconfig | 282 +++++
litmus/Makefile | 32 +
litmus/affinity.c | 42 +
litmus/bheap.c | 314 +++++
litmus/binheap.c | 388 ++++++
litmus/budget.c | 113 ++
litmus/clustered.c | 111 ++
litmus/ctrldev.c | 160 +++
litmus/edf_common.c | 200 ++++
litmus/fdso.c | 305 +++++
litmus/fp_common.c | 119 ++
litmus/ft_event.c | 43 +
litmus/ftdev.c | 446 +++++++
litmus/jobs.c | 57 +
litmus/litmus.c | 593 ++++++++++
litmus/litmus_proc.c | 347 ++++++
litmus/locking.c | 188 +++
litmus/preempt.c | 137 +++
litmus/rt_domain.c | 349 ++++++
litmus/sched_cedf.c | 856 ++++++++++++++
litmus/sched_gsn_edf.c | 1022 ++++++++++++++++
litmus/sched_litmus.c | 330 ++++++
litmus/sched_pfair.c | 1074 +++++++++++++++++
litmus/sched_pfp.c | 1709 +++++++++++++++++++++++++++
litmus/sched_plugin.c | 227 ++++
litmus/sched_psn_edf.c | 653 ++++++++++
litmus/sched_task_trace.c | 241 ++++
litmus/sched_trace.c | 252 ++++
litmus/srp.c | 295 +++++
litmus/sync.c | 152 +++
litmus/trace.c | 300 +++++
101 files changed, 15179 insertions(+), 40 deletions(-)
diff --git a/Makefile b/Makefile
index 6a5bdad..a327725 100644
--- a/Makefile
+++ b/Makefile
@@ -1,7 +1,7 @@
VERSION = 3
PATCHLEVEL = 0
SUBLEVEL = 0
-EXTRAVERSION =
+EXTRAVERSION =-litmus
NAME = Sneaky Weasel
# *DOCUMENTATION*
@@ -708,7 +708,7 @@ export mod_strip_cmd
ifeq ($(KBUILD_EXTMOD),)
-core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/
+core-y += kernel/ mm/ fs/ ipc/ security/ crypto/ block/ litmus/
vmlinux-dirs := $(patsubst %/,%,$(filter %/, $(init-y) $(init-m) \
$(core-y) $(core-m) $(drivers-y) $(drivers-m) \
diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig
index 9adc278..fb228ea 100644
--- a/arch/arm/Kconfig
+++ b/arch/arm/Kconfig
@@ -2040,3 +2040,11 @@ source "security/Kconfig"
source "crypto/Kconfig"
source "lib/Kconfig"
+
+config ARCH_HAS_SEND_PULL_TIMERS
+ def_bool n
+
+config ARCH_HAS_FEATHER_TRACE
+ def_bool n
+
+source "litmus/Kconfig"
diff --git a/arch/arm/include/asm/timex.h b/arch/arm/include/asm/timex.h
index 3be8de3..8a102a3 100644
--- a/arch/arm/include/asm/timex.h
+++ b/arch/arm/include/asm/timex.h
@@ -16,9 +16,11 @@
typedef unsigned long cycles_t;
+#ifndef get_cycles
static inline cycles_t get_cycles (void)
{
return 0;
}
+#endif
#endif
diff --git a/arch/arm/include/asm/unistd.h b/arch/arm/include/asm/unistd.h
index 2c04ed5..0196edf 100644
--- a/arch/arm/include/asm/unistd.h
+++ b/arch/arm/include/asm/unistd.h
@@ -403,6 +403,9 @@
#define __NR_sendmmsg (__NR_SYSCALL_BASE+374)
#define __NR_setns (__NR_SYSCALL_BASE+375)
+#define __NR_LITMUS (__NR_SYSCALL_BASE+376)
+#include <litmus/unistd_32.h>
+
/*
* The following SWIs are ARM private.
*/
diff --git a/arch/arm/kernel/calls.S b/arch/arm/kernel/calls.S
index 80f7896..ed2ae93 100644
--- a/arch/arm/kernel/calls.S
+++ b/arch/arm/kernel/calls.S
@@ -385,6 +385,18 @@
CALL(sys_syncfs)
CALL(sys_sendmmsg)
/* 375 */ CALL(sys_setns)
+ CALL(sys_set_rt_task_param)
+ CALL(sys_get_rt_task_param)
+ CALL(sys_complete_job)
+ CALL(sys_od_open)
+/* 380 */ CALL(sys_od_close)
+ CALL(sys_litmus_lock)
+ CALL(sys_litmus_unlock)
+ CALL(sys_query_job_no)
+ CALL(sys_wait_for_job_release)
+/* 385 */ CALL(sys_wait_for_ts_release)
+ CALL(sys_release_ts)
+ CALL(sys_null_call)
#ifndef syscalls_counted
.equ syscalls_padding, ((NR_syscalls + 3) & ~3) - NR_syscalls
#define syscalls_counted
diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c
index e7f92a4..5a57429 100644
--- a/arch/arm/kernel/smp.c
+++ b/arch/arm/kernel/smp.c
@@ -40,6 +40,8 @@
#include <asm/ptrace.h>
#include <asm/localtimer.h>
+#include <litmus/preempt.h>
+
/*
* as from 2.5, kernels no longer have an init_tasks structure
* so we need some other way of telling a new secondary core
@@ -572,6 +574,8 @@ asmlinkage void __exception_irq_entry do_IPI(int ipinr, struct pt_regs *regs)
break;
case IPI_RESCHEDULE:
+ /* LITMUS^RT: take action based on scheduler state */
+ sched_state_ipi();
scheduler_ipi();
break;
diff --git a/arch/arm/mach-realview/include/mach/timex.h b/arch/arm/mach-realview/include/mach/timex.h
index 4eeb069..e8bcc40 100644
--- a/arch/arm/mach-realview/include/mach/timex.h
+++ b/arch/arm/mach-realview/include/mach/timex.h
@@ -21,3 +21,30 @@
*/
#define CLOCK_TICK_RATE (50000000 / 16)
+
+#if defined(CONFIG_MACH_REALVIEW_PB11MP) || defined(CONFIG_MACH_REALVIEW_PB1176)
+
+static inline unsigned long realview_get_arm11_cp15_ccnt(void)
+{
+ unsigned long cycles;
+ /* Read CP15 CCNT register. */
+ asm volatile ("mrc p15, 0, %0, c15, c12, 1" : "=r" (cycles));
+ return cycles;
+}
+
+#define get_cycles realview_get_arm11_cp15_ccnt
+
+#elif defined(CONFIG_MACH_REALVIEW_PBA8)
+
+
+static inline unsigned long realview_get_a8_cp15_ccnt(void)
+{
+ unsigned long cycles;
+ /* Read CP15 CCNT register. */
+ asm volatile ("mrc p15, 0, %0, c9, c13, 0" : "=r" (cycles));
+ return cycles;
+}
+
+#define get_cycles realview_get_a8_cp15_ccnt
+
+#endif
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 37357a5..9f5e143 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -2166,3 +2166,11 @@ source "crypto/Kconfig"
source "arch/x86/kvm/Kconfig"
source "lib/Kconfig"
+
+config ARCH_HAS_FEATHER_TRACE
+ def_bool y
+
+config ARCH_HAS_SEND_PULL_TIMERS
+ def_bool y
+
+source "litmus/Kconfig"
diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h
index 1cd6d26..3b0d7ef 100644
--- a/arch/x86/include/asm/entry_arch.h
+++ b/arch/x86/include/asm/entry_arch.h
@@ -13,6 +13,7 @@
BUILD_INTERRUPT(reschedule_interrupt,RESCHEDULE_VECTOR)
BUILD_INTERRUPT(call_function_interrupt,CALL_FUNCTION_VECTOR)
BUILD_INTERRUPT(call_function_single_interrupt,CALL_FUNCTION_SINGLE_VECTOR)
+BUILD_INTERRUPT(pull_timers_interrupt,PULL_TIMERS_VECTOR)
BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR)
BUILD_INTERRUPT(reboot_interrupt,REBOOT_VECTOR)
diff --git a/arch/x86/include/asm/feather_trace.h b/arch/x86/include/asm/feather_trace.h
new file mode 100644
index 0000000..4fd3163
--- /dev/null
+++ b/arch/x86/include/asm/feather_trace.h
@@ -0,0 +1,17 @@
+#ifndef _ARCH_FEATHER_TRACE_H
+#define _ARCH_FEATHER_TRACE_H
+
+#include <asm/msr.h>
+
+static inline unsigned long long ft_timestamp(void)
+{
+ return __native_read_tsc();
+}
+
+#ifdef CONFIG_X86_32
+#include "feather_trace_32.h"
+#else
+#include "feather_trace_64.h"
+#endif
+
+#endif
diff --git a/arch/x86/include/asm/feather_trace_32.h b/arch/x86/include/asm/feather_trace_32.h
new file mode 100644
index 0000000..75e81a9
--- /dev/null
+++ b/arch/x86/include/asm/feather_trace_32.h
@@ -0,0 +1,115 @@
+/* Copyright (c) 2007-2012 Björn Brandenburg, <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Do not directly include this file. Include feather_trace.h instead */
+
+#define feather_callback __attribute__((regparm(3))) __attribute__((used))
+
+/*
+ * Make the compiler reload any register that is not saved in a cdecl function
+ * call (minus the registers that we explicitly clobber as output registers).
+ */
+#define __FT_CLOBBER_LIST0 "memory", "cc", "eax", "edx", "ecx"
+#define __FT_CLOBBER_LIST1 "memory", "cc", "eax", "ecx"
+#define __FT_CLOBBER_LIST2 "memory", "cc", "eax"
+#define __FT_CLOBBER_LIST3 "memory", "cc", "eax"
+
+#define __FT_TMP1(x) "=d" (x)
+#define __FT_ARG1(x) "0" ((long) (x))
+#define __FT_TMP2(x) "=c" (x)
+#define __FT_ARG2(x) "1" ((long) (x))
+
+#define __FT_ARG3(x) "r" ((long) (x))
+
+#define ft_event(id, callback) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " call " #callback " \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : : __FT_CLOBBER_LIST0)
+
+#define ft_event0(id, callback) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movl $" #id ", %%eax \n\t" \
+ " call " #callback " \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : : : __FT_CLOBBER_LIST0)
+
+#define ft_event1(id, callback, param) \
+ do { \
+ long __ft_tmp1; \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movl $" #id ", %%eax \n\t" \
+ " call " #callback " \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : __FT_TMP1(__ft_tmp1) \
+ : __FT_ARG1(param) \
+ : __FT_CLOBBER_LIST1); \
+ } while (0);
+
+#define ft_event2(id, callback, param, param2) \
+ do { \
+ long __ft_tmp1, __ft_tmp2; \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movl $" #id ", %%eax \n\t" \
+ " call " #callback " \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \
+ : __FT_ARG1(param), __FT_ARG2(param2) \
+ : __FT_CLOBBER_LIST2); \
+ } while (0);
+
+
+#define ft_event3(id, callback, param, param2, param3) \
+ do { \
+ long __ft_tmp1, __ft_tmp2; \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " subl $4, %%esp \n\t" \
+ " movl $" #id ", %%eax \n\t" \
+ " movl %2, (%%esp) \n\t" \
+ " call " #callback " \n\t" \
+ " addl $4, %%esp \n\t" \
+ ".section __event_table, \"aw\" \n\t" \
+ ".long " #id ", 0, 1b, 2f \n\t" \
+ ".previous \n\t" \
+ "2: \n\t" \
+ : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \
+ : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3) \
+ : __FT_CLOBBER_LIST3); \
+ } while (0);
diff --git a/arch/x86/include/asm/feather_trace_64.h b/arch/x86/include/asm/feather_trace_64.h
new file mode 100644
index 0000000..5ce49e2
--- /dev/null
+++ b/arch/x86/include/asm/feather_trace_64.h
@@ -0,0 +1,124 @@
+/* Copyright (c) 2010 Andrea Bastoni, <[email protected]>
+ * Copyright (c) 2012 Björn Brandenburg, <[email protected]>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* Do not directly include this file. Include feather_trace.h instead */
+
+/* regparm is the default on x86_64 */
+#define feather_callback __attribute__((used))
+
+#define __FT_EVENT_TABLE(id,from,to) \
+ ".section __event_table, \"aw\"\n\t" \
+ ".balign 8\n\t" \
+ ".quad " #id ", 0, " #from ", " #to " \n\t" \
+ ".previous \n\t"
+
+/*
+ * x86_64 caller only owns rbp, rbx, r12-r15;
+ * the callee can freely modify the others.
+ */
+#define __FT_CLOBBER_LIST0 "memory", "cc", "rdi", "rsi", "rdx", "rcx", \
+ "r8", "r9", "r10", "r11", "rax"
+
+#define __FT_CLOBBER_LIST1 "memory", "cc", "rdi", "rdx", "rcx", \
+ "r8", "r9", "r10", "r11", "rax"
+
+#define __FT_CLOBBER_LIST2 "memory", "cc", "rdi", "rcx", \
+ "r8", "r9", "r10", "r11", "rax"
+
+#define __FT_CLOBBER_LIST3 "memory", "cc", "rdi", \
+ "r8", "r9", "r10", "r11", "rax"
+
+/* The registers RDI, RSI, RDX, RCX, R8 and R9 are used for integer and pointer
+ * arguments. */
+
+/* RSI */
+#define __FT_TMP1(x) "=S" (x)
+#define __FT_ARG1(x) "0" ((long) (x))
+
+/* RDX */
+#define __FT_TMP2(x) "=d" (x)
+#define __FT_ARG2(x) "1" ((long) (x))
+
+/* RCX */
+#define __FT_TMP3(x) "=c" (x)
+#define __FT_ARG3(x) "2" ((long) (x))
+
+#define ft_event(id, callback) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " call " #callback " \n\t" \
+ __FT_EVENT_TABLE(id,1b,2f) \
+ "2: \n\t" \
+ : : : __FT_CLOBBER_LIST0)
+
+#define ft_event0(id, callback) \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movq $" #id ", %%rdi \n\t" \
+ " call " #callback " \n\t" \
+ __FT_EVENT_TABLE(id,1b,2f) \
+ "2: \n\t" \
+ : : : __FT_CLOBBER_LIST0)
+
+#define ft_event1(id, callback, param) \
+ do { \
+ long __ft_tmp1; \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movq $" #id ", %%rdi \n\t" \
+ " call " #callback " \n\t" \
+ __FT_EVENT_TABLE(id,1b,2f) \
+ "2: \n\t" \
+ : __FT_TMP1(__ft_tmp1) \
+ : __FT_ARG1(param) \
+ : __FT_CLOBBER_LIST1); \
+ } while (0);
+
+#define ft_event2(id, callback, param, param2) \
+ do { \
+ long __ft_tmp1, __ft_tmp2; \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movq $" #id ", %%rdi \n\t" \
+ " call " #callback " \n\t" \
+ __FT_EVENT_TABLE(id,1b,2f) \
+ "2: \n\t" \
+ : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2) \
+ : __FT_ARG1(param), __FT_ARG2(param2) \
+ : __FT_CLOBBER_LIST2); \
+ } while (0);
+
+#define ft_event3(id, callback, param, param2, param3) \
+ do { \
+ long __ft_tmp1, __ft_tmp2, __ft_tmp3; \
+ __asm__ __volatile__( \
+ "1: jmp 2f \n\t" \
+ " movq $" #id ", %%rdi \n\t" \
+ " call " #callback " \n\t" \
+ __FT_EVENT_TABLE(id,1b,2f) \
+ "2: \n\t" \
+ : __FT_TMP1(__ft_tmp1), __FT_TMP2(__ft_tmp2), __FT_TMP3(__ft_tmp3) \
+ : __FT_ARG1(param), __FT_ARG2(param2), __FT_ARG3(param3) \
+ : __FT_CLOBBER_LIST3); \
+ } while (0);
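
For illustration only (this sketch is not part of the patch; the event id 170, example_event_handler(), and example_hot_path() are made up): the ft_event*() macros plant a two-byte jump over a call into a feather_callback function and record the call site in the __event_table section, so that ft_enable_event()/ft_disable_event() (arch/x86/kernel/ft_event.c below) can patch the jump at runtime.

    #include <linux/smp.h>
    #include <litmus/feather_trace.h>

    /* Invoked only while event 170 is enabled; on x86_64 the event id is
     * passed in %rdi and the extra parameter in %rsi (cf. __FT_ARG1). */
    feather_callback void example_event_handler(unsigned long id, unsigned long cpu)
    {
            /* e.g. record ft_timestamp() and cpu in a trace buffer */
    }

    static void example_hot_path(void)
    {
            /* Emits "1: jmp 2f; movq $170, %rdi; call example_event_handler; 2:"
             * plus an __event_table entry for id 170 covering the 1b..2f range.
             * While disabled, the jmp skips the call; ft_enable_event(170)
             * rewrites the jmp displacement to 0 so control falls into the call.
             * The id must be a literal constant because the macro stringifies
             * it into the inline asm. */
            ft_event1(170, example_event_handler, raw_smp_processor_id());
    }
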
diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h
index bb9efe8..c490d89 100644
--- a/arch/x86/include/asm/hw_irq.h
+++ b/arch/x86/include/asm/hw_irq.h
@@ -77,6 +77,8 @@ extern void threshold_interrupt(void);
extern void call_function_interrupt(void);
extern void call_function_single_interrupt(void);
+extern void pull_timers_interrupt(void);
+
/* IOAPIC */
#define IO_APIC_IRQ(x) (((x) >= NR_IRQS_LEGACY) || ((1<<(x)) & io_apic_irqs))
extern unsigned long io_apic_irqs;
@@ -155,6 +157,7 @@ extern asmlinkage void smp_irq_move_cleanup_interrupt(void);
extern void smp_reschedule_interrupt(struct pt_regs *);
extern void smp_call_function_interrupt(struct pt_regs *);
extern void smp_call_function_single_interrupt(struct pt_regs *);
+extern void smp_pull_timers_interrupt(struct pt_regs *);
#ifdef CONFIG_X86_32
extern void smp_invalidate_interrupt(struct pt_regs *);
#else
diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h
index 6e976ee..99a44cf 100644
--- a/arch/x86/include/asm/irq_vectors.h
+++ b/arch/x86/include/asm/irq_vectors.h
@@ -135,6 +135,13 @@
#define INVALIDATE_TLB_VECTOR_START \
(INVALIDATE_TLB_VECTOR_END-NUM_INVALIDATE_TLB_VECTORS+1)
+/*
+ * LITMUS^RT pull timers IRQ vector
+ * Make sure it's below the above max 32 vectors.
+ */
+#define PULL_TIMERS_VECTOR 0xce
+
+
#define NR_VECTORS 256
#define FPU_IRQ 13
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 2193715..b844edc 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -166,6 +166,10 @@ extern void print_cpu_info(struct cpuinfo_x86 *);
extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c);
extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c);
extern unsigned short num_cache_leaves;
+#ifdef CONFIG_SYSFS
+extern int get_shared_cpu_map(cpumask_var_t mask,
+ unsigned int cpu, int index);
+#endif
extern void detect_extended_topology(struct cpuinfo_x86 *c);
extern void detect_ht(struct cpuinfo_x86 *c);
diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h
index 593485b3..2f6e127 100644
--- a/arch/x86/include/asm/unistd_32.h
+++ b/arch/x86/include/asm/unistd_32.h
@@ -353,9 +353,13 @@
#define __NR_sendmmsg 345
#define __NR_setns 346
+#define __NR_LITMUS 347
+
+#include "litmus/unistd_32.h"
+
#ifdef __KERNEL__
-#define NR_syscalls 347
+#define NR_syscalls 347 + NR_litmus_syscalls
#define __ARCH_WANT_IPC_PARSE_VERSION
#define __ARCH_WANT_OLD_READDIR
diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h
index 705bf13..e347f07 100644
--- a/arch/x86/include/asm/unistd_64.h
+++ b/arch/x86/include/asm/unistd_64.h
@@ -682,6 +682,10 @@ __SYSCALL(__NR_sendmmsg, sys_sendmmsg)
#define __NR_setns 308
__SYSCALL(__NR_setns, sys_setns)
+#define __NR_LITMUS 309
+
+#include "litmus/unistd_64.h"
+
#ifndef __NO_STUBS
#define __ARCH_WANT_OLD_READDIR
#define __ARCH_WANT_OLD_STAT
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 90b06d4..d727f8f 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -116,6 +116,8 @@ obj-$(CONFIG_X86_CHECK_BIOS_CORRUPTION) += check.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
obj-$(CONFIG_OF) += devicetree.o
+obj-$(CONFIG_FEATHER_TRACE) += ft_event.o
+
###
# 64 bit specific files
ifeq ($(CONFIG_X86_64),y)
diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c
index c105c53..0bf1264 100644
--- a/arch/x86/kernel/cpu/intel_cacheinfo.c
+++ b/arch/x86/kernel/cpu/intel_cacheinfo.c
@@ -747,6 +747,23 @@ unsigned int __cpuinit init_intel_cacheinfo(struct cpuinfo_x86 *c)
static DEFINE_PER_CPU(struct _cpuid4_info *, ici_cpuid4_info);
#define CPUID4_INFO_IDX(x, y) (&((per_cpu(ici_cpuid4_info, x))[y]))
+/* returns CPUs that share the index cache with cpu */
+int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
+{
+ int ret = 0;
+ struct _cpuid4_info *this_leaf;
+
+ if (index >= num_cache_leaves) {
+ index = num_cache_leaves - 1;
+ ret = index;
+ }
+
+ this_leaf = CPUID4_INFO_IDX(cpu,index);
+ cpumask_copy(mask, to_cpumask(this_leaf->shared_cpu_map));
+
+ return ret;
+}
+
#ifdef CONFIG_SMP
static void __cpuinit cache_shared_cpu_map_setup(unsigned int cpu, int index)
{
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 8a445a0..47a4bcd 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -1003,6 +1003,8 @@ apicinterrupt CALL_FUNCTION_VECTOR \
call_function_interrupt smp_call_function_interrupt
apicinterrupt RESCHEDULE_VECTOR \
reschedule_interrupt smp_reschedule_interrupt
+apicinterrupt PULL_TIMERS_VECTOR \
+ pull_timers_interrupt smp_pull_timers_interrupt
#endif
apicinterrupt ERROR_APIC_VECTOR \
diff --git a/arch/x86/kernel/ft_event.c b/arch/x86/kernel/ft_event.c
new file mode 100644
index 0000000..37cc332
--- /dev/null
+++ b/arch/x86/kernel/ft_event.c
@@ -0,0 +1,118 @@
+#include <linux/types.h>
+
+#include <litmus/feather_trace.h>
+
+/* the feather trace management functions assume
+ * exclusive access to the event table
+ */
+
+#ifndef CONFIG_DEBUG_RODATA
+
+#define BYTE_JUMP 0xeb
+#define BYTE_JUMP_LEN 0x02
+
+/* for each event, there is an entry in the event table */
+struct trace_event {
+ long id;
+ long count;
+ long start_addr;
+ long end_addr;
+};
+
+extern struct trace_event __start___event_table[];
+extern struct trace_event __stop___event_table[];
+
+/* Workaround: if no events are defined, then the event_table section does not
+ * exist and the above references cause linker errors. This could probably be
+ * fixed by adjusting the linker script, but it is easier to maintain for us if
+ * we simply create a dummy symbol in the event table section.
+ */
+int __event_table_dummy[0] __attribute__ ((section("__event_table")));
+
+int ft_enable_event(unsigned long id)
+{
+ struct trace_event* te = __start___event_table;
+ int count = 0;
+ char* delta;
+ unsigned char* instr;
+
+ while (te < __stop___event_table) {
+ if (te->id == id && ++te->count == 1) {
+ instr = (unsigned char*) te->start_addr;
+ /* make sure we don't clobber something wrong */
+ if (*instr == BYTE_JUMP) {
+ delta = (((unsigned char*) te->start_addr) + 1);
+ *delta = 0;
+ }
+ }
+ if (te->id == id)
+ count++;
+ te++;
+ }
+
+ printk(KERN_DEBUG "ft_enable_event: enabled %d events\n", count);
+ return count;
+}
+
+int ft_disable_event(unsigned long id)
+{
+ struct trace_event* te = __start___event_table;
+ int count = 0;
+ char* delta;
+ unsigned char* instr;
+
+ while (te < __stop___event_table) {
+ if (te->id == id && --te->count == 0) {
+ instr = (unsigned char*) te->start_addr;
+ if (*instr == BYTE_JUMP) {
+ delta = (((unsigned char*) te->start_addr) + 1);
+ *delta = te->end_addr - te->start_addr -
+ BYTE_JUMP_LEN;
+ }
+ }
+ if (te->id == id)
+ count++;
+ te++;
+ }
+
+ printk(KERN_DEBUG "ft_disable_event: disabled %d events\n", count);
+ return count;
+}
+
+int ft_disable_all_events(void)
+{
+ struct trace_event* te = __start___event_table;
+ int count = 0;
+ char* delta;
+ unsigned char* instr;
+
+ while (te < __stop___event_table) {
+ if (te->count) {
+ instr = (unsigned char*) te->start_addr;
+ if (*instr == BYTE_JUMP) {
+ delta = (((unsigned char*) te->start_addr)
+ + 1);
+ *delta = te->end_addr - te->start_addr -
+ BYTE_JUMP_LEN;
+ te->count = 0;
+ count++;
+ }
+ }
+ te++;
+ }
+ return count;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+ struct trace_event* te = __start___event_table;
+
+ while (te < __stop___event_table) {
+ if (te->id == id)
+ return te->count;
+ te++;
+ }
+ return 0;
+}
+
+#endif
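
For illustration only (not part of the patch; the mutex and function names are made up): the management functions above patch the jump sites in place and assume exclusive access to the event table, so callers must serialize enable/disable requests themselves.

    #include <linux/kernel.h>
    #include <linux/mutex.h>
    #include <litmus/feather_trace.h>

    static DEFINE_MUTEX(example_ft_mutex);

    static int example_toggle_event(unsigned long id, int enable)
    {
            int matched;

            mutex_lock(&example_ft_mutex);
            if (enable)
                    matched = ft_enable_event(id);  /* jmp displacement -> 0 */
            else
                    matched = ft_disable_event(id); /* jmp skips the call again */
            mutex_unlock(&example_ft_mutex);

            /* both functions return the number of matching __event_table entries */
            return matched;
    }
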
diff --git a/arch/x86/kernel/irqinit.c b/arch/x86/kernel/irqinit.c
index f470e4e..48acf71 100644
--- a/arch/x86/kernel/irqinit.c
+++ b/arch/x86/kernel/irqinit.c
@@ -252,6 +252,9 @@ static void __init smp_intr_init(void)
alloc_intr_gate(CALL_FUNCTION_SINGLE_VECTOR,
call_function_single_interrupt);
+ /* IPI for hrtimer pulling on remote cpus */
+ alloc_intr_gate(PULL_TIMERS_VECTOR, pull_timers_interrupt);
+
/* Low priority IPI to cleanup after moving an irq */
set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt);
set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors);
diff --git a/arch/x86/kernel/smp.c b/arch/x86/kernel/smp.c
index 013e7eb..7539d84 100644
--- a/arch/x86/kernel/smp.c
+++ b/arch/x86/kernel/smp.c
@@ -23,6 +23,9 @@
#include <linux/cpu.h>
#include <linux/gfp.h>
+#include <litmus/preempt.h>
+#include <litmus/debug_trace.h>
+
#include <asm/mtrr.h>
#include <asm/tlbflush.h>
#include <asm/mmu_context.h>
@@ -147,6 +150,16 @@ void native_send_call_func_ipi(const struct cpumask *mask)
free_cpumask_var(allbutself);
}
+/* trigger timers on remote cpu */
+void smp_send_pull_timers(int cpu)
+{
+ if (unlikely(cpu_is_offline(cpu))) {
+ WARN_ON(1);
+ return;
+ }
+ apic->send_IPI_mask(cpumask_of(cpu), PULL_TIMERS_VECTOR);
+}
+
/*
* this function calls the 'stop' function on all other CPUs in the system.
*/
@@ -204,6 +217,11 @@ void smp_reschedule_interrupt(struct pt_regs *regs)
/*
* KVM uses this interrupt to force a cpu out of guest mode
*/
+
+ /* LITMUS^RT: this IPI might need to trigger the sched state machine.
+ * Starting from 3.0 scheduler_ipi() actually does something. This may
+ * increase IPI latencies compared with previous versions. */
+ sched_state_ipi();
}
void smp_call_function_interrupt(struct pt_regs *regs)
@@ -224,6 +242,17 @@ void smp_call_function_single_interrupt(struct pt_regs *regs)
irq_exit();
}
+extern void hrtimer_pull(void);
+
+void smp_pull_timers_interrupt(struct pt_regs *regs)
+{
+ ack_APIC_irq();
+ irq_enter();
+ TRACE("pull timer interrupt\n");
+ hrtimer_pull();
+ irq_exit();
+}
+
struct smp_ops smp_ops = {
.smp_prepare_boot_cpu = native_smp_prepare_boot_cpu,
.smp_prepare_cpus = native_smp_prepare_cpus,
diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S
index fbb0a04..d012622 100644
--- a/arch/x86/kernel/syscall_table_32.S
+++ b/arch/x86/kernel/syscall_table_32.S
@@ -346,3 +346,15 @@ ENTRY(sys_call_table)
.long sys_syncfs
.long sys_sendmmsg /* 345 */
.long sys_setns
+ .long sys_set_rt_task_param /* LITMUS^RT 347 */
+ .long sys_get_rt_task_param
+ .long sys_complete_job
+ .long sys_od_open
+ .long sys_od_close
+ .long sys_litmus_lock /* +5 */
+ .long sys_litmus_unlock
+ .long sys_query_job_no
+ .long sys_wait_for_job_release
+ .long sys_wait_for_ts_release
+ .long sys_release_ts /* +10 */
+ .long sys_null_call
diff --git a/fs/exec.c b/fs/exec.c
index 6075a1e..9984562 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -19,7 +19,7 @@
* current->executable is only used by the procfs. This allows a dispatch
* table to check for several different types of binary formats. We keep
* trying until we recognize the file or we run out of supported binary
- * formats.
+ * formats.
*/
#include <linux/slab.h>
@@ -56,6 +56,8 @@
#include <linux/oom.h>
#include <linux/compat.h>
+#include <litmus/litmus.h>
+
#include <asm/uaccess.h>
#include <asm/mmu_context.h>
#include <asm/tlb.h>
@@ -85,7 +87,7 @@ int __register_binfmt(struct linux_binfmt * fmt, int insert)
insert ? list_add(&fmt->lh, &formats) :
list_add_tail(&fmt->lh, &formats);
write_unlock(&binfmt_lock);
- return 0;
+ return 0;
}
EXPORT_SYMBOL(__register_binfmt);
@@ -1160,7 +1162,7 @@ void setup_new_exec(struct linux_binprm * bprm)
group */
current->self_exec_id++;
-
+
flush_signal_handlers(current, 0);
flush_old_files(current->files);
}
@@ -1250,8 +1252,8 @@ int check_unsafe_exec(struct linux_binprm *bprm)
return res;
}
-/*
- * Fill the binprm structure from the inode.
+/*
+ * Fill the binprm structure from the inode.
* Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
*
* This may be called multiple times for binary chains (scripts for example).
@@ -1459,6 +1461,7 @@ static int do_execve_common(const char *filename,
goto out_unmark;
sched_exec();
+ litmus_exec();
bprm->file = file;
bprm->filename = filename;
diff --git a/fs/inode.c b/fs/inode.c
index 43566d1..dbf0e76 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -308,6 +308,8 @@ void inode_init_once(struct inode *inode)
#ifdef CONFIG_FSNOTIFY
INIT_HLIST_HEAD(&inode->i_fsnotify_marks);
#endif
+ INIT_LIST_HEAD(&inode->i_obj_list);
+ mutex_init(&inode->i_obj_mutex);
}
EXPORT_SYMBOL(inode_init_once);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b5b9792..8d5834b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -17,8 +17,8 @@
* nr_file rlimit, so it's safe to set up a ridiculously high absolute
* upper limit on files-per-process.
*
- * Some programs (notably those using select()) may have to be
- * recompiled to take full advantage of the new limits..
+ * Some programs (notably those using select()) may have to be
+ * recompiled to take full advantage of the new limits..
*/
/* Fixed constants first: */
@@ -172,7 +172,7 @@ struct inodes_stat_t {
#define SEL_EX 4
/* public flags for file_system_type */
-#define FS_REQUIRES_DEV 1
+#define FS_REQUIRES_DEV 1
#define FS_BINARY_MOUNTDATA 2
#define FS_HAS_SUBTYPE 4
#define FS_REVAL_DOT 16384 /* Check the paths ".", ".." for staleness */
@@ -480,7 +480,7 @@ struct iattr {
*/
#include <linux/quota.h>
-/**
+/**
* enum positive_aop_returns - aop return codes with specific semantics
*
* @AOP_WRITEPAGE_ACTIVATE: Informs the caller that page writeback has
@@ -490,7 +490,7 @@ struct iattr {
* be a candidate for writeback again in the near
* future. Other callers must be careful to unlock
* the page if they get this return. Returned by
- * writepage();
+ * writepage();
*
* @AOP_TRUNCATED_PAGE: The AOP method that was handed a locked page has
* unlocked it and the page might have been truncated.
@@ -734,6 +734,7 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
struct posix_acl;
#define ACL_NOT_CACHED ((void *)(-1))
+struct inode_obj_id_table;
struct inode {
/* RCU path lookup touches following: */
@@ -807,6 +808,8 @@ struct inode {
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
+ struct list_head i_obj_list;
+ struct mutex i_obj_mutex;
void *i_private; /* fs or device private pointer */
};
@@ -1032,10 +1035,10 @@ static inline int file_check_writeable(struct file *filp)
#define MAX_NON_LFS ((1UL<<31) - 1)
-/* Page cache limit. The filesystems should put that into their s_maxbytes
- limits, otherwise bad things can happen in VM. */
+/* Page cache limit. The filesystems should put that into their s_maxbytes
+ limits, otherwise bad things can happen in VM. */
#if BITS_PER_LONG==32
-#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
+#define MAX_LFS_FILESIZE (((u64)PAGE_CACHE_SIZE << (BITS_PER_LONG-1))-1)
#elif BITS_PER_LONG==64
#define MAX_LFS_FILESIZE 0x7fffffffffffffffUL
#endif
@@ -2234,7 +2237,7 @@ extern void free_write_pipe(struct file *);
extern int kernel_read(struct file *, loff_t, char *, unsigned long);
extern struct file * open_exec(const char *);
-
+
/* fs/dcache.c -- generic fs support functions */
extern int is_subdir(struct dentry *, struct dentry *);
extern int path_is_under(struct path *, struct path *);
diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h
index ba36217..e6dd5a4 100644
--- a/include/linux/hardirq.h
+++ b/include/linux/hardirq.h
@@ -6,6 +6,8 @@
#include <linux/ftrace_irq.h>
#include <asm/hardirq.h>
+#include <litmus/trace_irq.h>
+
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
@@ -186,6 +188,7 @@ extern void rcu_nmi_exit(void);
account_system_vtime(current); \
add_preempt_count(HARDIRQ_OFFSET); \
trace_hardirq_enter(); \
+ ft_irq_fired(); \
} while (0)
/*
@@ -216,6 +219,7 @@ extern void irq_exit(void);
lockdep_off(); \
rcu_nmi_enter(); \
trace_hardirq_enter(); \
+ ft_irq_fired(); \
} while (0)
#define nmi_exit() \
diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h
index fd0dc30..d91bba5 100644
--- a/include/linux/hrtimer.h
+++ b/include/linux/hrtimer.h
@@ -174,6 +174,7 @@ enum hrtimer_base_type {
* @nr_hangs: Total number of hrtimer interrupt hangs
* @max_hang_time: Maximum time spent in hrtimer_interrupt
* @clock_base: array of clock bases for this cpu
+ * @to_pull: LITMUS^RT list of timers to be pulled on this cpu
*/
struct hrtimer_cpu_base {
raw_spinlock_t lock;
@@ -188,8 +189,32 @@ struct hrtimer_cpu_base {
ktime_t max_hang_time;
#endif
struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES];
+ struct list_head to_pull;
};
+#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
+
+#define HRTIMER_START_ON_INACTIVE 0
+#define HRTIMER_START_ON_QUEUED 1
+
+/*
+ * struct hrtimer_start_on_info - save timer info on remote cpu
+ * @list: list of hrtimer_start_on_info on remote cpu (to_pull)
+ * @timer: timer to be triggered on remote cpu
+ * @time: time event
+ * @mode: timer mode
+ * @state: activity flag
+ */
+struct hrtimer_start_on_info {
+ struct list_head list;
+ struct hrtimer *timer;
+ ktime_t time;
+ enum hrtimer_mode mode;
+ atomic_t state;
+};
+
+#endif
+
static inline void hrtimer_set_expires(struct hrtimer *timer, ktime_t time)
{
timer->node.expires = time;
@@ -355,6 +380,13 @@ __hrtimer_start_range_ns(struct hrtimer *timer, ktime_t tim,
unsigned long delta_ns,
const enum hrtimer_mode mode, int wakeup);
+#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
+extern void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info);
+extern int hrtimer_start_on(int cpu, struct hrtimer_start_on_info *info,
+ struct hrtimer *timer, ktime_t time,
+ const enum hrtimer_mode mode);
+#endif
+
extern int hrtimer_cancel(struct hrtimer *timer);
extern int hrtimer_try_to_cancel(struct hrtimer *timer);
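
For illustration only (not part of the patch; struct and function names are made up): to_pull and hrtimer_start_on_info exist so that one CPU can arm an hrtimer that fires on another CPU — the request is queued on the target CPU's to_pull list and delivered via the pull-timers IPI (see smp_send_pull_timers() and hrtimer_pull()). A sketch of the intended call pattern:

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>
    #include <linux/smp.h>

    struct example_release_timer {
            struct hrtimer                  timer;
            struct hrtimer_start_on_info    info;
    };

    static void example_arm_timer_on(int cpu, struct example_release_timer *rt,
                                     u64 when_ns)
    {
            if (cpu == raw_smp_processor_id()) {
                    /* local CPU: program the timer directly */
                    __hrtimer_start_range_ns(&rt->timer, ns_to_ktime(when_ns),
                                             0 /* delta */,
                                             HRTIMER_MODE_ABS_PINNED,
                                             0 /* no wakeup */);
            } else {
                    /* remote CPU: hand the request over; it is queued on that
                     * CPU's to_pull list and picked up in hrtimer_pull() */
                    hrtimer_start_on_info_init(&rt->info);
                    hrtimer_start_on(cpu, &rt->info, &rt->timer,
                                     ns_to_ktime(when_ns),
                                     HRTIMER_MODE_ABS_PINNED);
            }
    }
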
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 14a6c7b..9c990d1 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -39,6 +39,7 @@
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
+#define SCHED_LITMUS 6
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
@@ -93,6 +94,9 @@ struct sched_param {
#include <asm/processor.h>
+#include <litmus/rt_param.h>
+#include <litmus/preempt.h>
+
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
@@ -1209,6 +1213,7 @@ struct sched_rt_entity {
};
struct rcu_node;
+struct od_table_entry;
enum perf_event_task_context {
perf_invalid_context = -1,
@@ -1313,9 +1318,9 @@ struct task_struct {
unsigned long stack_canary;
#endif
- /*
+ /*
* pointers to (original) parent process, youngest child, younger sibling,
- * older sibling, respectively. (p->father can be replaced with
+ * older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct *real_parent; /* real parent process */
@@ -1526,6 +1531,13 @@ struct task_struct {
int make_it_fail;
#endif
struct prop_local_single dirties;
+
+ /* LITMUS RT parameters and state */
+ struct rt_param rt_param;
+
+ /* references to PI semaphores, etc. */
+ struct od_table_entry *od_table;
+
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
@@ -2136,7 +2148,7 @@ static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, s
spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
return ret;
-}
+}
extern void block_all_signals(int (*notifier)(void *priv), void *priv,
sigset_t *mask);
@@ -2446,6 +2458,7 @@ static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
+ sched_state_will_schedule(tsk);
}
static inline void clear_tsk_need_resched(struct task_struct *tsk)
diff --git a/include/linux/smp.h b/include/linux/smp.h
index 8cc38d3..53b1bee 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -82,6 +82,11 @@ int smp_call_function_any(const struct cpumask *mask,
smp_call_func_t func, void *info, int wait);
/*
+ * sends a 'pull timer' event to a remote CPU
+ */
+extern void smp_send_pull_timers(int cpu);
+
+/*
* Generic and arch helpers
*/
#ifdef CONFIG_USE_GENERIC_SMP_HELPERS
diff --git a/include/linux/tick.h b/include/linux/tick.h
index b232ccc..1e29bd5 100644
--- a/include/linux/tick.h
+++ b/include/linux/tick.h
@@ -74,6 +74,11 @@ extern int tick_is_oneshot_available(void);
extern struct tick_device *tick_get_device(int cpu);
# ifdef CONFIG_HIGH_RES_TIMERS
+/* LITMUS^RT tick alignment */
+#define LINUX_DEFAULT_TICKS 0
+#define LITMUS_ALIGNED_TICKS 1
+#define LITMUS_STAGGERED_TICKS 2
+
extern int tick_init_highres(void);
extern int tick_program_event(ktime_t expires, int force);
extern void tick_setup_sched_timer(void);
diff --git a/include/litmus/affinity.h b/include/litmus/affinity.h
new file mode 100644
index 0000000..ca2e442
--- /dev/null
+++ b/include/litmus/affinity.h
@@ -0,0 +1,80 @@
+#ifndef __LITMUS_AFFINITY_H
+#define __LITMUS_AFFINITY_H
+
+#include <linux/cpumask.h>
+
+/*
+ L1 (instr) = depth 0
+ L1 (data) = depth 1
+ L2 = depth 2
+ L3 = depth 3
+ */
+#define NUM_CACHE_LEVELS 4
+
+struct neighborhood
+{
+ unsigned int size[NUM_CACHE_LEVELS];
+ cpumask_var_t neighbors[NUM_CACHE_LEVELS];
+};
+
+/* topology info is stored redundantly in a big array for fast lookups */
+extern struct neighborhood neigh_info[NR_CPUS];
+
+void init_topology(void); /* called by Litmus module's _init_litmus() */
+
+/* Works like:
+void get_nearest_available_cpu(
+ cpu_entry_t **nearest,
+ cpu_entry_t *start,
+ cpu_entry_t *entries,
+ int release_master)
+
+Set release_master = NO_CPU for no Release Master.
+
+We use a macro here to exploit the fact that C-EDF and G-EDF
+have similar structures for their cpu_entry_t structs, even though
+they do not share a common base-struct. The macro allows us to
+avoid code duplication.
+
+TODO: Factor out the job-to-processor linking from C/G-EDF into
+a reusable "processor mapping". (See B.B.'s RTSS'09 paper &
+dissertation.)
+ */
+#define get_nearest_available_cpu(nearest, start, entries, release_master) \
+{ \
+ (nearest) = NULL; \
+ if (!(start)->linked) { \
+ (nearest) = (start); \
+ } else { \
+ int __level; \
+ int __cpu; \
+ int __release_master = ((release_master) == NO_CPU) ? -1 : (release_master); \
+ struct neighborhood *__neighbors = &neigh_info[(start)->cpu]; \
+ \
+ for (__level = 0; (__level < NUM_CACHE_LEVELS) && !(nearest); ++__level) { \
+ if (__neighbors->size[__level] > 1) { \
+ for_each_cpu(__cpu, __neighbors->neighbors[__level]) { \
+ if (__cpu != __release_master) { \
+ cpu_entry_t *__entry = &per_cpu((entries), __cpu); \
+ if (!__entry->linked) { \
+ (nearest) = __entry; \
+ break; \
+ } \
+ } \
+ } \
+ } else if (__neighbors->size[__level] == 0) { \
+ break; \
+ } \
+ } \
+ } \
+ \
+ if ((nearest)) { \
+ TRACE("P%d is closest available CPU to P%d\n", \
+ (nearest)->cpu, (start)->cpu); \
+ } else { \
+ TRACE("Could not find an available CPU close to P%d\n", \
+ (start)->cpu); \
+ } \
+}
+
+#endif
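
For illustration only (not part of the patch; cpu_entry_t here is a stripped-down stand-in, and example_entries/example_pick_nearby_cpu() are made up): the macro above expects a per-CPU array of plugin cpu_entry_t structs with ->cpu and ->linked fields, and searches outward through the L1/L2/L3 neighborhoods of the starting CPU for one with no task linked to it.

    #include <linux/percpu.h>
    #include <linux/sched.h>
    #include <litmus/litmus.h>       /* NO_CPU */
    #include <litmus/debug_trace.h>  /* the macro calls TRACE() */
    #include <litmus/affinity.h>

    /* minimal stand-in; real plugins (C-EDF, G-EDF) carry more per-CPU state */
    typedef struct {
            int                     cpu;
            struct task_struct      *linked;        /* NULL => CPU available */
    } cpu_entry_t;

    static DEFINE_PER_CPU(cpu_entry_t, example_entries);

    static cpu_entry_t *example_pick_nearby_cpu(int last_cpu)
    {
            cpu_entry_t *nearest;
            cpu_entry_t *start = &per_cpu(example_entries, last_cpu);

            /* NO_CPU: no dedicated release-master CPU to avoid */
            get_nearest_available_cpu(nearest, start, example_entries, NO_CPU);

            return nearest;         /* NULL if no unlinked CPU was found */
    }
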
diff --git a/include/litmus/bheap.h b/include/litmus/bheap.h
new file mode 100644
index 0000000..cf4864a
--- /dev/null
+++ b/include/litmus/bheap.h
@@ -0,0 +1,77 @@
+/* bheaps.h -- Binomial Heaps
+ *
+ * (c) 2008, 2009 Bjoern Brandenburg
+ */
+
+#ifndef BHEAP_H
+#define BHEAP_H
+
+#define NOT_IN_HEAP UINT_MAX
+
+struct bheap_node {
+ struct bheap_node* parent;
+ struct bheap_node* next;
+ struct bheap_node* child;
+
+ unsigned int degree;
+ void* value;
+ struct bheap_node** ref;
+};
+
+struct bheap {
+ struct bheap_node* head;
+ /* We cache the minimum of the heap.
+ * This speeds up repeated peek operations.
+ */
+ struct bheap_node* min;
+};
+
+typedef int (*bheap_prio_t)(struct bheap_node* a, struct bheap_node* b);
+
+void bheap_init(struct bheap* heap);
+void bheap_node_init(struct bheap_node** ref_to_bheap_node_ptr, void* value);
+
+static inline int bheap_node_in_heap(struct bheap_node* h)
+{
+ return h->degree != NOT_IN_HEAP;
+}
+
+static inline int bheap_empty(struct bheap* heap)
+{
+ return heap->head == NULL && heap->min == NULL;
+}
+
+/* insert (and reinitialize) a node into the heap */
+void bheap_insert(bheap_prio_t higher_prio,
+ struct bheap* heap,
+ struct bheap_node* node);
+
+/* merge addition into target */
+void bheap_union(bheap_prio_t higher_prio,
+ struct bheap* target,
+ struct bheap* addition);
+
+struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
+ struct bheap* heap);
+
+struct bheap_node* bheap_take(bheap_prio_t higher_prio,
+ struct bheap* heap);
+
+void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap);
+int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node);
+
+void bheap_delete(bheap_prio_t higher_prio,
+ struct bheap* heap,
+ struct bheap_node* node);
+
+/* allocate from memcache */
+struct bheap_node* bheap_node_alloc(int gfp_flags);
+void bheap_node_free(struct bheap_node* hn);
+
+/* allocate a heap node for value and insert into the heap */
+int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
+ void* value, int gfp_flags);
+
+void* bheap_take_del(bheap_prio_t higher_prio,
+ struct bheap* heap);
+#endif
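
For illustration only (not part of the patch; example_item and its comparator are made up — the real comparators in this patch are edf_ready_order() and fp_ready_order()): each element references a separately allocated bheap_node, and every heap operation takes the comparator explicitly.

    #include <linux/slab.h>          /* GFP_ATOMIC */
    #include <litmus/bheap.h>

    struct example_item {
            unsigned long           key;    /* smaller key = higher priority */
            struct bheap_node       *hn;    /* back-reference set by bheap_node_init() */
    };

    static int example_higher_prio(struct bheap_node *a, struct bheap_node *b)
    {
            struct example_item *ia = a->value;
            struct example_item *ib = b->value;
            return ia->key < ib->key;
    }

    /* heap is assumed to have been set up with bheap_init() */
    static void example_bheap_usage(struct bheap *heap, struct example_item *item)
    {
            struct bheap_node *top, *taken;

            item->hn = bheap_node_alloc(GFP_ATOMIC);
            if (!item->hn)
                    return;
            bheap_node_init(&item->hn, item);       /* node->value = item */
            bheap_insert(example_higher_prio, heap, item->hn);

            top   = bheap_peek(example_higher_prio, heap);  /* min key, kept in heap */
            taken = bheap_take(example_higher_prio, heap);  /* min key, removed */
            (void) top;
            bheap_node_free(taken);
    }
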
diff --git a/include/litmus/binheap.h b/include/litmus/binheap.h
new file mode 100644
index 0000000..901a30a
--- /dev/null
+++ b/include/litmus/binheap.h
@@ -0,0 +1,206 @@
+#ifndef LITMUS_BINARY_HEAP_H
+#define LITMUS_BINARY_HEAP_H
+
+#include <linux/kernel.h>
+
+/**
+ * Simple binary heap with add, arbitrary delete, delete_root, and top
+ * operations.
+ *
+ * Style meant to conform with list.h.
+ *
+ * Motivation: Linux's prio_heap.h is of fixed size. Litmus's binomial
+ * heap may be overkill (and perhaps not general enough) for some applications.
+ *
+ * Note: In order to make node swaps fast, a node inserted with a data pointer
+ * may not always hold said data pointer. This is similar to the binomial heap
+ * implementation. This does make node deletion tricky since we have to
+ * (1) locate the node that holds the data pointer to delete, and (2) the
+ * node that was originally inserted with said data pointer. These have to be
+ * coalesced into a single node before removal (see usage of
+ * __binheap_safe_swap()). We have to track node references to accomplish this.
+ */
+
+struct binheap_node {
+ void *data;
+ struct binheap_node *parent;
+ struct binheap_node *left;
+ struct binheap_node *right;
+
+ /* pointer to binheap_node that holds *data for which this binheap_node
+ * was originally inserted. (*data "owns" this node)
+ */
+ struct binheap_node *ref;
+ struct binheap_node **ref_ptr;
+};
+
+/**
+ * Signature of comparator function. Assumed 'less-than' (min-heap).
+ * Pass in 'greater-than' for max-heap.
+ *
+ * TODO: Consider macro-based implementation that allows comparator to be
+ * inlined (similar to Linux red/black tree) for greater efficiency.
+ */
+typedef int (*binheap_order_t)(struct binheap_node *a,
+ struct binheap_node *b);
+
+
+struct binheap {
+ struct binheap_node *root;
+
+ /* pointer to node to take next inserted child */
+ struct binheap_node *next;
+
+ /* pointer to last node in complete binary tree */
+ struct binheap_node *last;
+
+ /* comparator function pointer */
+ binheap_order_t compare;
+};
+
+
+/* Initialized heap nodes not in a heap have parent
+ * set to BINHEAP_POISON.
+ */
+#define BINHEAP_POISON ((void*)(0xdeadbeef))
+
+
+/**
+ * binheap_entry - get the struct for this heap node.
+ * Only valid when called upon heap nodes other than the root handle.
+ * @ptr: the heap node.
+ * @type: the type of struct pointed to by binheap_node::data.
+ * @member: unused.
+ */
+#define binheap_entry(ptr, type, member) \
+((type *)((ptr)->data))
+
+/**
+ * binheap_node_container - get the struct that contains this node.
+ * Only valid when called upon heap nodes other than the root handle.
+ * @ptr: the heap node.
+ * @type: the type of struct the node is embedded in.
+ * @member: the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_node_container(ptr, type, member) \
+container_of((ptr), type, member)
+
+/**
+ * binheap_top_entry - get the struct for the node at the top of the heap.
+ * Only valid when called upon the heap handle node.
+ * @ptr: the special heap-handle node.
+ * @type: the type of the struct the head is embedded in.
+ * @member: the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_top_entry(ptr, type, member) \
+binheap_entry((ptr)->root, type, member)
+
+/**
+ * binheap_delete_root - remove the root element from the heap.
+ * @handle: handle to the heap.
+ * @type: the type of the struct the head is embedded in.
+ * @member: the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_delete_root(handle, type, member) \
+__binheap_delete_root((handle), &((type *)((handle)->root->data))->member)
+
+/**
+ * binheap_delete - remove an arbitrary element from the heap.
+ * @to_delete: pointer to node to be removed.
+ * @handle: handle to the heap.
+ */
+#define binheap_delete(to_delete, handle) \
+__binheap_delete((to_delete), (handle))
+
+/**
+ * binheap_add - insert an element to the heap
+ * new_node: node to add.
+ * @handle: handle to the heap.
+ * @type: the type of the struct the head is embedded in.
+ * @member: the name of the binheap_struct within the (type) struct.
+ */
+#define binheap_add(new_node, handle, type, member) \
+__binheap_add((new_node), (handle), container_of((new_node), type, member))
+
+/**
+ * binheap_decrease - re-eval the position of a node (based upon its
+ * original data pointer).
+ * @handle: handle to the heap.
+ * @orig_node: node that was associated with the data pointer
+ * (whose value has changed) when said pointer was
+ * added to the heap.
+ */
+#define binheap_decrease(orig_node, handle) \
+__binheap_decrease((orig_node), (handle))
+
+#define BINHEAP_NODE_INIT() { NULL, BINHEAP_POISON, NULL, NULL , NULL, NULL}
+
+#define BINHEAP_NODE(name) \
+ struct binheap_node name = BINHEAP_NODE_INIT()
+
+
+static inline void INIT_BINHEAP_NODE(struct binheap_node *n)
+{
+ n->data = NULL;
+ n->parent = BINHEAP_POISON;
+ n->left = NULL;
+ n->right = NULL;
+ n->ref = NULL;
+ n->ref_ptr = NULL;
+}
+
+static inline void INIT_BINHEAP_HANDLE(struct binheap *handle,
+ binheap_order_t compare)
+{
+ handle->root = NULL;
+ handle->next = NULL;
+ handle->last = NULL;
+ handle->compare = compare;
+}
+
+/* Returns true if binheap is empty. */
+static inline int binheap_empty(struct binheap *handle)
+{
+ return(handle->root == NULL);
+}
+
+/* Returns true if binheap node is in a heap. */
+static inline int binheap_is_in_heap(struct binheap_node *node)
+{
+ return (node->parent != BINHEAP_POISON);
+}
+
+/* Returns true if binheap node is in given heap. */
+int binheap_is_in_this_heap(struct binheap_node *node, struct binheap* heap);
+
+/* Add a node to a heap */
+void __binheap_add(struct binheap_node *new_node,
+ struct binheap *handle,
+ void *data);
+
+/**
+ * Removes the root node from the heap. The node is removed after coalescing
+ * the binheap_node with its original data pointer at the root of the tree.
+ *
+ * The 'last' node in the tree is then swapped up to the root and bubbled
+ * down.
+ */
+void __binheap_delete_root(struct binheap *handle,
+ struct binheap_node *container);
+
+/**
+ * Delete an arbitrary node. Bubble node to delete up to the root,
+ * and then delete the root.
+ */
+void __binheap_delete(struct binheap_node *node_to_delete,
+ struct binheap *handle);
+
+/**
+ * Bubble up a node whose pointer has decreased in value.
+ */
+void __binheap_decrease(struct binheap_node *orig_node,
+ struct binheap *handle);
+
+
+#endif
+
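
For illustration only (not part of the patch; example_job and the deadline comparator are made up): unlike bheap above, binheap nodes are embedded in the element itself and recovered through the binheap_entry()/container_of()-style macros.

    #include <litmus/binheap.h>

    struct example_job {
            unsigned long long      deadline;
            struct binheap_node     node;
    };

    static int example_earlier_deadline(struct binheap_node *a, struct binheap_node *b)
    {
            struct example_job *ja = binheap_entry(a, struct example_job, node);
            struct example_job *jb = binheap_entry(b, struct example_job, node);
            return ja->deadline < jb->deadline;     /* 'less-than' => min-heap */
    }

    static void example_binheap_usage(void)
    {
            struct binheap queue;
            struct example_job j1 = { .deadline = 10, .node = BINHEAP_NODE_INIT() };
            struct example_job j2 = { .deadline =  5, .node = BINHEAP_NODE_INIT() };
            struct example_job *earliest;

            INIT_BINHEAP_HANDLE(&queue, example_earlier_deadline);
            binheap_add(&j1.node, &queue, struct example_job, node);
            binheap_add(&j2.node, &queue, struct example_job, node);

            earliest = binheap_top_entry(&queue, struct example_job, node);
            (void) earliest;                        /* == &j2 (deadline 5) */

            binheap_delete_root(&queue, struct example_job, node); /* removes j2 */
            binheap_delete(&j1.node, &queue);       /* arbitrary-node delete */
    }
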
diff --git a/include/litmus/budget.h b/include/litmus/budget.h
new file mode 100644
index 0000000..33344ee
--- /dev/null
+++ b/include/litmus/budget.h
@@ -0,0 +1,35 @@
+#ifndef _LITMUS_BUDGET_H_
+#define _LITMUS_BUDGET_H_
+
+/* Update the per-processor enforcement timer (arm/reprogram/cancel) for
+ * the next task. */
+void update_enforcement_timer(struct task_struct* t);
+
+inline static int budget_exhausted(struct task_struct* t)
+{
+ return get_exec_time(t) >= get_exec_cost(t);
+}
+
+inline static lt_t budget_remaining(struct task_struct* t)
+{
+ if (!budget_exhausted(t))
+ return get_exec_cost(t) - get_exec_time(t);
+ else
+ /* avoid overflow */
+ return 0;
+}
+
+#define budget_enforced(t) (tsk_rt(t)->task_params.budget_policy != NO_ENFORCEMENT)
+
+#define budget_precisely_enforced(t) (tsk_rt(t)->task_params.budget_policy \
+ == PRECISE_ENFORCEMENT)
+
+static inline int requeue_preempted_job(struct task_struct* t)
+{
+ /* Add task to ready queue only if not subject to budget enforcement or
+ * if the job has budget remaining. t may be NULL.
+ */
+ return t && (!budget_exhausted(t) || !budget_enforced(t));
+}
+
+#endif
diff --git a/include/litmus/clustered.h b/include/litmus/clustered.h
new file mode 100644
index 0000000..0c18dcb
--- /dev/null
+++ b/include/litmus/clustered.h
@@ -0,0 +1,44 @@
+#ifndef CLUSTERED_H
+#define CLUSTERED_H
+
+/* Which cache level should be used to group CPUs into clusters?
+ * GLOBAL_CLUSTER means that all CPUs form a single cluster (just like under
+ * global scheduling).
+ */
+enum cache_level {
+ GLOBAL_CLUSTER = 0,
+ L1_CLUSTER = 1,
+ L2_CLUSTER = 2,
+ L3_CLUSTER = 3
+};
+
+int parse_cache_level(const char *str, enum cache_level *level);
+const char* cache_level_name(enum cache_level level);
+
+/* expose a cache level in a /proc dir */
+struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
+ enum cache_level* level);
+
+
+
+struct scheduling_cluster {
+ unsigned int id;
+ /* list of CPUs that are part of this cluster */
+ struct list_head cpus;
+};
+
+struct cluster_cpu {
+ unsigned int id; /* which CPU is this? */
+ struct list_head cluster_list; /* List of the CPUs in this cluster. */
+ struct scheduling_cluster* cluster; /* The cluster that this CPU belongs to. */
+};
+
+int get_cluster_size(enum cache_level level);
+
+int assign_cpus_to_clusters(enum cache_level level,
+ struct scheduling_cluster* clusters[],
+ unsigned int num_clusters,
+ struct cluster_cpu* cpus[],
+ unsigned int num_cpus);
+
+#endif
diff --git a/include/litmus/debug_trace.h b/include/litmus/debug_trace.h
new file mode 100644
index 0000000..48d086d
--- /dev/null
+++ b/include/litmus/debug_trace.h
@@ -0,0 +1,37 @@
+#ifndef LITMUS_DEBUG_TRACE_H
+#define LITMUS_DEBUG_TRACE_H
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+void sched_trace_log_message(const char* fmt, ...);
+void dump_trace_buffer(int max);
+#else
+
+#define sched_trace_log_message(fmt, ...)
+
+#endif
+
+extern atomic_t __log_seq_no;
+
+#ifdef CONFIG_SCHED_DEBUG_TRACE_CALLER
+#define TRACE_PREFIX "%d P%d [%s@%s:%d]: "
+#define TRACE_ARGS atomic_add_return(1, &__log_seq_no), \
+ raw_smp_processor_id(), \
+ __FUNCTION__, __FILE__, __LINE__
+#else
+#define TRACE_PREFIX "%d P%d: "
+#define TRACE_ARGS atomic_add_return(1, &__log_seq_no), \
+ raw_smp_processor_id()
+#endif
+
+#define TRACE(fmt, args...) \
+ sched_trace_log_message(TRACE_PREFIX fmt, \
+ TRACE_ARGS, ## args)
+
+#define TRACE_TASK(t, fmt, args...) \
+ TRACE("(%s/%d:%d) " fmt, (t)->comm, (t)->pid, \
+ (t)->rt_param.job_params.job_no, ##args)
+
+#define TRACE_CUR(fmt, args...) \
+ TRACE_TASK(current, fmt, ## args)
+
+#endif
diff --git a/include/litmus/edf_common.h b/include/litmus/edf_common.h
new file mode 100644
index 0000000..bbaf22e
--- /dev/null
+++ b/include/litmus/edf_common.h
@@ -0,0 +1,25 @@
+/*
+ * EDF common data structures and utility functions shared by all EDF
+ * based scheduler plugins
+ */
+
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_EDF_COMMON_H__
+#define __UNC_EDF_COMMON_H__
+
+#include <litmus/rt_domain.h>
+
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+ release_jobs_t release);
+
+int edf_higher_prio(struct task_struct* first,
+ struct task_struct* second);
+
+int edf_ready_order(struct bheap_node* a, struct bheap_node* b);
+
+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t);
+
+#endif
diff --git a/include/litmus/fdso.h b/include/litmus/fdso.h
new file mode 100644
index 0000000..f2115b8
--- /dev/null
+++ b/include/litmus/fdso.h
@@ -0,0 +1,77 @@
+/* fdso.h - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ */
+
+#ifndef _LINUX_FDSO_H_
+#define _LINUX_FDSO_H_
+
+#include <linux/list.h>
+#include <asm/atomic.h>
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+
+#define MAX_OBJECT_DESCRIPTORS 85
+
+typedef enum {
+ MIN_OBJ_TYPE = 0,
+
+ FMLP_SEM = 0,
+ SRP_SEM = 1,
+
+ MPCP_SEM = 2,
+ MPCP_VS_SEM = 3,
+ DPCP_SEM = 4,
+
+ PCP_SEM = 5,
+
+ MAX_OBJ_TYPE = 5
+} obj_type_t;
+
+struct inode_obj_id {
+ struct list_head list;
+ atomic_t count;
+ struct inode* inode;
+
+ obj_type_t type;
+ void* obj;
+ unsigned int id;
+};
+
+struct fdso_ops;
+
+struct od_table_entry {
+ unsigned int used;
+
+ struct inode_obj_id* obj;
+ const struct fdso_ops* class;
+};
+
+struct fdso_ops {
+ int (*create)(void** obj_ref, obj_type_t type, void* __user);
+ void (*destroy)(obj_type_t type, void*);
+ int (*open) (struct od_table_entry*, void* __user);
+ int (*close) (struct od_table_entry*);
+};
+
+/* translate a userspace supplied od into the raw table entry
+ * returns NULL if od is invalid
+ */
+struct od_table_entry* get_entry_for_od(int od);
+
+/* translate a userspace supplied od into the associated object
+ * returns NULL if od is invalid
+ */
+static inline void* od_lookup(int od, obj_type_t type)
+{
+ struct od_table_entry* e = get_entry_for_od(od);
+ return e && e->obj->type == type ? e->obj->obj : NULL;
+}
+
+#define lookup_fmlp_sem(od)((struct pi_semaphore*) od_lookup(od, FMLP_SEM))
+#define lookup_srp_sem(od) ((struct srp_semaphore*) od_lookup(od, SRP_SEM))
+#define lookup_ics(od) ((struct ics*) od_lookup(od, ICS_ID))
+
+
+#endif
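
For illustration only (not part of the patch; the function name is made up): a locking syscall resolves a user-supplied object descriptor to a typed kernel object via the lookup macros above, which return NULL on any mismatch.

    #include <litmus/fdso.h>

    struct pi_semaphore;    /* the FMLP type the lookup_fmlp_sem() macro casts to */

    static struct pi_semaphore *example_resolve_fmlp(int od)
    {
            /* NULL if od is out of range, unused, or refers to an object of a
             * different type (e.g. an SRP_SEM) */
            return lookup_fmlp_sem(od);
    }
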
diff --git a/include/litmus/feather_buffer.h b/include/litmus/feather_buffer.h
new file mode 100644
index 0000000..6c18277
--- /dev/null
+++ b/include/litmus/feather_buffer.h
@@ -0,0 +1,94 @@
+#ifndef _FEATHER_BUFFER_H_
+#define _FEATHER_BUFFER_H_
+
+/* requires UINT_MAX and memcpy */
+
+#define SLOT_FREE 0
+#define SLOT_BUSY 1
+#define SLOT_READY 2
+
+struct ft_buffer {
+ unsigned int slot_count;
+ unsigned int slot_size;
+
+ int free_count;
+ unsigned int write_idx;
+ unsigned int read_idx;
+
+ char* slots;
+ void* buffer_mem;
+ unsigned int failed_writes;
+};
+
+static inline int init_ft_buffer(struct ft_buffer* buf,
+ unsigned int slot_count,
+ unsigned int slot_size,
+ char* slots,
+ void* buffer_mem)
+{
+ int i = 0;
+ if (!slot_count || UINT_MAX % slot_count != slot_count - 1) {
+ /* The slot count must divide UINT_MAX + 1 so that when it
+ * wraps around the index correctly points to 0.
+ */
+ return 0;
+ } else {
+ buf->slot_count = slot_count;
+ buf->slot_size = slot_size;
+ buf->slots = slots;
+ buf->buffer_mem = buffer_mem;
+ buf->free_count = slot_count;
+ buf->write_idx = 0;
+ buf->read_idx = 0;
+ buf->failed_writes = 0;
+ for (i = 0; i < slot_count; i++)
+ buf->slots[i] = SLOT_FREE;
+ return 1;
+ }
+}
+
+static inline int ft_buffer_start_write(struct ft_buffer* buf, void **ptr)
+{
+ int free = fetch_and_dec(&buf->free_count);
+ unsigned int idx;
+ if (free <= 0) {
+ fetch_and_inc(&buf->free_count);
+ *ptr = 0;
+ fetch_and_inc(&buf->failed_writes);
+ return 0;
+ } else {
+ idx = fetch_and_inc((int*) &buf->write_idx) % buf->slot_count;
+ buf->slots[idx] = SLOT_BUSY;
+ *ptr = ((char*) buf->buffer_mem) + idx * buf->slot_size;
+ return 1;
+ }
+}
+
+static inline void ft_buffer_finish_write(struct ft_buffer* buf, void *ptr)
+{
+ unsigned int idx = ((char*) ptr - (char*) buf->buffer_mem) / buf->slot_size;
+ buf->slots[idx] = SLOT_READY;
+}
+
+
+/* exclusive reader access is assumed */
+static inline int ft_buffer_read(struct ft_buffer* buf, void* dest)
+{
+ unsigned int idx;
+ if (buf->free_count == buf->slot_count)
+ /* nothing available */
+ return 0;
+ idx = buf->read_idx % buf->slot_count;
+ if (buf->slots[idx] == SLOT_READY) {
+ memcpy(dest, ((char*) buf->buffer_mem) + idx * buf->slot_size,
+ buf->slot_size);
+ buf->slots[idx] = SLOT_FREE;
+ buf->read_idx++;
+ fetch_and_inc(&buf->free_count);
+ return 1;
+ } else
+ return 0;
+}
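+
+/* Illustrative usage sketch (one writer, one exclusive reader); the slot
+ * storage and the uint64_t payload type are hypothetical:
+ *
+ *	static char slot_states[256];
+ *	static uint64_t slot_data[256];
+ *	static struct ft_buffer buf;
+ *
+ *	init_ft_buffer(&buf, 256, sizeof(uint64_t), slot_states, slot_data);
+ *
+ *	void *slot;
+ *	if (ft_buffer_start_write(&buf, &slot)) {
+ *		*(uint64_t *) slot = 42;
+ *		ft_buffer_finish_write(&buf, slot);
+ *	}
+ *
+ *	uint64_t value;
+ *	if (ft_buffer_read(&buf, &value))
+ *		consume(value);		// hypothetical consumer
+ *
+ * A power-of-two slot count such as 256 divides UINT_MAX + 1, so the
+ * wrapping write_idx keeps mapping onto valid slots.
+ */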
+
+
+#endif
diff --git a/include/litmus/feather_trace.h b/include/litmus/feather_trace.h
new file mode 100644
index 0000000..028dfb2
--- /dev/null
+++ b/include/litmus/feather_trace.h
@@ -0,0 +1,65 @@
+#ifndef _FEATHER_TRACE_H_
+#define _FEATHER_TRACE_H_
+
+#include <asm/atomic.h>
+
+int ft_enable_event(unsigned long id);
+int ft_disable_event(unsigned long id);
+int ft_is_event_enabled(unsigned long id);
+int ft_disable_all_events(void);
+
+/* atomic_* functions are inline anyway */
+static inline int fetch_and_inc(int *val)
+{
+ return atomic_add_return(1, (atomic_t*) val) - 1;
+}
+
+static inline int fetch_and_dec(int *val)
+{
+ return atomic_sub_return(1, (atomic_t*) val) + 1;
+}
+
+/* Don't use rewriting implementation if kernel text pages are read-only.
+ * Ftrace gets around this by using the identity mapping, but that's more
+ * effort than is warranted right now for Feather-Trace.
+ * Eventually, it may make sense to replace Feather-Trace with ftrace.
+ */
+#if defined(CONFIG_ARCH_HAS_FEATHER_TRACE) && !defined(CONFIG_DEBUG_RODATA)
+
+#include <asm/feather_trace.h>
+
+#else /* !CONFIG_ARCH_HAS_FEATHER_TRACE */
+
+/* provide default implementation */
+
+#include <asm/timex.h> /* for get_cycles() */
+
+static inline unsigned long long ft_timestamp(void)
+{
+ return get_cycles();
+}
+
+#define feather_callback
+
+#define MAX_EVENTS 1024
+
+extern int ft_events[MAX_EVENTS];
+
+#define ft_event(id, callback) \
+ if (ft_events[id]) callback();
+
+#define ft_event0(id, callback) \
+ if (ft_events[id]) callback(id);
+
+#define ft_event1(id, callback, param) \
+ if (ft_events[id]) callback(id, param);
+
+#define ft_event2(id, callback, param, param2) \
+ if (ft_events[id]) callback(id, param, param2);
+
+#define ft_event3(id, callback, p, p2, p3) \
+ if (ft_events[id]) callback(id, p, p2, p3);
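+
+/* Expansion sketch (hypothetical event id and callback): with this default
+ * implementation,
+ *
+ *	ft_event1(MY_EVENT_ID, my_callback, task);
+ *
+ * expands to a plain conditional call
+ *
+ *	if (ft_events[MY_EVENT_ID]) my_callback(MY_EVENT_ID, task);
+ *
+ * whereas the architecture-specific rewriting implementation patches the
+ * call site in the kernel text instead (hence the CONFIG_DEBUG_RODATA
+ * restriction above).
+ */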
+
+#endif /* CONFIG_ARCH_HAS_FEATHER_TRACE */
+
+#endif
diff --git a/include/litmus/fp_common.h b/include/litmus/fp_common.h
new file mode 100644
index 0000000..19356c0
--- /dev/null
+++ b/include/litmus/fp_common.h
@@ -0,0 +1,105 @@
+/* Fixed-priority scheduler support.
+ */
+
+#ifndef __FP_COMMON_H__
+#define __FP_COMMON_H__
+
+#include <litmus/rt_domain.h>
+
+#include <asm/bitops.h>
+
+
+void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+ release_jobs_t release);
+
+int fp_higher_prio(struct task_struct* first,
+ struct task_struct* second);
+
+int fp_ready_order(struct bheap_node* a, struct bheap_node* b);
+
+#define FP_PRIO_BIT_WORDS (LITMUS_MAX_PRIORITY / BITS_PER_LONG)
+
+#if (LITMUS_MAX_PRIORITY % BITS_PER_LONG)
+#error LITMUS_MAX_PRIORITY must be a multiple of BITS_PER_LONG
+#endif
+
+/* bitmask-indexed priority queue */
+struct fp_prio_queue {
+ unsigned long bitmask[FP_PRIO_BIT_WORDS];
+ struct bheap queue[LITMUS_MAX_PRIORITY];
+};
+
+void fp_prio_queue_init(struct fp_prio_queue* q);
+
+static inline void fpq_set(struct fp_prio_queue* q, unsigned int index)
+{
+ unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
+ __set_bit(index % BITS_PER_LONG, word);
+}
+
+static inline void fpq_clear(struct fp_prio_queue* q, unsigned int index)
+{
+ unsigned long *word = q->bitmask + (index / BITS_PER_LONG);
+ __clear_bit(index % BITS_PER_LONG, word);
+}
+
+static inline unsigned int fpq_find(struct fp_prio_queue* q)
+{
+ int i;
+
+ /* loop optimizer should unroll this */
+ for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
+ if (q->bitmask[i])
+ return __ffs(q->bitmask[i]) + i * BITS_PER_LONG;
+
+ return LITMUS_MAX_PRIORITY; /* nothing found */
+}
+
+static inline void fp_prio_add(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
+{
+ BUG_ON(index >= LITMUS_MAX_PRIORITY);
+ BUG_ON(bheap_node_in_heap(tsk_rt(t)->heap_node));
+
+ fpq_set(q, index);
+ bheap_insert(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
+}
+
+static inline void fp_prio_remove(struct fp_prio_queue* q, struct task_struct* t, unsigned int index)
+{
+ BUG_ON(!is_queued(t));
+
+ bheap_delete(fp_ready_order, &q->queue[index], tsk_rt(t)->heap_node);
+ if (likely(bheap_empty(&q->queue[index])))
+ fpq_clear(q, index);
+}
+
+static inline struct task_struct* fp_prio_peek(struct fp_prio_queue* q)
+{
+ unsigned int idx = fpq_find(q);
+ struct bheap_node* hn;
+
+ if (idx < LITMUS_MAX_PRIORITY) {
+ hn = bheap_peek(fp_ready_order, &q->queue[idx]);
+ return bheap2task(hn);
+ } else
+ return NULL;
+}
+
+static inline struct task_struct* fp_prio_take(struct fp_prio_queue* q)
+{
+ unsigned int idx = fpq_find(q);
+ struct bheap_node* hn;
+
+ if (idx < LITMUS_MAX_PRIORITY) {
+ hn = bheap_take(fp_ready_order, &q->queue[idx]);
+ if (likely(bheap_empty(&q->queue[idx])))
+ fpq_clear(q, idx);
+ return bheap2task(hn);
+ } else
+ return NULL;
+}
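+
+/* Usage sketch (hypothetical tasks): a lower index means higher priority,
+ * so fp_prio_take() always dequeues the numerically smallest occupied index:
+ *
+ *	struct fp_prio_queue q;
+ *	fp_prio_queue_init(&q);
+ *	fp_prio_add(&q, high, 1);	// priority 1 (highest)
+ *	fp_prio_add(&q, low, 200);	// priority 200
+ *	struct task_struct *t = fp_prio_take(&q);	// returns 'high'
+ */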
+
+int fp_preemption_needed(struct fp_prio_queue* q, struct task_struct *t);
+
+
+#endif
diff --git a/include/litmus/fpmath.h b/include/litmus/fpmath.h
new file mode 100644
index 0000000..642de98
--- /dev/null
+++ b/include/litmus/fpmath.h
@@ -0,0 +1,147 @@
+#ifndef __FP_MATH_H__
+#define __FP_MATH_H__
+
+#include <linux/math64.h>
+
+#ifndef __KERNEL__
+#include <stdint.h>
+#define abs(x) (((x) < 0) ? -(x) : (x))
+#endif
+
+// Use 64-bit because we want to track things at the nanosecond scale.
+// This can lead to very large numbers.
+typedef int64_t fpbuf_t;
+typedef struct
+{
+ fpbuf_t val;
+} fp_t;
+
+#define FP_SHIFT 10
+#define ROUND_BIT (FP_SHIFT - 1)
+
+#define _fp(x) ((fp_t) {x})
+
+#ifdef __KERNEL__
+static const fp_t LITMUS_FP_ZERO = {.val = 0};
+static const fp_t LITMUS_FP_ONE = {.val = (1 << FP_SHIFT)};
+#endif
+
+static inline fp_t FP(fpbuf_t x)
+{
+ return _fp(((fpbuf_t) x) << FP_SHIFT);
+}
+
+/* divide two integers to obtain a fixed point value */
+static inline fp_t _frac(fpbuf_t a, fpbuf_t b)
+{
+ return _fp(div64_s64(FP(a).val, (b)));
+}
+
+static inline fpbuf_t _point(fp_t x)
+{
+	return (x.val % (1 << FP_SHIFT));
+}
+
+#define fp2str(x) x.val
+/*(x.val >> FP_SHIFT), (x.val % (1 << FP_SHIFT)) */
+#define _FP_ "%ld/1024"
+
+static inline fpbuf_t _floor(fp_t x)
+{
+ return x.val >> FP_SHIFT;
+}
+
+/* FIXME: negative rounding */
+static inline fpbuf_t _round(fp_t x)
+{
+ return _floor(x) + ((x.val >> ROUND_BIT) & 1);
+}
+
+/* multiply two fixed point values */
+static inline fp_t _mul(fp_t a, fp_t b)
+{
+ return _fp((a.val * b.val) >> FP_SHIFT);
+}
+
+static inline fp_t _div(fp_t a, fp_t b)
+{
+#if !defined(__KERNEL__) && !defined(unlikely)
+#define unlikely(x) (x)
+#define DO_UNDEF_UNLIKELY
+#endif
+ /* try not to overflow */
+ if (unlikely( a.val > (2l << ((sizeof(fpbuf_t)*8) - FP_SHIFT)) ))
+ return _fp((a.val / b.val) << FP_SHIFT);
+ else
+ return _fp((a.val << FP_SHIFT) / b.val);
+#ifdef DO_UNDEF_UNLIKELY
+#undef unlikely
+#undef DO_UNDEF_UNLIKELY
+#endif
+}
+
+static inline fp_t _add(fp_t a, fp_t b)
+{
+ return _fp(a.val + b.val);
+}
+
+static inline fp_t _sub(fp_t a, fp_t b)
+{
+ return _fp(a.val - b.val);
+}
+
+static inline fp_t _neg(fp_t x)
+{
+ return _fp(-x.val);
+}
+
+static inline fp_t _abs(fp_t x)
+{
+ return _fp(abs(x.val));
+}
+
+/* works the same as casting float/double to integer */
+static inline fpbuf_t _fp_to_integer(fp_t x)
+{
+ return _floor(_abs(x)) * ((x.val > 0) ? 1 : -1);
+}
+
+static inline fp_t _integer_to_fp(fpbuf_t x)
+{
+ return _frac(x,1);
+}
+
+static inline int _leq(fp_t a, fp_t b)
+{
+ return a.val <= b.val;
+}
+
+static inline int _geq(fp_t a, fp_t b)
+{
+ return a.val >= b.val;
+}
+
+static inline int _lt(fp_t a, fp_t b)
+{
+ return a.val < b.val;
+}
+
+static inline int _gt(fp_t a, fp_t b)
+{
+ return a.val > b.val;
+}
+
+static inline int _eq(fp_t a, fp_t b)
+{
+ return a.val == b.val;
+}
+
+static inline fp_t _max(fp_t a, fp_t b)
+{
+ if (a.val < b.val)
+ return b;
+ else
+ return a;
+}
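+
+/* Worked example with FP_SHIFT = 10, i.e. values in units of 1/1024
+ * (hypothetical locals):
+ *
+ *	fp_t third = _frac(1, 3);		// third.val == 341 (~0.333)
+ *	fp_t six   = _mul(FP(2), FP(3));	// six.val == 6144 == FP(6).val
+ *	fpbuf_t i  = _fp_to_integer(third);	// 0 (truncates toward zero)
+ *	fpbuf_t r  = _round(_frac(2, 3));	// 1 (682/1024 rounds up)
+ */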
+#endif
diff --git a/include/litmus/ftdev.h b/include/litmus/ftdev.h
new file mode 100644
index 0000000..0b95987
--- /dev/null
+++ b/include/litmus/ftdev.h
@@ -0,0 +1,55 @@
+#ifndef _LITMUS_FTDEV_H_
+#define _LITMUS_FTDEV_H_
+
+#include <litmus/feather_trace.h>
+#include <litmus/feather_buffer.h>
+#include <linux/mutex.h>
+#include <linux/cdev.h>
+
+#define FTDEV_ENABLE_CMD 0
+#define FTDEV_DISABLE_CMD 1
+
+struct ftdev;
+
+/* return 0 if buffer can be opened, otherwise -$REASON */
+typedef int (*ftdev_can_open_t)(struct ftdev* dev, unsigned int buf_no);
+/* return 0 on success, otherwise -$REASON */
+typedef int (*ftdev_alloc_t)(struct ftdev* dev, unsigned int buf_no);
+typedef void (*ftdev_free_t)(struct ftdev* dev, unsigned int buf_no);
+/* Let devices handle writes from userspace. No synchronization provided. */
+typedef ssize_t (*ftdev_write_t)(struct ft_buffer* buf, size_t len, const char __user *from);
+
+struct ftdev_event;
+
+struct ftdev_minor {
+ struct ft_buffer* buf;
+ unsigned int readers;
+ struct mutex lock;
+ /* FIXME: filter for authorized events */
+ struct ftdev_event* events;
+ struct device* device;
+ struct ftdev* ftdev;
+};
+
+struct ftdev {
+ dev_t major;
+ struct cdev cdev;
+ struct class* class;
+ const char* name;
+ struct ftdev_minor* minor;
+ unsigned int minor_cnt;
+ ftdev_alloc_t alloc;
+ ftdev_free_t free;
+ ftdev_can_open_t can_open;
+ ftdev_write_t write;
+};
+
+struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size);
+void free_ft_buffer(struct ft_buffer* buf);
+
+int ftdev_init( struct ftdev* ftdev, struct module* owner,
+ const int minor_cnt, const char* name);
+void ftdev_exit(struct ftdev* ftdev);
+int register_ftdev(struct ftdev* ftdev);
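+
+/* Setup sketch (hypothetical single-minor trace device; error handling
+ * elided, and assuming ftdev_init() has set up the minor table): allocate a
+ * buffer for the minor and register the device:
+ *
+ *	static struct ftdev dev;
+ *
+ *	ftdev_init(&dev, THIS_MODULE, 1, "my_trace");
+ *	dev.minor[0].buf = alloc_ft_buffer(4096, sizeof(struct timestamp));
+ *	register_ftdev(&dev);
+ */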
+
+#endif
diff --git a/include/litmus/jobs.h b/include/litmus/jobs.h
new file mode 100644
index 0000000..9bd361e
--- /dev/null
+++ b/include/litmus/jobs.h
@@ -0,0 +1,9 @@
+#ifndef __LITMUS_JOBS_H__
+#define __LITMUS_JOBS_H__
+
+void prepare_for_next_period(struct task_struct *t);
+void release_at(struct task_struct *t, lt_t start);
+long complete_job(void);
+
+#endif
+
diff --git a/include/litmus/litmus.h b/include/litmus/litmus.h
new file mode 100644
index 0000000..875783e
--- /dev/null
+++ b/include/litmus/litmus.h
@@ -0,0 +1,300 @@
+/*
+ * Constant definitions related to
+ * scheduling policy.
+ */
+
+#ifndef _LINUX_LITMUS_H_
+#define _LINUX_LITMUS_H_
+
+#include <litmus/debug_trace.h>
+
+#ifdef CONFIG_RELEASE_MASTER
+extern atomic_t release_master_cpu;
+#endif
+
+/* in_list - is a given list_head queued on some list?
+ */
+static inline int in_list(struct list_head* list)
+{
+ return !( /* case 1: deleted */
+ (list->next == LIST_POISON1 &&
+ list->prev == LIST_POISON2)
+ ||
+ /* case 2: initialized */
+ (list->next == list &&
+ list->prev == list)
+ );
+}
+
+struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
+
+#define NO_CPU 0xffffffff
+
+void litmus_fork(struct task_struct *tsk);
+void litmus_exec(void);
+/* clean up real-time state of a task */
+void exit_litmus(struct task_struct *dead_tsk);
+
+long litmus_admit_task(struct task_struct *tsk);
+void litmus_exit_task(struct task_struct *tsk);
+
+#define is_realtime(t) ((t)->policy == SCHED_LITMUS)
+#define rt_transition_pending(t) \
+ ((t)->rt_param.transition_pending)
+
+#define tsk_rt(t) (&(t)->rt_param)
+
+/* Realtime utility macros */
+#define is_priority_boosted(t) (tsk_rt(t)->priority_boosted)
+#define get_boost_start(t) (tsk_rt(t)->boost_start_time)
+
+/* task_params macros */
+#define get_exec_cost(t) (tsk_rt(t)->task_params.exec_cost)
+#define get_rt_period(t) (tsk_rt(t)->task_params.period)
+#define get_rt_relative_deadline(t) (tsk_rt(t)->task_params.relative_deadline)
+#define get_rt_phase(t) (tsk_rt(t)->task_params.phase)
+#define get_partition(t) (tsk_rt(t)->task_params.cpu)
+#define get_priority(t) (tsk_rt(t)->task_params.priority)
+#define get_class(t) (tsk_rt(t)->task_params.cls)
+
+/* job_param macros */
+#define get_exec_time(t) (tsk_rt(t)->job_params.exec_time)
+#define get_deadline(t) (tsk_rt(t)->job_params.deadline)
+#define get_release(t) (tsk_rt(t)->job_params.release)
+#define get_lateness(t) (tsk_rt(t)->job_params.lateness)
+
+#define is_hrt(t) \
+ (tsk_rt(t)->task_params.cls == RT_CLASS_HARD)
+#define is_srt(t) \
+ (tsk_rt(t)->task_params.cls == RT_CLASS_SOFT)
+#define is_be(t) \
+ (tsk_rt(t)->task_params.cls == RT_CLASS_BEST_EFFORT)
+
+/* Our notion of time within LITMUS: kernel monotonic time. */
+static inline lt_t litmus_clock(void)
+{
+ return ktime_to_ns(ktime_get());
+}
+
+/* A macro to convert from nanoseconds to ktime_t. */
+#define ns_to_ktime(t) ktime_add_ns(ktime_set(0, 0), t)
+
+#define get_domain(t) (tsk_rt(t)->domain)
+
+/* Honor the flag in the preempt_count variable that is set
+ * when scheduling is in progress.
+ */
+#define is_running(t) \
+ ((t)->state == TASK_RUNNING || \
+ task_thread_info(t)->preempt_count & PREEMPT_ACTIVE)
+
+#define is_blocked(t) \
+ (!is_running(t))
+#define is_released(t, now) \
+ (lt_before_eq(get_release(t), now))
+#define is_tardy(t, now) \
+ (lt_before_eq(tsk_rt(t)->job_params.deadline, now))
+
+/* real-time comparison macros */
+#define earlier_deadline(a, b) (lt_before(\
+ (a)->rt_param.job_params.deadline,\
+ (b)->rt_param.job_params.deadline))
+#define earlier_release(a, b) (lt_before(\
+ (a)->rt_param.job_params.release,\
+ (b)->rt_param.job_params.release))
+
+void preempt_if_preemptable(struct task_struct* t, int on_cpu);
+
+#ifdef CONFIG_LITMUS_LOCKING
+void srp_ceiling_block(void);
+#else
+#define srp_ceiling_block() /* nothing */
+#endif
+
+#define bheap2task(hn) ((struct task_struct*) hn->value)
+
+#ifdef CONFIG_NP_SECTION
+
+static inline int is_kernel_np(struct task_struct *t)
+{
+ return tsk_rt(t)->kernel_np;
+}
+
+static inline int is_user_np(struct task_struct *t)
+{
+ return tsk_rt(t)->ctrl_page ? tsk_rt(t)->ctrl_page->sched.np.flag : 0;
+}
+
+static inline void request_exit_np(struct task_struct *t)
+{
+ if (is_user_np(t)) {
+ /* Set the flag that tells user space to call
+ * into the kernel at the end of a critical section. */
+ if (likely(tsk_rt(t)->ctrl_page)) {
+ TRACE_TASK(t, "setting delayed_preemption flag\n");
+ tsk_rt(t)->ctrl_page->sched.np.preempt = 1;
+ }
+ }
+}
+
+static inline void make_np(struct task_struct *t)
+{
+ tsk_rt(t)->kernel_np++;
+}
+
+/* Caller should check if preemption is necessary when
+ * the function returns 0.
+ */
+static inline int take_np(struct task_struct *t)
+{
+ return --tsk_rt(t)->kernel_np;
+}
+
+/* returns 0 if remote CPU needs an IPI to preempt, 1 if no IPI is required */
+static inline int request_exit_np_atomic(struct task_struct *t)
+{
+ union np_flag old, new;
+
+ if (tsk_rt(t)->ctrl_page) {
+ old.raw = tsk_rt(t)->ctrl_page->sched.raw;
+ if (old.np.flag == 0) {
+ /* no longer non-preemptive */
+ return 0;
+ } else if (old.np.preempt) {
+ /* already set, nothing for us to do */
+ return 1;
+ } else {
+ /* non preemptive and flag not set */
+ new.raw = old.raw;
+ new.np.preempt = 1;
+ /* if we get old back, then we atomically set the flag */
+ return cmpxchg(&tsk_rt(t)->ctrl_page->sched.raw, old.raw, new.raw) == old.raw;
+ /* If we raced with a concurrent change, then so be
+ * it. Deliver it by IPI. We don't want an unbounded
+ * retry loop here since tasks might exploit that to
+ * keep the kernel busy indefinitely. */
+ }
+ } else
+ return 0;
+}
+
+#else
+
+static inline int is_kernel_np(struct task_struct* t)
+{
+ return 0;
+}
+
+static inline int is_user_np(struct task_struct* t)
+{
+ return 0;
+}
+
+static inline void request_exit_np(struct task_struct *t)
+{
+ /* request_exit_np() shouldn't be called if !CONFIG_NP_SECTION */
+ BUG();
+}
+
+static inline int request_exit_np_atomic(struct task_struct *t)
+{
+ return 0;
+}
+
+#endif
+
+static inline void clear_exit_np(struct task_struct *t)
+{
+ if (likely(tsk_rt(t)->ctrl_page))
+ tsk_rt(t)->ctrl_page->sched.np.preempt = 0;
+}
+
+static inline int is_np(struct task_struct *t)
+{
+#ifdef CONFIG_SCHED_DEBUG_TRACE
+ int kernel, user;
+ kernel = is_kernel_np(t);
+ user = is_user_np(t);
+ if (kernel || user)
+		TRACE_TASK(t, " is non-preemptive: kernel=%d user=%d\n",
+			   kernel, user);
+ return kernel || user;
+#else
+ return unlikely(is_kernel_np(t) || is_user_np(t));
+#endif
+}
+
+static inline int is_present(struct task_struct* t)
+{
+ return t && tsk_rt(t)->present;
+}
+
+static inline int is_completed(struct task_struct* t)
+{
+ return t && tsk_rt(t)->completed;
+}
+
+
+/* make the unit explicit */
+typedef unsigned long quanta_t;
+
+enum round {
+ FLOOR,
+ CEIL
+};
+
+
+/* Tick period is used to convert ns-specified execution
+ * costs and periods into tick-based equivalents.
+ */
+extern ktime_t tick_period;
+
+static inline quanta_t time2quanta(lt_t time, enum round round)
+{
+ s64 quantum_length = ktime_to_ns(tick_period);
+
+ if (do_div(time, quantum_length) && round == CEIL)
+ time++;
+ return (quanta_t) time;
+}
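+
+/* Example: with a hypothetical 1 ms quantum (tick_period == 1000000 ns),
+ * time2quanta(2500000, FLOOR) == 2 and time2quanta(2500000, CEIL) == 3. */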
+
+/* By how much is this CPU staggered behind CPU 0? */
+u64 cpu_stagger_offset(int cpu);
+
+static inline struct control_page* get_control_page(struct task_struct *t)
+{
+ return tsk_rt(t)->ctrl_page;
+}
+
+static inline int has_control_page(struct task_struct* t)
+{
+ return tsk_rt(t)->ctrl_page != NULL;
+}
+
+
+#ifdef CONFIG_SCHED_OVERHEAD_TRACE
+
+#define TS_SYSCALL_IN_START \
+ if (has_control_page(current)) { \
+ __TS_SYSCALL_IN_START(&get_control_page(current)->ts_syscall_start); \
+ }
+
+#define TS_SYSCALL_IN_END \
+ if (has_control_page(current)) { \
+ uint64_t irqs; \
+ local_irq_disable(); \
+ irqs = get_control_page(current)->irq_count - \
+ get_control_page(current)->irq_syscall_start; \
+ __TS_SYSCALL_IN_END(&irqs); \
+ local_irq_enable(); \
+ }
+
+#else
+
+#define TS_SYSCALL_IN_START
+#define TS_SYSCALL_IN_END
+
+#endif
+
+#endif
diff --git a/include/litmus/litmus_proc.h b/include/litmus/litmus_proc.h
new file mode 100644
index 0000000..6800e72
--- /dev/null
+++ b/include/litmus/litmus_proc.h
@@ -0,0 +1,25 @@
+#include <litmus/sched_plugin.h>
+#include <linux/proc_fs.h>
+
+int __init init_litmus_proc(void);
+void exit_litmus_proc(void);
+
+/*
+ * On success, returns 0 and sets the pointer to the location of the new
+ * proc dir entry, otherwise returns an error code and sets pde to NULL.
+ */
+long make_plugin_proc_dir(struct sched_plugin* plugin,
+ struct proc_dir_entry** pde);
+
+/*
+ * Plugins should deallocate all child proc directory entries before
+ * calling this, to avoid memory leaks.
+ */
+void remove_plugin_proc_dir(struct sched_plugin* plugin);
+
+
+/* Copy at most ksize-1 bytes from ubuf into kbuf, null-terminate kbuf, and
+ * remove a trailing '\n' if present. Returns the number of bytes that were
+ * copied or -EFAULT. */
+int copy_and_chomp(char *kbuf, unsigned long ksize,
+ __user const char* ubuf, unsigned long ulength);
diff --git a/include/litmus/locking.h b/include/litmus/locking.h
new file mode 100644
index 0000000..4d7b870
--- /dev/null
+++ b/include/litmus/locking.h
@@ -0,0 +1,28 @@
+#ifndef LITMUS_LOCKING_H
+#define LITMUS_LOCKING_H
+
+struct litmus_lock_ops;
+
+/* Generic base struct for LITMUS^RT userspace semaphores.
+ * This structure should be embedded in protocol-specific semaphores.
+ */
+struct litmus_lock {
+ struct litmus_lock_ops *ops;
+ int type;
+};
+
+struct litmus_lock_ops {
+ /* Current task tries to obtain / drop a reference to a lock.
+ * Optional methods, allowed by default. */
+ int (*open)(struct litmus_lock*, void* __user);
+ int (*close)(struct litmus_lock*);
+
+ /* Current tries to lock/unlock this lock (mandatory methods). */
+ int (*lock)(struct litmus_lock*);
+ int (*unlock)(struct litmus_lock*);
+
+ /* The lock is no longer being referenced (mandatory method). */
+ void (*deallocate)(struct litmus_lock*);
+};
+
+#endif
diff --git a/include/litmus/preempt.h b/include/litmus/preempt.h
new file mode 100644
index 0000000..380b886
--- /dev/null
+++ b/include/litmus/preempt.h
@@ -0,0 +1,164 @@
+#ifndef LITMUS_PREEMPT_H
+#define LITMUS_PREEMPT_H
+
+#include <linux/types.h>
+#include <linux/cache.h>
+#include <linux/percpu.h>
+#include <asm/atomic.h>
+
+#include <litmus/debug_trace.h>
+
+extern DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
+
+#ifdef CONFIG_PREEMPT_STATE_TRACE
+const char* sched_state_name(int s);
+#define TRACE_STATE(fmt, args...) TRACE("SCHED_STATE " fmt, args)
+#else
+#define TRACE_STATE(fmt, args...) /* ignore */
+#endif
+
+#define VERIFY_SCHED_STATE(x) \
+ do { int __s = get_sched_state(); \
+ if ((__s & (x)) == 0) \
+ TRACE_STATE("INVALID s=0x%x (%s) not " \
+ "in 0x%x (%s) [%s]\n", \
+ __s, sched_state_name(__s), \
+ (x), #x, __FUNCTION__); \
+	} while (0)
+
+#define TRACE_SCHED_STATE_CHANGE(x, y, cpu) \
+ TRACE_STATE("[P%d] 0x%x (%s) -> 0x%x (%s)\n", \
+ cpu, (x), sched_state_name(x), \
+ (y), sched_state_name(y))
+
+
+typedef enum scheduling_state {
+ TASK_SCHEDULED = (1 << 0), /* The currently scheduled task is the one that
+ * should be scheduled, and the processor does not
+ * plan to invoke schedule(). */
+ SHOULD_SCHEDULE = (1 << 1), /* A remote processor has determined that the
+ * processor should reschedule, but this has not
+ * been communicated yet (IPI still pending). */
+ WILL_SCHEDULE = (1 << 2), /* The processor has noticed that it has to
+ * reschedule and will do so shortly. */
+ TASK_PICKED = (1 << 3), /* The processor is currently executing schedule(),
+ * has selected a new task to schedule, but has not
+ * yet performed the actual context switch. */
+ PICKED_WRONG_TASK = (1 << 4), /* The processor has not yet performed the context
+ * switch, but a remote processor has already
+ * determined that a higher-priority task became
+ * eligible after the task was picked. */
+} sched_state_t;
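+
+/* Sketch of the common local reschedule implied by the states above:
+ *
+ *	TASK_SCHEDULED -> WILL_SCHEDULE   (a reschedule is requested)
+ *	WILL_SCHEDULE  -> TASK_PICKED     (sched_state_task_picked())
+ *	TASK_PICKED    -> TASK_SCHEDULED  (sched_state_validate_switch())
+ *
+ * A remote CPU that notices a stale pick moves TASK_PICKED to
+ * PICKED_WRONG_TASK; sched_state_validate_switch() then returns 1 and the
+ * local CPU goes back to WILL_SCHEDULE and reschedules.
+ */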
+
+static inline sched_state_t get_sched_state_on(int cpu)
+{
+ return atomic_read(&per_cpu(resched_state, cpu));
+}
+
+static inline sched_state_t get_sched_state(void)
+{
+ return atomic_read(&__get_cpu_var(resched_state));
+}
+
+static inline int is_in_sched_state(int possible_states)
+{
+ return get_sched_state() & possible_states;
+}
+
+static inline int cpu_is_in_sched_state(int cpu, int possible_states)
+{
+ return get_sched_state_on(cpu) & possible_states;
+}
+
+static inline void set_sched_state(sched_state_t s)
+{
+ TRACE_SCHED_STATE_CHANGE(get_sched_state(), s, smp_processor_id());
+ atomic_set(&__get_cpu_var(resched_state), s);
+}
+
+static inline int sched_state_transition(sched_state_t from, sched_state_t to)
+{
+ sched_state_t old_state;
+
+ old_state = atomic_cmpxchg(&__get_cpu_var(resched_state), from, to);
+ if (old_state == from) {
+ TRACE_SCHED_STATE_CHANGE(from, to, smp_processor_id());
+ return 1;
+ } else
+ return 0;
+}
+
+static inline int sched_state_transition_on(int cpu,
+ sched_state_t from,
+ sched_state_t to)
+{
+ sched_state_t old_state;
+
+ old_state = atomic_cmpxchg(&per_cpu(resched_state, cpu), from, to);
+ if (old_state == from) {
+ TRACE_SCHED_STATE_CHANGE(from, to, cpu);
+ return 1;
+ } else
+ return 0;
+}
+
+/* Plugins must call this function after they have decided which job to
+ * schedule next. IMPORTANT: this function must be called while still holding
+ * the lock that is used to serialize scheduling decisions.
+ *
+ * (Ideally, we would like to use runqueue locks for this purpose, but that
+ * would lead to deadlocks with the migration code.)
+ */
+static inline void sched_state_task_picked(void)
+{
+ VERIFY_SCHED_STATE(WILL_SCHEDULE);
+
+	/* WILL_SCHEDULE has only a local transition => simple store is ok */
+ set_sched_state(TASK_PICKED);
+}
+
+static inline void sched_state_entered_schedule(void)
+{
+ /* Update state for the case that we entered schedule() not due to
+ * set_tsk_need_resched() */
+ set_sched_state(WILL_SCHEDULE);
+}
+
+/* Called by schedule() to check if the scheduling decision is still valid
+ * after a context switch. Returns 1 if the CPU needs to reschedule. */
+static inline int sched_state_validate_switch(void)
+{
+ int left_state_ok = 0;
+
+ VERIFY_SCHED_STATE(PICKED_WRONG_TASK | TASK_PICKED);
+
+ if (is_in_sched_state(TASK_PICKED)) {
+ /* Might be good; let's try to transition out of this
+ * state. This must be done atomically since remote processors
+ * may try to change the state, too. */
+ left_state_ok = sched_state_transition(TASK_PICKED, TASK_SCHEDULED);
+ }
+
+ if (!left_state_ok) {
+ /* We raced with a higher-priority task arrival => not
+ * valid. The CPU needs to reschedule. */
+ set_sched_state(WILL_SCHEDULE);
+ return 1;
+ } else
+ return 0;
+}
+
+/* State transition events. See litmus/preempt.c for details. */
+void sched_state_will_schedule(struct task_struct* tsk);
+void sched_state_ipi(void);
+/* Cause a CPU (remote or local) to reschedule. */
+void litmus_reschedule(int cpu);
+void litmus_reschedule_local(void);
+
+#ifdef CONFIG_DEBUG_KERNEL
+void sched_state_plugin_check(void);
+#else
+#define sched_state_plugin_check() /* no check */
+#endif
+
+#endif
diff --git a/include/litmus/rt_domain.h b/include/litmus/rt_domain.h
new file mode 100644
index 0000000..ac24929
--- /dev/null
+++ b/include/litmus/rt_domain.h
@@ -0,0 +1,182 @@
+/* CLEANUP: Add comments and make it less messy.
+ *
+ */
+
+#ifndef __UNC_RT_DOMAIN_H__
+#define __UNC_RT_DOMAIN_H__
+
+#include <litmus/bheap.h>
+
+#define RELEASE_QUEUE_SLOTS 127 /* prime */
+
+struct _rt_domain;
+
+typedef int (*check_resched_needed_t)(struct _rt_domain *rt);
+typedef void (*release_jobs_t)(struct _rt_domain *rt, struct bheap* tasks);
+
+struct release_queue {
+ /* each slot maintains a list of release heaps sorted
+ * by release time */
+ struct list_head slot[RELEASE_QUEUE_SLOTS];
+};
+
+typedef struct _rt_domain {
+ /* runnable rt tasks are in here */
+ raw_spinlock_t ready_lock;
+ struct bheap ready_queue;
+
+ /* real-time tasks waiting for release are in here */
+ raw_spinlock_t release_lock;
+ struct release_queue release_queue;
+
+#ifdef CONFIG_RELEASE_MASTER
+ int release_master;
+#endif
+
+ /* for moving tasks to the release queue */
+ raw_spinlock_t tobe_lock;
+ struct list_head tobe_released;
+
+ /* how do we check if we need to kick another CPU? */
+ check_resched_needed_t check_resched;
+
+ /* how do we release jobs? */
+ release_jobs_t release_jobs;
+
+ /* how are tasks ordered in the ready queue? */
+ bheap_prio_t order;
+} rt_domain_t;
+
+struct release_heap {
+ /* list_head for per-time-slot list */
+ struct list_head list;
+ lt_t release_time;
+ /* all tasks to be released at release_time */
+ struct bheap heap;
+ /* used to trigger the release */
+ struct hrtimer timer;
+
+#ifdef CONFIG_RELEASE_MASTER
+ /* used to delegate releases */
+ struct hrtimer_start_on_info info;
+#endif
+ /* required for the timer callback */
+ rt_domain_t* dom;
+};
+
+
+static inline struct task_struct* __next_ready(rt_domain_t* rt)
+{
+ struct bheap_node *hn = bheap_peek(rt->order, &rt->ready_queue);
+ if (hn)
+ return bheap2task(hn);
+ else
+ return NULL;
+}
+
+void rt_domain_init(rt_domain_t *rt, bheap_prio_t order,
+ check_resched_needed_t check,
+		    release_jobs_t release);
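+
+/* Setup sketch (hypothetical comparator and callbacks): a plugin typically
+ * initializes one domain per partition or cluster, e.g.
+ *
+ *	static rt_domain_t my_domain;
+ *
+ *	rt_domain_init(&my_domain, my_ready_order,
+ *		       my_check_resched, my_release_jobs);
+ *
+ * where my_ready_order is a bheap_prio_t comparator and the two callbacks
+ * implement the plugin's preemption check and job release policy.
+ */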
+
+void __add_ready(rt_domain_t* rt, struct task_struct *new);
+void __merge_ready(rt_domain_t* rt, struct bheap *tasks);
+void __add_release(rt_domain_t* rt, struct task_struct *task);
+
+static inline struct task_struct* __take_ready(rt_domain_t* rt)
+{
+ struct bheap_node* hn = bheap_take(rt->order, &rt->ready_queue);
+ if (hn)
+ return bheap2task(hn);
+ else
+ return NULL;
+}
+
+static inline struct task_struct* __peek_ready(rt_domain_t* rt)
+{
+ struct bheap_node* hn = bheap_peek(rt->order, &rt->ready_queue);
+ if (hn)
+ return bheap2task(hn);
+ else
+ return NULL;
+}
+
+static inline int is_queued(struct task_struct *t)
+{
+ BUG_ON(!tsk_rt(t)->heap_node);
+ return bheap_node_in_heap(tsk_rt(t)->heap_node);
+}
+
+static inline void remove(rt_domain_t* rt, struct task_struct *t)
+{
+ bheap_delete(rt->order, &rt->ready_queue, tsk_rt(t)->heap_node);
+}
+
+static inline void add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+ unsigned long flags;
+ /* first we need the write lock for rt_ready_queue */
+ raw_spin_lock_irqsave(&rt->ready_lock, flags);
+ __add_ready(rt, new);
+ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+}
+
+static inline void merge_ready(rt_domain_t* rt, struct bheap* tasks)
+{
+ unsigned long flags;
+ raw_spin_lock_irqsave(&rt->ready_lock, flags);
+ __merge_ready(rt, tasks);
+ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+}
+
+static inline struct task_struct* take_ready(rt_domain_t* rt)
+{
+ unsigned long flags;
+ struct task_struct* ret;
+ /* first we need the write lock for rt_ready_queue */
+ raw_spin_lock_irqsave(&rt->ready_lock, flags);
+ ret = __take_ready(rt);
+ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+ return ret;
+}
+
+
+static inline void add_release(rt_domain_t* rt, struct task_struct *task)
+{
+ unsigned long flags;
+ raw_spin_lock_irqsave(&rt->tobe_lock, flags);
+ __add_release(rt, task);
+ raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
+}
+
+#ifdef CONFIG_RELEASE_MASTER
+void __add_release_on(rt_domain_t* rt, struct task_struct *task,
+ int target_cpu);
+
+static inline void add_release_on(rt_domain_t* rt,
+ struct task_struct *task,
+ int target_cpu)
+{
+ unsigned long flags;
+ raw_spin_lock_irqsave(&rt->tobe_lock, flags);
+ __add_release_on(rt, task, target_cpu);
+ raw_spin_unlock_irqrestore(&rt->tobe_lock, flags);
+}
+#endif
+
+static inline int __jobs_pending(rt_domain_t* rt)
+{
+ return !bheap_empty(&rt->ready_queue);
+}
+
+static inline int jobs_pending(rt_domain_t* rt)
+{
+ unsigned long flags;
+ int ret;
+ /* first we need the write lock for rt_ready_queue */
+ raw_spin_lock_irqsave(&rt->ready_lock, flags);
+ ret = !bheap_empty(&rt->ready_queue);
+ raw_spin_unlock_irqrestore(&rt->ready_lock, flags);
+ return ret;
+}
+
+#endif
diff --git a/include/litmus/rt_param.h b/include/litmus/rt_param.h
new file mode 100644
index 0000000..4cd06dd
--- /dev/null
+++ b/include/litmus/rt_param.h
@@ -0,0 +1,253 @@
+/*
+ * Definitions of the LITMUS^RT real-time task parameters.
+ *
+ */
+#ifndef _LINUX_RT_PARAM_H_
+#define _LINUX_RT_PARAM_H_
+
+/* Litmus time type. */
+typedef unsigned long long lt_t;
+
+static inline int lt_after(lt_t a, lt_t b)
+{
+ return ((long long) b) - ((long long) a) < 0;
+}
+#define lt_before(a, b) lt_after(b, a)
+
+static inline int lt_after_eq(lt_t a, lt_t b)
+{
+ return ((long long) a) - ((long long) b) >= 0;
+}
+#define lt_before_eq(a, b) lt_after_eq(b, a)
+
+/* different types of clients */
+typedef enum {
+ RT_CLASS_HARD,
+ RT_CLASS_SOFT,
+ RT_CLASS_BEST_EFFORT
+} task_class_t;
+
+typedef enum {
+ NO_ENFORCEMENT, /* job may overrun unhindered */
+ QUANTUM_ENFORCEMENT, /* budgets are only checked on quantum boundaries */
+ PRECISE_ENFORCEMENT /* budgets are enforced with hrtimers */
+} budget_policy_t;
+
+/* We use the priority interpretation "lower index == higher priority",
+ * which is common in fixed-priority schedulability analysis papers.
+ * So, a numerically lower priority value implies higher scheduling priority,
+ * with priority 1 being the highest priority. Priority 0 is reserved for
+ * priority boosting. LITMUS_MAX_PRIORITY bounds the range of valid
+ * priority values.
+ */
+
+#define LITMUS_MAX_PRIORITY 512
+#define LITMUS_HIGHEST_PRIORITY 1
+#define LITMUS_LOWEST_PRIORITY (LITMUS_MAX_PRIORITY - 1)
+
+/* Provide generic comparison macros for userspace,
+ * in case we change this later. */
+#define litmus_higher_fixed_prio(a, b) (a < b)
+#define litmus_lower_fixed_prio(a, b) (a > b)
+#define litmus_is_valid_fixed_prio(p) \
+ ((p) >= LITMUS_HIGHEST_PRIORITY && \
+ (p) <= LITMUS_LOWEST_PRIORITY)
+
+struct rt_task {
+ lt_t exec_cost;
+ lt_t period;
+ lt_t relative_deadline;
+ lt_t phase;
+ unsigned int cpu;
+ unsigned int priority;
+ task_class_t cls;
+ budget_policy_t budget_policy; /* ignored by pfair */
+};
+
+union np_flag {
+ uint64_t raw;
+ struct {
+ /* Is the task currently in a non-preemptive section? */
+ uint64_t flag:31;
+ /* Should the task call into the scheduler? */
+ uint64_t preempt:1;
+ } np;
+};
+
+/* The definition of the data that is shared between the kernel and real-time
+ * tasks via a shared page (see litmus/ctrldev.c).
+ *
+ * WARNING: User space can write to this, so don't trust
+ * the correctness of the fields!
+ *
+ * This serves two purposes: to enable efficient signaling
+ * of non-preemptive sections (user->kernel) and
+ * delayed preemptions (kernel->user), and to export
+ * some real-time relevant statistics such as preemption and
+ * migration data to user space. (We can't use a device to export
+ * statistics because we want to avoid system call overhead when
+ * determining preemption/migration overheads.)
+ */
+struct control_page {
+	/* This flag is used by userspace to communicate non-preemptive
+	 * sections. */
+ volatile union np_flag sched;
+
+ volatile uint64_t irq_count; /* Incremented by the kernel each time an IRQ is
+ * handled. */
+
+ /* Locking overhead tracing: userspace records here the time stamp
+ * and IRQ counter prior to starting the system call. */
+ uint64_t ts_syscall_start; /* Feather-Trace cycles */
+ uint64_t irq_syscall_start; /* Snapshot of irq_count when the syscall
+ * started. */
+
+ /* to be extended */
+};
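+
+/* Userspace-side sketch (hypothetical 'ctrl' pointer obtained by mapping
+ * the control device): signaling a non-preemptive section and honoring a
+ * delayed preemption request; the exact call used to re-enter the kernel
+ * is up to the userspace library:
+ *
+ *	ctrl->sched.np.flag = 1;	// enter non-preemptive section
+ *	// ... critical section ...
+ *	ctrl->sched.np.flag = 0;	// leave non-preemptive section
+ *	if (ctrl->sched.np.preempt)
+ *		call_into_kernel();	// e.g. yield, so the kernel can preempt
+ */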
+
+/* Expected offsets within the control page. */
+
+#define LITMUS_CP_OFFSET_SCHED 0
+#define LITMUS_CP_OFFSET_IRQ_COUNT 8
+#define LITMUS_CP_OFFSET_TS_SC_START 16
+#define LITMUS_CP_OFFSET_IRQ_SC_START 24
+
+/* don't export internal data structures to user space (liblitmus) */
+#ifdef __KERNEL__
+
+struct _rt_domain;
+struct bheap_node;
+struct release_heap;
+
+struct rt_job {
+	/* Time instant the job was or will be released. */
+ lt_t release;
+ /* What is the current deadline? */
+ lt_t deadline;
+
+ /* How much service has this job received so far? */
+ lt_t exec_time;
+
+	/* By how much did the prior job miss its deadline?
+ * Value differs from tardiness in that lateness may
+ * be negative (when job finishes before its deadline).
+ */
+ long long lateness;
+
+	/* Which job is this? This is used to let user space
+ * specify which job to wait for, which is important if jobs
+ * overrun. If we just call sys_sleep_next_period() then we
+ * will unintentionally miss jobs after an overrun.
+ *
+ * Increase this sequence number when a job is released.
+ */
+ unsigned int job_no;
+};
+
+struct pfair_param;
+
+/* RT task parameters for scheduling extensions
+ * These parameters are inherited during clone and therefore must
+ * be explicitly set up before the task set is launched.
+ */
+struct rt_param {
+ /* is the task sleeping? */
+ unsigned int flags:8;
+
+ /* do we need to check for srp blocking? */
+ unsigned int srp_non_recurse:1;
+
+ /* is the task present? (true if it can be scheduled) */
+ unsigned int present:1;
+
+ /* has the task completed? */
+ unsigned int completed:1;
+
+#ifdef CONFIG_LITMUS_LOCKING
+ /* Is the task being priority-boosted by a locking protocol? */
+ unsigned int priority_boosted:1;
+ /* If so, when did this start? */
+ lt_t boost_start_time;
+#endif
+
+ /* user controlled parameters */
+ struct rt_task task_params;
+
+ /* timing parameters */
+ struct rt_job job_params;
+
+ /* task representing the current "inherited" task
+ * priority, assigned by inherit_priority and
+ * return priority in the scheduler plugins.
+ * could point to self if PI does not result in
+ * an increased task priority.
+ */
+ struct task_struct* inh_task;
+
+#ifdef CONFIG_NP_SECTION
+ /* For the FMLP under PSN-EDF, it is required to make the task
+ * non-preemptive from kernel space. In order not to interfere with
+ * user space, this counter indicates the kernel space np setting.
+ * kernel_np > 0 => task is non-preemptive
+ */
+ unsigned int kernel_np;
+#endif
+
+ /* This field can be used by plugins to store where the task
+ * is currently scheduled. It is the responsibility of the
+ * plugin to avoid race conditions.
+ *
+	 * This is used by GSN-EDF and PFAIR.
+ */
+ volatile int scheduled_on;
+
+ /* Is the stack of the task currently in use? This is updated by
+ * the LITMUS core.
+ *
+ * Be careful to avoid deadlocks!
+ */
+ volatile int stack_in_use;
+
+ /* This field can be used by plugins to store where the task
+ * is currently linked. It is the responsibility of the plugin
+ * to avoid race conditions.
+ *
+ * Used by GSN-EDF.
+ */
+ volatile int linked_on;
+
+ /* PFAIR/PD^2 state. Allocated on demand. */
+ struct pfair_param* pfair;
+
+ /* Fields saved before BE->RT transition.
+ */
+ int old_policy;
+ int old_prio;
+
+ /* ready queue for this task */
+ struct _rt_domain* domain;
+
+ /* heap element for this task
+ *
+ * Warning: Don't statically allocate this node. The heap
+ * implementation swaps these between tasks, thus after
+ * dequeuing from a heap you may end up with a different node
+	 * than the one you had when enqueuing the task. For the same
+ * reason, don't obtain and store references to this node
+ * other than this pointer (which is updated by the heap
+ * implementation).
+ */
+ struct bheap_node* heap_node;
+ struct release_heap* rel_heap;
+
+ /* Used by rt_domain to queue task in release list.
+ */
+ struct list_head list;
+
+ /* Pointer to the page shared between userspace and kernel. */
+ struct control_page * ctrl_page;
+};
+
+#endif
+
+#endif
diff --git a/include/litmus/sched_plugin.h b/include/litmus/sched_plugin.h
new file mode 100644
index 0000000..1546ab7
--- /dev/null
+++ b/include/litmus/sched_plugin.h
@@ -0,0 +1,113 @@
+/*
+ * Definition of the scheduler plugin interface.
+ *
+ */
+#ifndef _LINUX_SCHED_PLUGIN_H_
+#define _LINUX_SCHED_PLUGIN_H_
+
+#include <linux/sched.h>
+
+#ifdef CONFIG_LITMUS_LOCKING
+#include <litmus/locking.h>
+#endif
+
+/************************ setup/tear down ********************/
+
+typedef long (*activate_plugin_t) (void);
+typedef long (*deactivate_plugin_t) (void);
+
+
+
+/********************* scheduler invocation ******************/
+
+/* Plugin-specific realtime tick handler */
+typedef void (*scheduler_tick_t) (struct task_struct *cur);
+/* The plugin's main scheduling decision function */
+typedef struct task_struct* (*schedule_t)(struct task_struct * prev);
+/* Clean up after the task switch has occurred.
+ * This function is called after every (even non-rt) task switch.
+ */
+typedef void (*finish_switch_t)(struct task_struct *prev);
+
+
+/********************* task state changes ********************/
+
+/* Called to setup a new real-time task.
+ * Release the first job, enqueue, etc.
+ * Task may already be running.
+ */
+typedef void (*task_new_t) (struct task_struct *task,
+ int on_rq,
+ int running);
+
+/* Called to re-introduce a task after blocking.
+ * Can potentially be called multiple times.
+ */
+typedef void (*task_wake_up_t) (struct task_struct *task);
+/* called to notify the plugin of a blocking real-time task
+ * it will only be called for real-time tasks and before schedule is called */
+typedef void (*task_block_t) (struct task_struct *task);
+/* Called when a real-time task exits or changes to a different scheduling
+ * class.
+ * Free any allocated resources
+ */
+typedef void (*task_exit_t) (struct task_struct *);
+
+#ifdef CONFIG_LITMUS_LOCKING
+/* Called when the current task attempts to create a new lock of a given
+ * protocol type. */
+typedef long (*allocate_lock_t) (struct litmus_lock **lock, int type,
+ void* __user config);
+#endif
+
+
+/********************* sys call backends ********************/
+/* This function causes the caller to sleep until the next release */
+typedef long (*complete_job_t) (void);
+
+typedef long (*admit_task_t)(struct task_struct* tsk);
+
+typedef void (*release_at_t)(struct task_struct *t, lt_t start);
+
+struct sched_plugin {
+ struct list_head list;
+ /* basic info */
+ char *plugin_name;
+
+ /* setup */
+ activate_plugin_t activate_plugin;
+ deactivate_plugin_t deactivate_plugin;
+
+ /* scheduler invocation */
+ scheduler_tick_t tick;
+ schedule_t schedule;
+ finish_switch_t finish_switch;
+
+ /* syscall backend */
+ complete_job_t complete_job;
+ release_at_t release_at;
+
+ /* task state changes */
+ admit_task_t admit_task;
+
+ task_new_t task_new;
+ task_wake_up_t task_wake_up;
+ task_block_t task_block;
+ task_exit_t task_exit;
+
+#ifdef CONFIG_LITMUS_LOCKING
+ /* locking protocols */
+ allocate_lock_t allocate_lock;
+#endif
+} __attribute__ ((__aligned__(SMP_CACHE_BYTES)));
+
+
+extern struct sched_plugin *litmus;
+
+int register_sched_plugin(struct sched_plugin* plugin);
+struct sched_plugin* find_sched_plugin(const char* name);
+int print_sched_plugins(char* buf, int max);
+
+extern struct sched_plugin linux_sched_plugin;
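+
+/* Registration sketch (hypothetical plugin; callbacks and module init
+ * elided):
+ *
+ *	static struct sched_plugin my_plugin = {
+ *		.plugin_name  = "MY-PLUGIN",
+ *		.schedule     = my_schedule,
+ *		.complete_job = complete_job,
+ *		.admit_task   = my_admit_task,
+ *		.task_new     = my_task_new,
+ *		.task_wake_up = my_task_wake_up,
+ *		.task_block   = my_task_block,
+ *		.task_exit    = my_task_exit,
+ *	};
+ *
+ *	register_sched_plugin(&my_plugin);
+ */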
+
+#endif
diff --git a/include/litmus/sched_trace.h b/include/litmus/sched_trace.h
new file mode 100644
index 0000000..82bde82
--- /dev/null
+++ b/include/litmus/sched_trace.h
@@ -0,0 +1,259 @@
+/*
+ * sched_trace.h -- record scheduler events to a byte stream for offline analysis.
+ */
+#ifndef _LINUX_SCHED_TRACE_H_
+#define _LINUX_SCHED_TRACE_H_
+
+/* all times in nanoseconds */
+
+struct st_trace_header {
+ u8 type; /* Of what type is this record? */
+ u8 cpu; /* On which CPU was it recorded? */
+ u16 pid; /* PID of the task. */
+ u32 job; /* The job sequence number. */
+};
+
+#define ST_NAME_LEN 16
+struct st_name_data {
+ char cmd[ST_NAME_LEN];/* The name of the executable of this process. */
+};
+
+struct st_param_data { /* regular params */
+ u32 wcet;
+ u32 period;
+ u32 phase;
+ u8 partition;
+ u8 class;
+ u8 __unused[2];
+};
+
+struct st_release_data {	/* A job was/is going to be released. */
+ u64 release; /* What's the release time? */
+ u64 deadline; /* By when must it finish? */
+};
+
+struct st_assigned_data {	/* A job was assigned to a CPU. */
+ u64 when;
+ u8 target; /* Where should it execute? */
+ u8 __unused[7];
+};
+
+struct st_switch_to_data { /* A process was switched to on a given CPU. */
+ u64 when; /* When did this occur? */
+ u32 exec_time; /* Time the current job has executed. */
+ u8 __unused[4];
+
+};
+
+struct st_switch_away_data { /* A process was switched away from on a given CPU. */
+ u64 when;
+ u64 exec_time;
+};
+
+struct st_completion_data { /* A job completed. */
+ u64 when;
+ u8 forced:1; /* Set to 1 if job overran and kernel advanced to the
+ * next task automatically; set to 0 otherwise.
+ */
+ u8 __uflags:7;
+ u8 __unused[7];
+};
+
+struct st_block_data { /* A task blocks. */
+ u64 when;
+ u64 __unused;
+};
+
+struct st_resume_data { /* A task resumes. */
+ u64 when;
+ u64 __unused;
+};
+
+struct st_action_data {
+ u64 when;
+ u8 action;
+ u8 __unused[7];
+};
+
+struct st_sys_release_data {
+ u64 when;
+ u64 release;
+};
+
+#define DATA(x) struct st_ ## x ## _data x;
+
+typedef enum {
+ ST_NAME = 1, /* Start at one, so that we can spot
+ * uninitialized records. */
+ ST_PARAM,
+ ST_RELEASE,
+ ST_ASSIGNED,
+ ST_SWITCH_TO,
+ ST_SWITCH_AWAY,
+ ST_COMPLETION,
+ ST_BLOCK,
+ ST_RESUME,
+ ST_ACTION,
+ ST_SYS_RELEASE
+} st_event_record_type_t;
+
+struct st_event_record {
+ struct st_trace_header hdr;
+ union {
+ u64 raw[2];
+
+ DATA(name);
+ DATA(param);
+ DATA(release);
+ DATA(assigned);
+ DATA(switch_to);
+ DATA(switch_away);
+ DATA(completion);
+ DATA(block);
+ DATA(resume);
+ DATA(action);
+ DATA(sys_release);
+ } data;
+};
+
+#undef DATA
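+
+/* Size check: the header packs into 8 bytes (u8 + u8 + u16 + u32) and every
+ * payload variant fits into the 16-byte u64 raw[2] union, so each trace
+ * record occupies 24 bytes. */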
+
+#ifdef __KERNEL__
+
+#include <linux/sched.h>
+#include <litmus/feather_trace.h>
+
+#ifdef CONFIG_SCHED_TASK_TRACE
+
+#define SCHED_TRACE(id, callback, task) \
+ ft_event1(id, callback, task)
+#define SCHED_TRACE2(id, callback, task, xtra) \
+ ft_event2(id, callback, task, xtra)
+
+/* provide prototypes; needed on sparc64 */
+#ifndef NO_TASK_TRACE_DECLS
+feather_callback void do_sched_trace_task_name(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_task_param(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_task_release(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_task_switch_to(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_task_switch_away(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_task_completion(unsigned long id,
+ struct task_struct* task,
+ unsigned long forced);
+feather_callback void do_sched_trace_task_block(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_task_resume(unsigned long id,
+ struct task_struct* task);
+feather_callback void do_sched_trace_action(unsigned long id,
+ struct task_struct* task,
+ unsigned long action);
+feather_callback void do_sched_trace_sys_release(unsigned long id,
+ lt_t* start);
+
+#endif
+
+#else
+
+#define SCHED_TRACE(id, callback, task) /* no tracing */
+#define SCHED_TRACE2(id, callback, task, xtra) /* no tracing */
+
+#endif
+
+#ifdef CONFIG_SCHED_LITMUS_TRACEPOINT
+
+#include <trace/events/litmus.h>
+
+#else
+
+/* Override trace macros to actually do nothing */
+#define trace_litmus_task_param(t)
+#define trace_litmus_task_release(t)
+#define trace_litmus_switch_to(t)
+#define trace_litmus_switch_away(prev)
+#define trace_litmus_task_completion(t, forced)
+#define trace_litmus_task_block(t)
+#define trace_litmus_task_resume(t)
+#define trace_litmus_sys_release(start)
+
+#endif
+
+
+#define SCHED_TRACE_BASE_ID 500
+
+
+#define sched_trace_task_name(t) \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 1, \
+ do_sched_trace_task_name, t)
+
+#define sched_trace_task_param(t) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 2, \
+ do_sched_trace_task_param, t); \
+ trace_litmus_task_param(t); \
+ } while (0)
+
+#define sched_trace_task_release(t) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 3, \
+ do_sched_trace_task_release, t); \
+ trace_litmus_task_release(t); \
+ } while (0)
+
+#define sched_trace_task_switch_to(t) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 4, \
+ do_sched_trace_task_switch_to, t); \
+ trace_litmus_switch_to(t); \
+ } while (0)
+
+#define sched_trace_task_switch_away(t) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 5, \
+ do_sched_trace_task_switch_away, t); \
+ trace_litmus_switch_away(t); \
+ } while (0)
+
+#define sched_trace_task_completion(t, forced) \
+ do { \
+ SCHED_TRACE2(SCHED_TRACE_BASE_ID + 6, \
+ do_sched_trace_task_completion, t, \
+ (unsigned long) forced); \
+ trace_litmus_task_completion(t, forced); \
+ } while (0)
+
+#define sched_trace_task_block(t) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 7, \
+ do_sched_trace_task_block, t); \
+ trace_litmus_task_block(t); \
+ } while (0)
+
+#define sched_trace_task_resume(t) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 8, \
+ do_sched_trace_task_resume, t); \
+ trace_litmus_task_resume(t); \
+ } while (0)
+
+#define sched_trace_action(t, action) \
+ SCHED_TRACE2(SCHED_TRACE_BASE_ID + 9, \
+ do_sched_trace_action, t, (unsigned long) action);
+
+/* 'when' is a pointer; it does not need an explicit cast to unsigned long */
+#define sched_trace_sys_release(when) \
+ do { \
+ SCHED_TRACE(SCHED_TRACE_BASE_ID + 10, \
+ do_sched_trace_sys_release, when); \
+ trace_litmus_sys_release(when); \
+ } while (0)
+
+#define sched_trace_quantum_boundary() /* NOT IMPLEMENTED */
+
+#endif /* __KERNEL__ */
+
+#endif
diff --git a/include/litmus/srp.h b/include/litmus/srp.h
new file mode 100644
index 0000000..c9a4552
--- /dev/null
+++ b/include/litmus/srp.h
@@ -0,0 +1,28 @@
+#ifndef LITMUS_SRP_H
+#define LITMUS_SRP_H
+
+struct srp_semaphore;
+
+struct srp_priority {
+ struct list_head list;
+ unsigned int priority;
+ pid_t pid;
+};
+#define list2prio(l) list_entry(l, struct srp_priority, list)
+
+/* struct for uniprocessor SRP "semaphore" */
+struct srp_semaphore {
+ struct litmus_lock litmus_lock;
+ struct srp_priority ceiling;
+ struct task_struct* owner;
+ int cpu; /* cpu associated with this "semaphore" and resource */
+};
+
+/* map a task to its SRP preemption level priority */
+typedef unsigned int (*srp_prioritization_t)(struct task_struct* t);
+/* Must be updated by each plugin that uses SRP.*/
+extern srp_prioritization_t get_srp_prio;
+
+struct srp_semaphore* allocate_srp_semaphore(void);
+
+#endif
diff --git a/include/litmus/trace.h b/include/litmus/trace.h
new file mode 100644
index 0000000..8ad4966
--- /dev/null
+++ b/include/litmus/trace.h
@@ -0,0 +1,145 @@
+#ifndef _SYS_TRACE_H_
+#define _SYS_TRACE_H_
+
+#ifdef CONFIG_SCHED_OVERHEAD_TRACE
+
+
+#include <litmus/feather_trace.h>
+#include <litmus/feather_buffer.h>
+
+
+/*********************** TIMESTAMPS ************************/
+
+enum task_type_marker {
+ TSK_BE,
+ TSK_RT,
+ TSK_UNKNOWN
+};
+
+struct timestamp {
+ uint64_t timestamp:48;
+ uint64_t pid:16;
+ uint32_t seq_no;
+ uint8_t cpu;
+ uint8_t event;
+ uint8_t task_type:2;
+ uint8_t irq_flag:1;
+ uint8_t irq_count:5;
+};
+
+/* tracing callbacks */
+feather_callback void save_timestamp(unsigned long event);
+feather_callback void save_timestamp_def(unsigned long event, unsigned long type);
+feather_callback void save_timestamp_task(unsigned long event, unsigned long t_ptr);
+feather_callback void save_timestamp_cpu(unsigned long event, unsigned long cpu);
+feather_callback void save_task_latency(unsigned long event, unsigned long when_ptr);
+feather_callback void save_timestamp_time(unsigned long event, unsigned long time_ptr);
+feather_callback void save_timestamp_irq(unsigned long event, unsigned long irq_count_ptr);
+feather_callback void save_timestamp_hide_irq(unsigned long event);
+
+#define TIMESTAMP(id) ft_event0(id, save_timestamp)
+
+#define DTIMESTAMP(id, def) ft_event1(id, save_timestamp_def, (unsigned long) def)
+
+#define TIMESTAMP_CUR(id) DTIMESTAMP(id, is_realtime(current) ? TSK_RT : TSK_BE)
+
+#define TTIMESTAMP(id, task) \
+ ft_event1(id, save_timestamp_task, (unsigned long) task)
+
+#define CTIMESTAMP(id, cpu) \
+ ft_event1(id, save_timestamp_cpu, (unsigned long) cpu)
+
+#define LTIMESTAMP(id, task) \
+ ft_event1(id, save_task_latency, (unsigned long) task)
+
+#define TIMESTAMP_TIME(id, time_ptr) \
+ ft_event1(id, save_timestamp_time, (unsigned long) time_ptr)
+
+#define TIMESTAMP_IRQ(id, irq_count_ptr) \
+ ft_event1(id, save_timestamp_irq, (unsigned long) irq_count_ptr)
+
+#define TIMESTAMP_IN_IRQ(id) \
+ ft_event0(id, save_timestamp_hide_irq)
+
+#else /* !CONFIG_SCHED_OVERHEAD_TRACE */
+
+#define TIMESTAMP(id) /* no tracing */
+
+#define DTIMESTAMP(id, def) /* no tracing */
+
+#define TIMESTAMP_CUR(id) /* no tracing */
+
+#define TTIMESTAMP(id, task) /* no tracing */
+
+#define CTIMESTAMP(id, cpu) /* no tracing */
+
+#define LTIMESTAMP(id, when_ptr) /* no tracing */
+
+#define TIMESTAMP_TIME(id, time_ptr) /* no tracing */
+
+#define TIMESTAMP_IRQ(id, irq_count_ptr) /* no tracing */
+
+#define TIMESTAMP_IN_IRQ(id) /* no tracing */
+
+#endif
+
+
+/* Convention for timestamps
+ * =========================
+ *
+ * In order to process the trace files with a common tool, we use the following
+ * convention to measure execution times: the event id that marks the end of a
+ * code segment is always the start event id plus one.
+ */
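+
+/* For example, TS_LOCK_START below uses event id 30 and TS_LOCK_END uses 31,
+ * so a post-processing tool can pair any start id N with end id N + 1 to
+ * compute the overhead of that code segment. */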
+
+#define __TS_SYSCALL_IN_START(p) TIMESTAMP_TIME(10, p)
+#define __TS_SYSCALL_IN_END(p) TIMESTAMP_IRQ(11, p)
+
+#define TS_SYSCALL_OUT_START TIMESTAMP_CUR(20)
+#define TS_SYSCALL_OUT_END TIMESTAMP_CUR(21)
+
+#define TS_LOCK_START TIMESTAMP_CUR(30)
+#define TS_LOCK_END TIMESTAMP_CUR(31)
+
+#define TS_LOCK_SUSPEND TIMESTAMP_CUR(38)
+#define TS_LOCK_RESUME TIMESTAMP_CUR(39)
+
+#define TS_UNLOCK_START TIMESTAMP_CUR(40)
+#define TS_UNLOCK_END TIMESTAMP_CUR(41)
+
+/* we only care about the next task */
+#define TS_SCHED_START			DTIMESTAMP(100, TSK_UNKNOWN)
+#define TS_SCHED_END(t) TTIMESTAMP(101, t)
+#define TS_SCHED2_START(t) TTIMESTAMP(102, t)
+#define TS_SCHED2_END(t) TTIMESTAMP(103, t)
+
+#define TS_CXS_START(t) TTIMESTAMP(104, t)
+#define TS_CXS_END(t) TTIMESTAMP(105, t)
+
+#define TS_RELEASE_START DTIMESTAMP(106, TSK_RT)
+#define TS_RELEASE_END DTIMESTAMP(107, TSK_RT)
+
+#define TS_TICK_START(t) TTIMESTAMP(110, t)
+#define TS_TICK_END(t) TTIMESTAMP(111, t)
+
+
+#define TS_PLUGIN_SCHED_START /* TIMESTAMP(120) */ /* currently unused */
+#define TS_PLUGIN_SCHED_END /* TIMESTAMP(121) */
+
+#define TS_PLUGIN_TICK_START /* TIMESTAMP(130) */
+#define TS_PLUGIN_TICK_END /* TIMESTAMP(131) */
+
+#define TS_ENTER_NP_START TIMESTAMP(140)
+#define TS_ENTER_NP_END TIMESTAMP(141)
+
+#define TS_EXIT_NP_START TIMESTAMP(150)
+#define TS_EXIT_NP_END TIMESTAMP(151)
+
+#define TS_SEND_RESCHED_START(c) CTIMESTAMP(190, c)
+#define TS_SEND_RESCHED_END TIMESTAMP_IN_IRQ(191)
+
+#define TS_RELEASE_LATENCY(when) LTIMESTAMP(208, &(when))
+
+#endif /* !_SYS_TRACE_H_ */
diff --git a/include/litmus/trace_irq.h b/include/litmus/trace_irq.h
new file mode 100644
index 0000000..0d0c042
--- /dev/null
+++ b/include/litmus/trace_irq.h
@@ -0,0 +1,14 @@
+#ifndef _LITMUS_TRACE_IRQ_H_
+#define _LITMUS_TRACE_IRQ_H_
+
+#ifdef CONFIG_SCHED_OVERHEAD_TRACE
+
+void ft_irq_fired(void);
+
+#else
+
+#define ft_irq_fired() /* nothing to do */
+
+#endif
+
+#endif
diff --git a/include/litmus/unistd_32.h b/include/litmus/unistd_32.h
new file mode 100644
index 0000000..94264c2
--- /dev/null
+++ b/include/litmus/unistd_32.h
@@ -0,0 +1,21 @@
+/*
+ * included from arch/x86/include/asm/unistd_32.h
+ *
+ * LITMUS^RT syscalls with "relative" numbers
+ */
+#define __LSC(x) (__NR_LITMUS + x)
+
+#define __NR_set_rt_task_param __LSC(0)
+#define __NR_get_rt_task_param __LSC(1)
+#define __NR_complete_job __LSC(2)
+#define __NR_od_open __LSC(3)
+#define __NR_od_close __LSC(4)
+#define __NR_litmus_lock __LSC(5)
+#define __NR_litmus_unlock __LSC(6)
+#define __NR_query_job_no __LSC(7)
+#define __NR_wait_for_job_release __LSC(8)
+#define __NR_wait_for_ts_release __LSC(9)
+#define __NR_release_ts __LSC(10)
+#define __NR_null_call __LSC(11)
+
+#define NR_litmus_syscalls 12
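+
+/* Example: if the architecture headers were to define __NR_LITMUS as 400
+ * (a hypothetical value), __NR_complete_job would resolve to 402 and the
+ * last syscall, __NR_null_call, to 411. */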
diff --git a/include/litmus/unistd_64.h b/include/litmus/unistd_64.h
new file mode 100644
index 0000000..d5ced0d
--- /dev/null
+++ b/include/litmus/unistd_64.h
@@ -0,0 +1,33 @@
+/*
+ * included from arch/x86/include/asm/unistd_64.h
+ *
+ * LITMUS^RT syscalls with "relative" numbers
+ */
+#define __LSC(x) (__NR_LITMUS + x)
+
+#define __NR_set_rt_task_param __LSC(0)
+__SYSCALL(__NR_set_rt_task_param, sys_set_rt_task_param)
+#define __NR_get_rt_task_param __LSC(1)
+__SYSCALL(__NR_get_rt_task_param, sys_get_rt_task_param)
+#define __NR_complete_job __LSC(2)
+__SYSCALL(__NR_complete_job, sys_complete_job)
+#define __NR_od_open __LSC(3)
+__SYSCALL(__NR_od_open, sys_od_open)
+#define __NR_od_close __LSC(4)
+__SYSCALL(__NR_od_close, sys_od_close)
+#define __NR_litmus_lock __LSC(5)
+__SYSCALL(__NR_litmus_lock, sys_litmus_lock)
+#define __NR_litmus_unlock __LSC(6)
+__SYSCALL(__NR_litmus_unlock, sys_litmus_unlock)
+#define __NR_query_job_no __LSC(7)
+__SYSCALL(__NR_query_job_no, sys_query_job_no)
+#define __NR_wait_for_job_release __LSC(8)
+__SYSCALL(__NR_wait_for_job_release, sys_wait_for_job_release)
+#define __NR_wait_for_ts_release __LSC(9)
+__SYSCALL(__NR_wait_for_ts_release, sys_wait_for_ts_release)
+#define __NR_release_ts __LSC(10)
+__SYSCALL(__NR_release_ts, sys_release_ts)
+#define __NR_null_call __LSC(11)
+__SYSCALL(__NR_null_call, sys_null_call)
+
+#define NR_litmus_syscalls 12
diff --git a/include/litmus/wait.h b/include/litmus/wait.h
new file mode 100644
index 0000000..ce1347c
--- /dev/null
+++ b/include/litmus/wait.h
@@ -0,0 +1,57 @@
+#ifndef _LITMUS_WAIT_H_
+#define _LITMUS_WAIT_H_
+
+struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq);
+
+/* wrap regular wait_queue_t head */
+struct __prio_wait_queue {
+ wait_queue_t wq;
+
+ /* some priority point */
+ lt_t priority;
+ /* break ties in priority by lower tie_breaker */
+ unsigned int tie_breaker;
+};
+
+typedef struct __prio_wait_queue prio_wait_queue_t;
+
+static inline void init_prio_waitqueue_entry(prio_wait_queue_t *pwq,
+ struct task_struct* t,
+ lt_t priority)
+{
+ init_waitqueue_entry(&pwq->wq, t);
+ pwq->priority = priority;
+ pwq->tie_breaker = 0;
+}
+
+static inline void init_prio_waitqueue_entry_tie(prio_wait_queue_t *pwq,
+ struct task_struct* t,
+ lt_t priority,
+ unsigned int tie_breaker)
+{
+ init_waitqueue_entry(&pwq->wq, t);
+ pwq->priority = priority;
+ pwq->tie_breaker = tie_breaker;
+}
+
+unsigned int __add_wait_queue_prio_exclusive(
+ wait_queue_head_t* head,
+ prio_wait_queue_t *new);
+
+static inline unsigned int add_wait_queue_prio_exclusive(
+ wait_queue_head_t* head,
+ prio_wait_queue_t *new)
+{
+ unsigned long flags;
+ unsigned int passed;
+
+ spin_lock_irqsave(&head->lock, flags);
+ passed = __add_wait_queue_prio_exclusive(head, new);
+
+ spin_unlock_irqrestore(&head->lock, flags);
+
+ return passed;
+}
+
+
+#endif
diff --git a/include/trace/events/litmus.h b/include/trace/events/litmus.h
new file mode 100644
index 0000000..0fffcee0
--- /dev/null
+++ b/include/trace/events/litmus.h
@@ -0,0 +1,231 @@
+/*
+ * LITMUS^RT kernel style scheduling tracepoints
+ */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM litmus
+
+#if !defined(_SCHED_TASK_TRACEPOINT_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _SCHED_TASK_TRACEPOINT_H
+
+#include <linux/tracepoint.h>
+
+#include <litmus/litmus.h>
+#include <litmus/rt_param.h>
+
+/*
+ * Tracing task admission
+ */
+TRACE_EVENT(litmus_task_param,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned int, job )
+ __field( lt_t, wcet )
+ __field( lt_t, period )
+ __field( lt_t, phase )
+ __field( int, partition )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t ? t->pid : 0;
+ __entry->job = t ? t->rt_param.job_params.job_no : 0;
+ __entry->wcet = get_exec_cost(t);
+ __entry->period = get_rt_period(t);
+ __entry->phase = get_rt_phase(t);
+ __entry->partition = get_partition(t);
+ ),
+
+ TP_printk("period(%d, %Lu).\nwcet(%d, %Lu).\n",
+ __entry->pid, __entry->period,
+ __entry->pid, __entry->wcet)
+);
+
+/*
+ * Tracing jobs release
+ */
+TRACE_EVENT(litmus_task_release,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned int, job )
+ __field( lt_t, release )
+ __field( lt_t, deadline )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t ? t->pid : 0;
+ __entry->job = t ? t->rt_param.job_params.job_no : 0;
+ __entry->release = get_release(t);
+ __entry->deadline = get_deadline(t);
+ ),
+
+ TP_printk("release(job(%u, %u)): %Lu\ndeadline(job(%u, %u)): %Lu\n",
+ __entry->pid, __entry->job, __entry->release,
+ __entry->pid, __entry->job, __entry->deadline)
+);
+
+/*
+ * Tracepoint for switching to new task
+ */
+TRACE_EVENT(litmus_switch_to,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned int, job )
+ __field( lt_t, when )
+ __field( lt_t, exec_time )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = is_realtime(t) ? t->pid : 0;
+ __entry->job = is_realtime(t) ? t->rt_param.job_params.job_no : 0;
+ __entry->when = litmus_clock();
+ __entry->exec_time = get_exec_time(t);
+ ),
+
+ TP_printk("switch_to(job(%u, %u)): %Lu (exec: %Lu)\n",
+ __entry->pid, __entry->job,
+ __entry->when, __entry->exec_time)
+);
+
+/*
+ * Tracepoint for switching away previous task
+ */
+TRACE_EVENT(litmus_switch_away,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned int, job )
+ __field( lt_t, when )
+ __field( lt_t, exec_time )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = is_realtime(t) ? t->pid : 0;
+ __entry->job = is_realtime(t) ? t->rt_param.job_params.job_no : 0;
+ __entry->when = litmus_clock();
+ __entry->exec_time = get_exec_time(t);
+ ),
+
+ TP_printk("switch_away(job(%u, %u)): %Lu (exec: %Lu)\n",
+ __entry->pid, __entry->job,
+ __entry->when, __entry->exec_time)
+);
+
+/*
+ * Tracing jobs completion
+ */
+TRACE_EVENT(litmus_task_completion,
+
+ TP_PROTO(struct task_struct *t, unsigned long forced),
+
+ TP_ARGS(t, forced),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned int, job )
+ __field( lt_t, when )
+ __field( unsigned long, forced )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t ? t->pid : 0;
+ __entry->job = t ? t->rt_param.job_params.job_no : 0;
+ __entry->when = litmus_clock();
+ __entry->forced = forced;
+ ),
+
+ TP_printk("completed(job(%u, %u)): %Lu (forced: %lu)\n",
+ __entry->pid, __entry->job,
+ __entry->when, __entry->forced)
+);
+
+/*
+ * Trace blocking tasks.
+ */
+TRACE_EVENT(litmus_task_block,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( lt_t, when )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t ? t->pid : 0;
+ __entry->when = litmus_clock();
+ ),
+
+ TP_printk("(%u) blocks: %Lu\n", __entry->pid, __entry->when)
+);
+
+/*
+ * Tracing jobs resume
+ */
+TRACE_EVENT(litmus_task_resume,
+
+ TP_PROTO(struct task_struct *t),
+
+ TP_ARGS(t),
+
+ TP_STRUCT__entry(
+ __field( pid_t, pid )
+ __field( unsigned int, job )
+ __field( lt_t, when )
+ ),
+
+ TP_fast_assign(
+ __entry->pid = t ? t->pid : 0;
+ __entry->job = t ? t->rt_param.job_params.job_no : 0;
+ __entry->when = litmus_clock();
+ ),
+
+ TP_printk("resume(job(%u, %u)): %Lu\n",
+ __entry->pid, __entry->job, __entry->when)
+);
+
+/*
+ * Trace synchronous release
+ */
+TRACE_EVENT(litmus_sys_release,
+
+ TP_PROTO(lt_t *start),
+
+ TP_ARGS(start),
+
+ TP_STRUCT__entry(
+ __field( lt_t, rel )
+ __field( lt_t, when )
+ ),
+
+ TP_fast_assign(
+ __entry->rel = *start;
+ __entry->when = litmus_clock();
+ ),
+
+ TP_printk("SynRelease(%Lu) at %Lu\n", __entry->rel, __entry->when)
+);
+
+#endif /* _SCHED_TASK_TRACEPOINT_H */
+
+/* Must stay outside the protection */
+#include <trace/define_trace.h>
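For reference, these are standard TRACE_EVENT() definitions, so exactly one compilation unit defines CREATE_TRACE_POINTS before including the header, and callers fire the generated trace_litmus_*() helpers. A minimal sketch (the wrapper function is illustrative, not part of the patch):

    #include <linux/sched.h>

    #define CREATE_TRACE_POINTS
    #include <trace/events/litmus.h>

    /* Sketch only: fire the release tracepoint for a task. */
    static void example_on_release(struct task_struct *t)
    {
            trace_litmus_task_release(t);   /* records pid, job, release, deadline */
    }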
diff --git a/kernel/exit.c b/kernel/exit.c
index f2b321b..64879bd 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -57,6 +57,8 @@
#include <asm/pgtable.h>
#include <asm/mmu_context.h>
+extern void exit_od_table(struct task_struct *t);
+
static void exit_mm(struct task_struct * tsk);
static void __unhash_process(struct task_struct *p, bool group_dead)
@@ -980,6 +982,8 @@ NORET_TYPE void do_exit(long code)
if (unlikely(tsk->audit_context))
audit_free(tsk);
+ exit_od_table(tsk);
+
tsk->exit_code = code;
taskstats_exit(tsk, group_dead);
diff --git a/kernel/fork.c b/kernel/fork.c
index 0276c30..25c6111 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -77,6 +77,9 @@
#include <trace/events/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+
/*
* Protected counters by write_lock_irq(&tasklist_lock)
*/
@@ -191,6 +194,7 @@ void __put_task_struct(struct task_struct *tsk)
WARN_ON(atomic_read(&tsk->usage));
WARN_ON(tsk == current);
+ exit_litmus(tsk);
exit_creds(tsk);
delayacct_tsk_free(tsk);
put_signal_struct(tsk->signal);
@@ -275,6 +279,9 @@ static struct task_struct *dup_task_struct(struct task_struct *orig)
tsk->stack = ti;
+ /* Don't let the new task be a real-time task. */
+ litmus_fork(tsk);
+
err = prop_local_init_single(&tsk->dirties);
if (err)
goto out;
diff --git a/kernel/hrtimer.c b/kernel/hrtimer.c
index a9205e3..11e8969 100644
--- a/kernel/hrtimer.c
+++ b/kernel/hrtimer.c
@@ -46,6 +46,8 @@
#include <linux/sched.h>
#include <linux/timer.h>
+#include <litmus/litmus.h>
+
#include <asm/uaccess.h>
#include <trace/events/timer.h>
@@ -1026,6 +1028,98 @@ hrtimer_start(struct hrtimer *timer, ktime_t tim, const enum hrtimer_mode mode)
}
EXPORT_SYMBOL_GPL(hrtimer_start);
+#ifdef CONFIG_ARCH_HAS_SEND_PULL_TIMERS
+
+/**
+ * hrtimer_start_on_info_init - Initialize hrtimer_start_on_info
+ */
+void hrtimer_start_on_info_init(struct hrtimer_start_on_info *info)
+{
+ memset(info, 0, sizeof(struct hrtimer_start_on_info));
+ atomic_set(&info->state, HRTIMER_START_ON_INACTIVE);
+}
+
+/**
+ * hrtimer_pull - PULL_TIMERS_VECTOR callback on remote cpu
+ */
+void hrtimer_pull(void)
+{
+ struct hrtimer_cpu_base *base = &__get_cpu_var(hrtimer_bases);
+ struct hrtimer_start_on_info *info;
+ struct list_head *pos, *safe, list;
+
+ raw_spin_lock(&base->lock);
+ list_replace_init(&base->to_pull, &list);
+ raw_spin_unlock(&base->lock);
+
+ list_for_each_safe(pos, safe, &list) {
+ info = list_entry(pos, struct hrtimer_start_on_info, list);
+ TRACE("pulled timer 0x%x\n", info->timer);
+ list_del(pos);
+ hrtimer_start(info->timer, info->time, info->mode);
+ }
+}
+
+/**
+ * hrtimer_start_on - trigger timer arming on remote cpu
+ * @cpu: remote cpu
+ * @info: save timer information for enqueuing on remote cpu
+ * @timer: timer to be pulled
+ * @time: expire time
+ * @mode: timer mode
+ */
+int hrtimer_start_on(int cpu, struct hrtimer_start_on_info* info,
+ struct hrtimer *timer, ktime_t time,
+ const enum hrtimer_mode mode)
+{
+ unsigned long flags;
+ struct hrtimer_cpu_base* base;
+ int in_use = 0, was_empty;
+
+ /* serialize access to info through the timer base */
+ lock_hrtimer_base(timer, &flags);
+
+ in_use = (atomic_read(&info->state) != HRTIMER_START_ON_INACTIVE);
+ if (!in_use) {
+ INIT_LIST_HEAD(&info->list);
+ info->timer = timer;
+ info->time = time;
+ info->mode = mode;
+ /* mark as in use */
+ atomic_set(&info->state, HRTIMER_START_ON_QUEUED);
+ }
+
+ unlock_hrtimer_base(timer, &flags);
+
+ if (!in_use) {
+ /* initiate pull */
+ preempt_disable();
+ if (cpu == smp_processor_id()) {
+ /* start timer locally; we may get called
+ * with rq->lock held, do not wake up anything
+ */
+ TRACE("hrtimer_start_on: starting on local CPU\n");
+ __hrtimer_start_range_ns(info->timer, info->time,
+ 0, info->mode, 0);
+ } else {
+ TRACE("hrtimer_start_on: pulling to remote CPU\n");
+ base = &per_cpu(hrtimer_bases, cpu);
+ raw_spin_lock_irqsave(&base->lock, flags);
+ was_empty = list_empty(&base->to_pull);
+ list_add(&info->list, &base->to_pull);
+ raw_spin_unlock_irqrestore(&base->lock, flags);
+ if (was_empty)
+				/* only send an IPI if no one else
+ * has done so already
+ */
+ smp_send_pull_timers(cpu);
+ }
+ preempt_enable();
+ }
+ return in_use;
+}
+
+#endif
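A minimal sketch of how a caller (for example, a release-master setup) might use this interface; the wrapper structure and function names are assumptions, and the timer is assumed to have been set up with hrtimer_init() beforehand.

    #include <linux/hrtimer.h>
    #include <linux/ktime.h>

    struct example_release {
            struct hrtimer timer;                  /* initialized via hrtimer_init() */
            struct hrtimer_start_on_info info;
    };

    /* Sketch only: arm rel->timer on a remote CPU at an absolute time (ns). */
    static void example_arm_remote(struct example_release *rel, int cpu, u64 when_ns)
    {
            hrtimer_start_on_info_init(&rel->info);
            hrtimer_start_on(cpu, &rel->info, &rel->timer,
                             ns_to_ktime(when_ns), HRTIMER_MODE_ABS_PINNED);
    }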
/**
* hrtimer_try_to_cancel - try to deactivate a timer
@@ -1625,6 +1719,7 @@ static void __cpuinit init_hrtimers_cpu(int cpu)
}
hrtimer_init_hres(cpu_base);
+ INIT_LIST_HEAD(&cpu_base->to_pull);
}
#ifdef CONFIG_HOTPLUG_CPU
diff --git a/kernel/printk.c b/kernel/printk.c
index 3518539..b799a2e 100644
--- a/kernel/printk.c
+++ b/kernel/printk.c
@@ -70,6 +70,13 @@ int console_printk[4] = {
};
/*
+ * divert printk() messages when there is a LITMUS^RT debug listener
+ */
+#include <litmus/litmus.h>
+int trace_override = 0;
+int trace_recurse = 0;
+
+/*
* Low level drivers may need that to know if they can schedule in
* their unblank() callback or not. So let's export it.
*/
@@ -871,6 +878,9 @@ asmlinkage int vprintk(const char *fmt, va_list args)
/* Emit the output into the temporary buffer */
printed_len += vscnprintf(printk_buf + printed_len,
sizeof(printk_buf) - printed_len, fmt, args);
+	/* if the LITMUS^RT tracer is active, divert printk() messages */
+ if (trace_override && !trace_recurse)
+ TRACE("%s", printk_buf);
p = printk_buf;
@@ -947,7 +957,7 @@ asmlinkage int vprintk(const char *fmt, va_list args)
* Try to acquire and then immediately release the
* console semaphore. The release will do all the
* actual magic (print out buffers, wake up klogd,
- * etc).
+ * etc).
*
* The console_trylock_for_printk() function
* will release 'logbuf_lock' regardless of whether it
@@ -1220,7 +1230,7 @@ int printk_needs_cpu(int cpu)
void wake_up_klogd(void)
{
- if (waitqueue_active(&log_wait))
+ if (!trace_override && waitqueue_active(&log_wait))
this_cpu_write(printk_pending, 1);
}
diff --git a/kernel/sched.c b/kernel/sched.c
index fde6ff9..c4b6bd5 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -83,6 +83,11 @@
#define CREATE_TRACE_POINTS
#include <trace/events/sched.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+static void litmus_tick(struct rq*, struct task_struct*);
+
/*
* Convert user-nice values [ -20 ... 0 ... 19 ]
* to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ],
@@ -410,6 +415,12 @@ struct rt_rq {
#endif
};
+/* Litmus related fields in a runqueue */
+struct litmus_rq {
+ unsigned long nr_running;
+ struct task_struct *prev;
+};
+
#ifdef CONFIG_SMP
/*
@@ -475,6 +486,7 @@ struct rq {
struct cfs_rq cfs;
struct rt_rq rt;
+ struct litmus_rq litmus;
#ifdef CONFIG_FAIR_GROUP_SCHED
/* list of leaf cfs_rq on this cpu: */
@@ -1045,6 +1057,7 @@ static enum hrtimer_restart hrtick(struct hrtimer *timer)
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
rq->curr->sched_class->task_tick(rq, rq->curr, 1);
+ litmus_tick(rq, rq->curr);
raw_spin_unlock(&rq->lock);
return HRTIMER_NORESTART;
@@ -1773,7 +1786,7 @@ static inline void __set_task_cpu(struct task_struct *p, unsigned int cpu)
static const struct sched_class rt_sched_class;
-#define sched_class_highest (&stop_sched_class)
+#define sched_class_highest (&litmus_sched_class)
#define for_each_class(class) \
for (class = sched_class_highest; class; class = class->next)
@@ -2031,6 +2044,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
#include "sched_rt.c"
#include "sched_autogroup.c"
#include "sched_stoptask.c"
+#include "../litmus/sched_litmus.c"
#ifdef CONFIG_SCHED_DEBUG
# include "sched_debug.c"
#endif
@@ -2153,6 +2167,10 @@ static void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags)
* A queue event has occurred, and we're going to schedule. In
* this case, we can save a useless back to back clock update.
*/
+ /* LITMUS^RT:
+ * The "disable-clock-update" approach was buggy in Linux 2.6.36.
+ * The issue has been solved in 2.6.37.
+ */
if (rq->curr->on_rq && test_tsk_need_resched(rq->curr))
rq->skip_clock_update = 1;
}
@@ -2579,8 +2597,12 @@ void scheduler_ipi(void)
struct rq *rq = this_rq();
struct task_struct *list = xchg(&rq->wake_list, NULL);
- if (!list)
+ if (!list) {
+ /* If we don't call irq_enter(), we need to trigger the IRQ
+ * tracing manually. */
+ ft_irq_fired();
return;
+ }
/*
* Not all reschedule IPI handlers call irq_enter/irq_exit, since
@@ -2643,7 +2665,12 @@ static void ttwu_queue(struct task_struct *p, int cpu)
struct rq *rq = cpu_rq(cpu);
#if defined(CONFIG_SMP)
- if (sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
+ /*
+ * LITMUS^RT: whether to send an IPI to the remote CPU
+ * is plugin specific.
+ */
+ if (!is_realtime(p) &&
+ sched_feat(TTWU_QUEUE) && cpu != smp_processor_id()) {
sched_clock_cpu(cpu); /* sync clocks x-cpu */
ttwu_queue_remote(p, cpu);
return;
@@ -2676,6 +2703,9 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
unsigned long flags;
int cpu, success = 0;
+ if (is_realtime(p))
+ TRACE_TASK(p, "try_to_wake_up() state:%d\n", p->state);
+
smp_wmb();
raw_spin_lock_irqsave(&p->pi_lock, flags);
if (!(p->state & state))
@@ -2712,6 +2742,12 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
*/
smp_rmb();
+ /* LITMUS^RT: once the task can be safely referenced by this
+	 * CPU, don't mess with the Linux load-balancing code.
+ */
+ if (is_realtime(p))
+ goto litmus_out_activate;
+
p->sched_contributes_to_load = !!task_contributes_to_load(p);
p->state = TASK_WAKING;
@@ -2723,12 +2759,16 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags)
wake_flags |= WF_MIGRATED;
set_task_cpu(p, cpu);
}
+
+litmus_out_activate:
#endif /* CONFIG_SMP */
ttwu_queue(p, cpu);
stat:
ttwu_stat(p, cpu, wake_flags);
out:
+ if (is_realtime(p))
+ TRACE_TASK(p, "try_to_wake_up() done state:%d\n", p->state);
raw_spin_unlock_irqrestore(&p->pi_lock, flags);
return success;
@@ -2839,7 +2879,8 @@ void sched_fork(struct task_struct *p)
* Revert to default priority/policy on fork if requested.
*/
if (unlikely(p->sched_reset_on_fork)) {
- if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) {
+ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR ||
+ p->policy == SCHED_LITMUS) {
p->policy = SCHED_NORMAL;
p->normal_prio = p->static_prio;
}
@@ -3050,6 +3091,8 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev)
*/
prev_state = prev->state;
finish_arch_switch(prev);
+ litmus->finish_switch(prev);
+ prev->rt_param.stack_in_use = NO_CPU;
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
local_irq_disable();
#endif /* __ARCH_WANT_INTERRUPTS_ON_CTXSW */
@@ -3079,6 +3122,15 @@ static inline void pre_schedule(struct rq *rq, struct task_struct *prev)
{
if (prev->sched_class->pre_schedule)
prev->sched_class->pre_schedule(rq, prev);
+
+	/* LITMUS^RT: not a very clean hack; we need to save the prev task
+	 * since our scheduling decision relies on it (once we drop the rq
+	 * lock, something in prev can change...); there is no way to escape
+	 * this hack apart from modifying pick_next_task(rq, _prev_) or
+	 * falling back on the previous solution of decoupling
+	 * scheduling decisions.
+	 */
+ rq->litmus.prev = prev;
}
/* rq->lock is NOT held, but preemption is disabled */
@@ -3115,16 +3167,26 @@ static inline void post_schedule(struct rq *rq)
asmlinkage void schedule_tail(struct task_struct *prev)
__releases(rq->lock)
{
- struct rq *rq = this_rq();
-
+ struct rq *rq;
+
+ preempt_disable();
+
+ rq = this_rq();
finish_task_switch(rq, prev);
+ sched_trace_task_switch_to(current);
+
/*
* FIXME: do we need to worry about rq being invalidated by the
* task_switch?
*/
post_schedule(rq);
+ if (sched_state_validate_switch())
+ litmus_reschedule_local();
+
+ preempt_enable();
+
#ifdef __ARCH_WANT_UNLOCKED_CTXSW
/* In this case, finish_task_switch does not reenable preemption */
preempt_enable();
@@ -4094,18 +4156,26 @@ void scheduler_tick(void)
sched_clock_tick();
+ TS_TICK_START(current);
+
raw_spin_lock(&rq->lock);
update_rq_clock(rq);
update_cpu_load_active(rq);
curr->sched_class->task_tick(rq, curr, 0);
+
+ /* litmus_tick may force current to resched */
+ litmus_tick(rq, curr);
+
raw_spin_unlock(&rq->lock);
perf_event_task_tick();
#ifdef CONFIG_SMP
rq->idle_at_tick = idle_cpu(cpu);
- trigger_load_balance(rq, cpu);
+ if (!is_realtime(current))
+ trigger_load_balance(rq, cpu);
#endif
+ TS_TICK_END(current);
}
notrace unsigned long get_parent_ip(unsigned long addr)
@@ -4225,12 +4295,20 @@ pick_next_task(struct rq *rq)
/*
* Optimization: we know that if all tasks are in
* the fair class we can call that function directly:
- */
- if (likely(rq->nr_running == rq->cfs.nr_running)) {
+
+ * NOT IN LITMUS^RT!
+
+ * This breaks many assumptions in the plugins.
+ * Do not uncomment without thinking long and hard
+ * about how this affects global plugins such as GSN-EDF.
+
+ if (rq->nr_running == rq->cfs.nr_running) {
+ TRACE("taking shortcut in pick_next_task()\n");
p = fair_sched_class.pick_next_task(rq);
if (likely(p))
return p;
}
+ */
for_each_class(class) {
p = class->pick_next_task(rq);
@@ -4253,11 +4331,19 @@ asmlinkage void __sched schedule(void)
need_resched:
preempt_disable();
+ sched_state_entered_schedule();
cpu = smp_processor_id();
rq = cpu_rq(cpu);
rcu_note_context_switch(cpu);
prev = rq->curr;
+ /* LITMUS^RT: quickly re-evaluate the scheduling decision
+	 * if the previous one is no longer valid after the context switch.
+ */
+litmus_need_resched_nonpreemptible:
+ TS_SCHED_START;
+ sched_trace_task_switch_away(prev);
+
schedule_debug(prev);
if (sched_feat(HRTICK))
@@ -4314,7 +4400,10 @@ need_resched:
rq->curr = next;
++*switch_count;
+ TS_SCHED_END(next);
+ TS_CXS_START(next);
context_switch(rq, prev, next); /* unlocks the rq */
+ TS_CXS_END(current);
/*
* The context switch have flipped the stack from under us
* and restored the local variables which were saved when
@@ -4323,14 +4412,29 @@ need_resched:
*/
cpu = smp_processor_id();
rq = cpu_rq(cpu);
- } else
+ } else {
+ TS_SCHED_END(prev);
raw_spin_unlock_irq(&rq->lock);
+ }
+
+ TS_SCHED2_START(prev);
+ sched_trace_task_switch_to(current);
post_schedule(rq);
+ if (sched_state_validate_switch()) {
+ TS_SCHED2_END(prev);
+ goto litmus_need_resched_nonpreemptible;
+ }
+
preempt_enable_no_resched();
+
+ TS_SCHED2_END(prev);
+
if (need_resched())
goto need_resched;
+
+ srp_ceiling_block();
}
EXPORT_SYMBOL(schedule);
@@ -5039,7 +5143,9 @@ __setscheduler(struct rq *rq, struct task_struct *p, int policy, int prio)
p->normal_prio = normal_prio(p);
/* we are holding p->pi_lock already */
p->prio = rt_mutex_getprio(p);
- if (rt_prio(p->prio))
+ if (p->policy == SCHED_LITMUS)
+ p->sched_class = &litmus_sched_class;
+ else if (rt_prio(p->prio))
p->sched_class = &rt_sched_class;
else
p->sched_class = &fair_sched_class;
@@ -5087,7 +5193,7 @@ recheck:
if (policy != SCHED_FIFO && policy != SCHED_RR &&
policy != SCHED_NORMAL && policy != SCHED_BATCH &&
- policy != SCHED_IDLE)
+ policy != SCHED_IDLE && policy != SCHED_LITMUS)
return -EINVAL;
}
@@ -5102,6 +5208,8 @@ recheck:
return -EINVAL;
if (rt_policy(policy) != (param->sched_priority != 0))
return -EINVAL;
+ if (policy == SCHED_LITMUS && policy == p->policy)
+ return -EINVAL;
/*
* Allow unprivileged RT tasks to decrease priority:
@@ -5145,6 +5253,12 @@ recheck:
return retval;
}
+ if (policy == SCHED_LITMUS) {
+ retval = litmus_admit_task(p);
+ if (retval)
+ return retval;
+ }
+
/*
* make sure no PI-waiters arrive (or leave) while we are
* changing the priority of the task:
@@ -5203,10 +5317,19 @@ recheck:
p->sched_reset_on_fork = reset_on_fork;
+ if (p->policy == SCHED_LITMUS)
+ litmus_exit_task(p);
+
oldprio = p->prio;
prev_class = p->sched_class;
__setscheduler(rq, p, policy, param->sched_priority);
+ if (policy == SCHED_LITMUS) {
+ p->rt_param.stack_in_use = running ? rq->cpu : NO_CPU;
+ p->rt_param.present = running;
+ litmus->task_new(p, on_rq, running);
+ }
+
if (running)
p->sched_class->set_curr_task(rq);
if (on_rq)
@@ -5374,10 +5497,11 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask)
rcu_read_lock();
p = find_process_by_pid(pid);
- if (!p) {
+	/* Don't set affinity if the task is not found or if it is a LITMUS task */
+ if (!p || is_realtime(p)) {
rcu_read_unlock();
put_online_cpus();
- return -ESRCH;
+ return p ? -EPERM : -ESRCH;
}
/* Prevent p going away */
diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c
index c768588..334eb47 100644
--- a/kernel/sched_fair.c
+++ b/kernel/sched_fair.c
@@ -1890,6 +1890,9 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_
int scale = cfs_rq->nr_running >= sched_nr_latency;
int next_buddy_marked = 0;
+ if (unlikely(rt_prio(p->prio)) || p->policy == SCHED_LITMUS)
+ goto preempt;
+
if (unlikely(se == pse))
return;
diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 10d0182..db04161 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -3,6 +3,8 @@
* policies)
*/
+#include <litmus/litmus.h>
+
#ifdef CONFIG_RT_GROUP_SCHED
#define rt_entity_is_task(rt_se) (!(rt_se)->my_q)
@@ -228,8 +230,11 @@ static void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
if (rt_rq->rt_nr_running) {
if (rt_se && !on_rt_rq(rt_se))
enqueue_rt_entity(rt_se, false);
- if (rt_rq->highest_prio.curr < curr->prio)
+ if (rt_rq->highest_prio.curr < curr->prio &&
+ /* Don't subject LITMUS tasks to remote reschedules */
+ !is_realtime(curr)) {
resched_task(curr);
+ }
}
}
@@ -322,8 +327,10 @@ static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se)
static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq)
{
- if (rt_rq->rt_nr_running)
- resched_task(rq_of_rt_rq(rt_rq)->curr);
+ struct task_struct *curr = rq_of_rt_rq(rt_rq)->curr;
+
+ if (rt_rq->rt_nr_running && !is_realtime(curr))
+ resched_task(curr);
}
static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq)
@@ -1078,7 +1085,7 @@ static void check_preempt_equal_prio(struct rq *rq, struct task_struct *p)
*/
static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flags)
{
- if (p->prio < rq->curr->prio) {
+ if (p->prio < rq->curr->prio || p->policy == SCHED_LITMUS) {
resched_task(rq->curr);
return;
}
diff --git a/kernel/softirq.c b/kernel/softirq.c
index fca82c3..2f2df08 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -211,6 +211,9 @@ asmlinkage void __do_softirq(void)
int max_restart = MAX_SOFTIRQ_RESTART;
int cpu;
+ /* Mark Feather-Trace samples as "disturbed". */
+ ft_irq_fired();
+
pending = local_softirq_pending();
account_system_vtime(current);
diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c
index d5097c4..0c0e02f 100644
--- a/kernel/time/tick-sched.c
+++ b/kernel/time/tick-sched.c
@@ -766,12 +766,53 @@ static enum hrtimer_restart tick_sched_timer(struct hrtimer *timer)
}
/**
+ * tick_set_quanta_type - get the quanta type as a boot option
+ * Default is the standard Linux setup, with ticks staggered over the
+ * first half of the tick period.
+ */
+int quanta_type = LINUX_DEFAULT_TICKS;
+static int __init tick_set_quanta_type(char *str)
+{
+ if (strcmp("aligned", str) == 0) {
+ quanta_type = LITMUS_ALIGNED_TICKS;
+ printk(KERN_INFO "LITMUS^RT: setting aligned quanta\n");
+ }
+ else if (strcmp("staggered", str) == 0) {
+ quanta_type = LITMUS_STAGGERED_TICKS;
+ printk(KERN_INFO "LITMUS^RT: setting staggered quanta\n");
+ }
+ return 1;
+}
+__setup("quanta=", tick_set_quanta_type);
+
+u64 cpu_stagger_offset(int cpu)
+{
+ u64 offset = 0;
+ switch (quanta_type) {
+ case LITMUS_ALIGNED_TICKS:
+ offset = 0;
+ break;
+ case LITMUS_STAGGERED_TICKS:
+ offset = ktime_to_ns(tick_period);
+ do_div(offset, num_possible_cpus());
+ offset *= cpu;
+ break;
+ default:
+ offset = ktime_to_ns(tick_period) >> 1;
+ do_div(offset, num_possible_cpus());
+ offset *= cpu;
+ }
+ return offset;
+}
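As a worked example: with HZ=1000 (a 1 ms tick period) and four possible CPUs, staggered quanta place CPU 0 at offset 0, CPU 1 at 250 µs, CPU 2 at 500 µs, and CPU 3 at 750 µs; aligned quanta use offset 0 on every CPU; and the Linux default spreads the CPUs over only the first half of the period, i.e., 125 µs apart.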
+
+/**
* tick_setup_sched_timer - setup the tick emulation timer
*/
void tick_setup_sched_timer(void)
{
struct tick_sched *ts = &__get_cpu_var(tick_cpu_sched);
ktime_t now = ktime_get();
+ u64 offset;
/*
* Emulate tick processing via per-CPU hrtimers:
@@ -782,6 +823,12 @@ void tick_setup_sched_timer(void)
/* Get the next period (per cpu) */
hrtimer_set_expires(&ts->sched_timer, tick_init_jiffy_update());
+ /* Offset must be set correctly to achieve desired quanta type. */
+ offset = cpu_stagger_offset(smp_processor_id());
+
+ /* Add the correct offset to expiration time */
+ hrtimer_add_expires_ns(&ts->sched_timer, offset);
+
for (;;) {
hrtimer_forward(&ts->sched_timer, now, tick_period);
hrtimer_start_expires(&ts->sched_timer,
diff --git a/litmus/Kconfig b/litmus/Kconfig
new file mode 100644
index 0000000..bd6635c
--- /dev/null
+++ b/litmus/Kconfig
@@ -0,0 +1,282 @@
+menu "LITMUS^RT"
+
+menu "Scheduling"
+
+config PLUGIN_CEDF
+ bool "Clustered-EDF"
+ depends on X86 && SYSFS
+ default y
+ help
+ Include the Clustered EDF (C-EDF) plugin in the kernel.
+ This is appropriate for large platforms with shared caches.
+ On smaller platforms (e.g., ARM PB11MPCore), using C-EDF
+ makes little sense since there aren't any shared caches.
+
+config PLUGIN_PFAIR
+ bool "PFAIR"
+ depends on HIGH_RES_TIMERS && !NO_HZ
+ default y
+ help
+ Include the PFAIR plugin (i.e., the PD^2 scheduler) in the kernel.
+ The PFAIR plugin requires high resolution timers (for staggered quanta)
+ and does not support NO_HZ (quanta could be missed when the system is idle).
+
+ If unsure, say Yes.
+
+config RELEASE_MASTER
+ bool "Release-master Support"
+ depends on ARCH_HAS_SEND_PULL_TIMERS
+ default n
+ help
+ Allow one processor to act as a dedicated interrupt processor
+ that services all timer interrupts, but that does not schedule
+ real-time tasks. See RTSS'09 paper for details
+ (http://www.cs.unc.edu/~anderson/papers.html).
+ Currently only supported by GSN-EDF.
+
+endmenu
+
+menu "Real-Time Synchronization"
+
+config NP_SECTION
+ bool "Non-preemptive section support"
+ default n
+ help
+ Allow tasks to become non-preemptable.
+ Note that plugins still need to explicitly support non-preemptivity.
+ Currently, only GSN-EDF and PSN-EDF have such support.
+
+ This is required to support locking protocols such as the FMLP.
+ If disabled, all tasks will be considered preemptable at all times.
+
+config LITMUS_LOCKING
+ bool "Support for real-time locking protocols"
+ depends on NP_SECTION
+ default n
+ help
+ Enable LITMUS^RT's deterministic multiprocessor real-time
+ locking protocols.
+
+ Say Yes if you want to include locking protocols such as the FMLP and
+ Baker's SRP.
+
+endmenu
+
+menu "Performance Enhancements"
+
+config SCHED_CPU_AFFINITY
+ bool "Local Migration Affinity"
+ depends on X86
+ default y
+ help
+	  Rescheduled tasks prefer CPUs near their previously used CPU. This
+ may improve performance through possible preservation of cache affinity.
+
+ Warning: May make bugs harder to find since tasks may migrate less often.
+
+ NOTES:
+ * Feature is not utilized by PFair/PD^2.
+
+ Say Yes if unsure.
+
+choice
+ prompt "EDF Tie-Break Behavior"
+ default EDF_TIE_BREAK_LATENESS_NORM
+ help
+ Allows the configuration of tie-breaking behavior when the deadlines
+ of two EDF-scheduled tasks are equal.
+
+ config EDF_TIE_BREAK_LATENESS
+ bool "Lateness-based Tie Break"
+ help
+ Break ties between two jobs, A and B, based upon the lateness of their
+ prior jobs. The job with the greatest lateness has priority. Note that
+ lateness has a negative value if the prior job finished before its
+ deadline.
+
+ config EDF_TIE_BREAK_LATENESS_NORM
+ bool "Normalized Lateness-based Tie Break"
+ help
+ Break ties between two jobs, A and B, based upon the lateness, normalized
+ by relative deadline, of their prior jobs. The job with the greatest
+ normalized lateness has priority. Note that lateness has a negative value
+ if the prior job finished before its deadline.
+
+	    Normalized lateness tie-breaks are likely preferable to non-normalized
+ tie-breaks if the execution times and/or relative deadlines of tasks in a
+ task set vary greatly.
+
+ config EDF_TIE_BREAK_HASH
+ bool "Hash-based Tie Breaks"
+ help
+ Break ties between two jobs, A and B, with equal deadlines by using a
+ uniform hash; i.e.: hash(A.pid, A.job_num) < hash(B.pid, B.job_num). Job
+	    A has a ~50% chance of winning a given tie-break.
+
+ config EDF_PID_TIE_BREAK
+ bool "PID-based Tie Breaks"
+ help
+	    Break ties based upon OS-assigned thread IDs. Use this option if it is
+	    required by the algorithm's real-time analysis or if per-task
+	    response-time jitter must be minimized.
+
+ NOTES:
+	        * This tie-breaking method was the default in LITMUS^RT 2012.2 and before.
+
+endchoice
+
+endmenu
+
+menu "Tracing"
+
+config FEATHER_TRACE
+ bool "Feather-Trace Infrastructure"
+ default y
+ help
+ Feather-Trace basic tracing infrastructure. Includes device file
+ driver and instrumentation point support.
+
+ There are actually two implementations of Feather-Trace.
+ 1) A slower, but portable, default implementation.
+ 2) Architecture-specific implementations that rewrite kernel .text at runtime.
+
+ If enabled, Feather-Trace will be based on 2) if available (currently only for x86).
+ However, if DEBUG_RODATA=y, then Feather-Trace will choose option 1) in any case
+ to avoid problems with write-protected .text pages.
+
+ Bottom line: to avoid increased overheads, choose DEBUG_RODATA=n.
+
+ Note that this option only enables the basic Feather-Trace infrastructure;
+ you still need to enable SCHED_TASK_TRACE and/or SCHED_OVERHEAD_TRACE to
+ actually enable any events.
+
+config SCHED_TASK_TRACE
+ bool "Trace real-time tasks"
+ depends on FEATHER_TRACE
+ default y
+ help
+ Include support for the sched_trace_XXX() tracing functions. This
+ allows the collection of real-time task events such as job
+ completions, job releases, early completions, etc. This results in a
+ small overhead in the scheduling code. Disable if the overhead is not
+ acceptable (e.g., benchmarking).
+
+ Say Yes for debugging.
+ Say No for overhead tracing.
+
+config SCHED_TASK_TRACE_SHIFT
+ int "Buffer size for sched_trace_xxx() events"
+ depends on SCHED_TASK_TRACE
+ range 8 13
+ default 9
+ help
+
+ Select the buffer size of sched_trace_xxx() events as a power of two.
+ These buffers are statically allocated as per-CPU data. Each event
+ requires 24 bytes storage plus one additional flag byte. Too large
+ buffers can cause issues with the per-cpu allocator (and waste
+ memory). Too small buffers can cause scheduling events to be lost. The
+ "right" size is workload dependent and depends on the number of tasks,
+ each task's period, each task's number of suspensions, and how often
+ the buffer is flushed.
+
+ Examples: 12 => 4k events
+ 10 => 1k events
+ 8 => 512 events
+
+config SCHED_LITMUS_TRACEPOINT
+ bool "Enable Event/Tracepoint Tracing for real-time task tracing"
+ depends on TRACEPOINTS
+ default n
+ help
+	  Enable kernel-style events (tracepoints) for LITMUS^RT. These events
+	  trace the same functions as the sched_trace_XXX() calls above, but can
+	  be enabled independently.
+	  LITMUS^RT tracepoints can be recorded and analyzed together (single
+	  time reference) with all other kernel tracing events (e.g.,
+	  sched:sched_switch, etc.).
+
+	  This also enables a quick way to visualize schedule traces using the
+	  trace-cmd utility and the kernelshark visualizer.
+
+ Say Yes for debugging and visualization purposes.
+ Say No for overhead tracing.
+
+config SCHED_OVERHEAD_TRACE
+ bool "Record timestamps for overhead measurements"
+ depends on FEATHER_TRACE
+ default n
+ help
+ Export event stream for overhead tracing.
+ Say Yes for overhead tracing.
+
+config SCHED_DEBUG_TRACE
+ bool "TRACE() debugging"
+ default y
+ help
+	  Include support for sched_trace_log_message(), which is used to
+ implement TRACE(). If disabled, no TRACE() messages will be included
+ in the kernel, and no overheads due to debugging statements will be
+ incurred by the scheduler. Disable if the overhead is not acceptable
+ (e.g. benchmarking).
+
+ Say Yes for debugging.
+ Say No for overhead tracing.
+
+config SCHED_DEBUG_TRACE_SHIFT
+ int "Buffer size for TRACE() buffer"
+ depends on SCHED_DEBUG_TRACE
+ range 14 22
+ default 18
+ help
+
+	  Select the amount of memory needed for the TRACE() buffer, as a
+ power of two. The TRACE() buffer is global and statically allocated. If
+ the buffer is too small, there will be holes in the TRACE() log if the
+ buffer-flushing task is starved.
+
+ The default should be sufficient for most systems. Increase the buffer
+ size if the log contains holes. Reduce the buffer size when running on
+ a memory-constrained system.
+
+ Examples: 14 => 16KB
+ 18 => 256KB
+ 20 => 1MB
+
+	  This buffer is exported to userspace using a misc device as
+ 'litmus/log'. On a system with default udev rules, a corresponding
+ character device node should be created at /dev/litmus/log. The buffer
+ can be flushed using cat, e.g., 'cat /dev/litmus/log > my_log_file.txt'.
+
+config SCHED_DEBUG_TRACE_CALLER
+ bool "Include [function@file:line] tag in TRACE() log"
+ depends on SCHED_DEBUG_TRACE
+ default n
+ help
+ With this option enabled, TRACE() prepends
+
+ "[<function name>@<filename>:<line number>]"
+
+ to each message in the debug log. Enable this to aid in figuring out
+ what was called in which order. The downside is that it adds a lot of
+ clutter.
+
+ If unsure, say No.
+
+config PREEMPT_STATE_TRACE
+ bool "Trace preemption state machine transitions"
+ depends on SCHED_DEBUG_TRACE && DEBUG_KERNEL
+ default n
+ help
+ With this option enabled, each CPU will log when it transitions
+ states in the preemption state machine. This state machine is
+ used to determine how to react to IPIs (avoid races with in-flight IPIs).
+
+ Warning: this creates a lot of information in the debug trace. Only
+ recommended when you are debugging preemption-related races.
+
+ If unsure, say No.
+
+endmenu
+
+endmenu
diff --git a/litmus/Makefile b/litmus/Makefile
new file mode 100644
index 0000000..d26ca70
--- /dev/null
+++ b/litmus/Makefile
@@ -0,0 +1,32 @@
+#
+# Makefile for LITMUS^RT
+#
+
+obj-y = sched_plugin.o litmus.o \
+ preempt.o \
+ litmus_proc.o \
+ budget.o \
+ clustered.o \
+ jobs.o \
+ sync.o \
+ rt_domain.o \
+ edf_common.o \
+ fp_common.o \
+ fdso.o \
+ locking.o \
+ srp.o \
+ bheap.o \
+ binheap.o \
+ ctrldev.o \
+ sched_gsn_edf.o \
+ sched_psn_edf.o \
+ sched_pfp.o
+
+obj-$(CONFIG_PLUGIN_CEDF) += sched_cedf.o
+obj-$(CONFIG_PLUGIN_PFAIR) += sched_pfair.o
+obj-$(CONFIG_SCHED_CPU_AFFINITY) += affinity.o
+
+obj-$(CONFIG_FEATHER_TRACE) += ft_event.o ftdev.o
+obj-$(CONFIG_SCHED_TASK_TRACE) += sched_task_trace.o
+obj-$(CONFIG_SCHED_DEBUG_TRACE) += sched_trace.o
+obj-$(CONFIG_SCHED_OVERHEAD_TRACE) += trace.o
diff --git a/litmus/affinity.c b/litmus/affinity.c
new file mode 100644
index 0000000..3fa6dd7
--- /dev/null
+++ b/litmus/affinity.c
@@ -0,0 +1,42 @@
+#include <linux/cpu.h>
+
+#include <litmus/affinity.h>
+
+struct neighborhood neigh_info[NR_CPUS];
+
+/* called by _init_litmus() */
+void init_topology(void) {
+ int cpu;
+ int i;
+ int chk;
+ int depth = num_cache_leaves;
+
+ if (depth > NUM_CACHE_LEVELS)
+ depth = NUM_CACHE_LEVELS;
+
+ for_each_online_cpu(cpu) {
+ for (i = 0; i < depth; ++i) {
+ chk = get_shared_cpu_map((struct cpumask *)&neigh_info[cpu].neighbors[i], cpu, i);
+ if (chk) {
+ /* failed */
+ neigh_info[cpu].size[i] = 0;
+ } else {
+ /* size = num bits in mask */
+ neigh_info[cpu].size[i] =
+ cpumask_weight((struct cpumask *)&neigh_info[cpu].neighbors[i]);
+ }
+ printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
+ cpu, neigh_info[cpu].size[i], i,
+ *cpumask_bits(neigh_info[cpu].neighbors[i]));
+ }
+
+ /* set data for non-existent levels */
+ for (; i < NUM_CACHE_LEVELS; ++i) {
+ neigh_info[cpu].size[i] = 0;
+
+ printk("CPU %d has %d neighbors at level %d. (mask = %lx)\n",
+ cpu, neigh_info[cpu].size[i], i, 0lu);
+ }
+ }
+}
+
diff --git a/litmus/bheap.c b/litmus/bheap.c
new file mode 100644
index 0000000..528af97
--- /dev/null
+++ b/litmus/bheap.c
@@ -0,0 +1,314 @@
+#include "linux/kernel.h"
+#include "litmus/bheap.h"
+
+void bheap_init(struct bheap* heap)
+{
+ heap->head = NULL;
+ heap->min = NULL;
+}
+
+void bheap_node_init(struct bheap_node** _h, void* value)
+{
+ struct bheap_node* h = *_h;
+ h->parent = NULL;
+ h->next = NULL;
+ h->child = NULL;
+ h->degree = NOT_IN_HEAP;
+ h->value = value;
+ h->ref = _h;
+}
+
+
+/* make child a subtree of root */
+static void __bheap_link(struct bheap_node* root,
+ struct bheap_node* child)
+{
+ child->parent = root;
+ child->next = root->child;
+ root->child = child;
+ root->degree++;
+}
+
+/* merge root lists */
+static struct bheap_node* __bheap_merge(struct bheap_node* a,
+ struct bheap_node* b)
+{
+ struct bheap_node* head = NULL;
+ struct bheap_node** pos = &head;
+
+ while (a && b) {
+ if (a->degree < b->degree) {
+ *pos = a;
+ a = a->next;
+ } else {
+ *pos = b;
+ b = b->next;
+ }
+ pos = &(*pos)->next;
+ }
+ if (a)
+ *pos = a;
+ else
+ *pos = b;
+ return head;
+}
+
+/* reverse a linked list of nodes. also clears parent pointer */
+static struct bheap_node* __bheap_reverse(struct bheap_node* h)
+{
+ struct bheap_node* tail = NULL;
+ struct bheap_node* next;
+
+ if (!h)
+ return h;
+
+ h->parent = NULL;
+ while (h->next) {
+ next = h->next;
+ h->next = tail;
+ tail = h;
+ h = next;
+ h->parent = NULL;
+ }
+ h->next = tail;
+ return h;
+}
+
+static void __bheap_min(bheap_prio_t higher_prio, struct bheap* heap,
+ struct bheap_node** prev, struct bheap_node** node)
+{
+ struct bheap_node *_prev, *cur;
+ *prev = NULL;
+
+ if (!heap->head) {
+ *node = NULL;
+ return;
+ }
+
+ *node = heap->head;
+ _prev = heap->head;
+ cur = heap->head->next;
+ while (cur) {
+ if (higher_prio(cur, *node)) {
+ *node = cur;
+ *prev = _prev;
+ }
+ _prev = cur;
+ cur = cur->next;
+ }
+}
+
+static void __bheap_union(bheap_prio_t higher_prio, struct bheap* heap,
+ struct bheap_node* h2)
+{
+ struct bheap_node* h1;
+ struct bheap_node *prev, *x, *next;
+ if (!h2)
+ return;
+ h1 = heap->head;
+ if (!h1) {
+ heap->head = h2;
+ return;
+ }
+ h1 = __bheap_merge(h1, h2);
+ prev = NULL;
+ x = h1;
+ next = x->next;
+ while (next) {
+ if (x->degree != next->degree ||
+ (next->next && next->next->degree == x->degree)) {
+ /* nothing to do, advance */
+ prev = x;
+ x = next;
+ } else if (higher_prio(x, next)) {
+ /* x becomes the root of next */
+ x->next = next->next;
+ __bheap_link(x, next);
+ } else {
+ /* next becomes the root of x */
+ if (prev)
+ prev->next = next;
+ else
+ h1 = next;
+ __bheap_link(next, x);
+ x = next;
+ }
+ next = x->next;
+ }
+ heap->head = h1;
+}
+
+static struct bheap_node* __bheap_extract_min(bheap_prio_t higher_prio,
+ struct bheap* heap)
+{
+ struct bheap_node *prev, *node;
+ __bheap_min(higher_prio, heap, &prev, &node);
+ if (!node)
+ return NULL;
+ if (prev)
+ prev->next = node->next;
+ else
+ heap->head = node->next;
+ __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
+ return node;
+}
+
+/* insert (and reinitialize) a node into the heap */
+void bheap_insert(bheap_prio_t higher_prio, struct bheap* heap,
+ struct bheap_node* node)
+{
+ struct bheap_node *min;
+ node->child = NULL;
+ node->parent = NULL;
+ node->next = NULL;
+ node->degree = 0;
+ if (heap->min && higher_prio(node, heap->min)) {
+ /* swap min cache */
+ min = heap->min;
+ min->child = NULL;
+ min->parent = NULL;
+ min->next = NULL;
+ min->degree = 0;
+ __bheap_union(higher_prio, heap, min);
+ heap->min = node;
+ } else
+ __bheap_union(higher_prio, heap, node);
+}
+
+void bheap_uncache_min(bheap_prio_t higher_prio, struct bheap* heap)
+{
+ struct bheap_node* min;
+ if (heap->min) {
+ min = heap->min;
+ heap->min = NULL;
+ bheap_insert(higher_prio, heap, min);
+ }
+}
+
+/* merge addition into target */
+void bheap_union(bheap_prio_t higher_prio,
+ struct bheap* target, struct bheap* addition)
+{
+ /* first insert any cached minima, if necessary */
+ bheap_uncache_min(higher_prio, target);
+ bheap_uncache_min(higher_prio, addition);
+ __bheap_union(higher_prio, target, addition->head);
+ /* this is a destructive merge */
+ addition->head = NULL;
+}
+
+struct bheap_node* bheap_peek(bheap_prio_t higher_prio,
+ struct bheap* heap)
+{
+ if (!heap->min)
+ heap->min = __bheap_extract_min(higher_prio, heap);
+ return heap->min;
+}
+
+struct bheap_node* bheap_take(bheap_prio_t higher_prio,
+ struct bheap* heap)
+{
+ struct bheap_node *node;
+ if (!heap->min)
+ heap->min = __bheap_extract_min(higher_prio, heap);
+ node = heap->min;
+ heap->min = NULL;
+ if (node)
+ node->degree = NOT_IN_HEAP;
+ return node;
+}
+
+int bheap_decrease(bheap_prio_t higher_prio, struct bheap_node* node)
+{
+ struct bheap_node *parent;
+ struct bheap_node** tmp_ref;
+ void* tmp;
+
+ /* bubble up */
+ parent = node->parent;
+ while (parent && higher_prio(node, parent)) {
+ /* swap parent and node */
+ tmp = parent->value;
+ parent->value = node->value;
+ node->value = tmp;
+ /* swap references */
+ *(parent->ref) = node;
+ *(node->ref) = parent;
+ tmp_ref = parent->ref;
+ parent->ref = node->ref;
+ node->ref = tmp_ref;
+ /* step up */
+ node = parent;
+ parent = node->parent;
+ }
+
+ return parent != NULL;
+}
+
+void bheap_delete(bheap_prio_t higher_prio, struct bheap* heap,
+ struct bheap_node* node)
+{
+ struct bheap_node *parent, *prev, *pos;
+ struct bheap_node** tmp_ref;
+ void* tmp;
+
+ if (heap->min != node) {
+ /* bubble up */
+ parent = node->parent;
+ while (parent) {
+ /* swap parent and node */
+ tmp = parent->value;
+ parent->value = node->value;
+ node->value = tmp;
+ /* swap references */
+ *(parent->ref) = node;
+ *(node->ref) = parent;
+ tmp_ref = parent->ref;
+ parent->ref = node->ref;
+ node->ref = tmp_ref;
+ /* step up */
+ node = parent;
+ parent = node->parent;
+ }
+ /* now delete:
+ * first find prev */
+ prev = NULL;
+ pos = heap->head;
+ while (pos != node) {
+ prev = pos;
+ pos = pos->next;
+ }
+ /* we have prev, now remove node */
+ if (prev)
+ prev->next = node->next;
+ else
+ heap->head = node->next;
+ __bheap_union(higher_prio, heap, __bheap_reverse(node->child));
+ } else
+ heap->min = NULL;
+ node->degree = NOT_IN_HEAP;
+}
+
+/* allocate a heap node for value and insert into the heap */
+int bheap_add(bheap_prio_t higher_prio, struct bheap* heap,
+ void* value, int gfp_flags)
+{
+ struct bheap_node* hn = bheap_node_alloc(gfp_flags);
+ if (likely(hn)) {
+ bheap_node_init(&hn, value);
+ bheap_insert(higher_prio, heap, hn);
+ }
+ return hn != NULL;
+}
+
+void* bheap_take_del(bheap_prio_t higher_prio,
+ struct bheap* heap)
+{
+ struct bheap_node* hn = bheap_take(higher_prio, heap);
+ void* ret = NULL;
+ if (hn) {
+ ret = hn->value;
+ bheap_node_free(hn);
+ }
+ return ret;
+}
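A minimal sketch of the intended usage: a tiny heap keyed on an integer stored directly in the value pointer. The comparison function and payload are illustrative assumptions; real plugins store task_struct pointers and compare by EDF priority.

    #include <litmus/bheap.h>

    /* Sketch only: smaller key = higher priority. */
    static int example_higher_prio(struct bheap_node *a, struct bheap_node *b)
    {
            return (unsigned long) a->value < (unsigned long) b->value;
    }

    static void example(void)
    {
            struct bheap heap;
            struct bheap_node n1, n2;
            struct bheap_node *p1 = &n1, *p2 = &n2;
            struct bheap_node *min;

            bheap_init(&heap);
            bheap_node_init(&p1, (void *) 10UL);
            bheap_node_init(&p2, (void *) 5UL);
            bheap_insert(example_higher_prio, &heap, &n1);
            bheap_insert(example_higher_prio, &heap, &n2);

            min = bheap_take(example_higher_prio, &heap);
            /* min->value == (void *) 5UL, the highest-priority element */
            (void) min;
    }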
diff --git a/litmus/binheap.c b/litmus/binheap.c
new file mode 100644
index 0000000..40a913f
--- /dev/null
+++ b/litmus/binheap.c
@@ -0,0 +1,388 @@
+#include <litmus/binheap.h>
+
+/* Returns true if the root ancestor of node is the root of the given heap. */
+int binheap_is_in_this_heap(struct binheap_node *node,
+ struct binheap* heap)
+{
+ if(!binheap_is_in_heap(node)) {
+ return 0;
+ }
+
+ while(node->parent != NULL) {
+ node = node->parent;
+ }
+
+ return (node == heap->root);
+}
+
+
+/* Update the node reference pointers. Same logic as Litmus binomial heap. */
+static void __update_ref(struct binheap_node *parent,
+ struct binheap_node *child)
+{
+ *(parent->ref_ptr) = child;
+ *(child->ref_ptr) = parent;
+
+ swap(parent->ref_ptr, child->ref_ptr);
+}
+
+
+/* Swaps data between two nodes. */
+static void __binheap_swap(struct binheap_node *parent,
+ struct binheap_node *child)
+{
+ swap(parent->data, child->data);
+ __update_ref(parent, child);
+}
+
+
+/* Swaps memory and data between two nodes. Actual nodes swap instead of
+ * just data. Needed when we delete nodes from the heap.
+ */
+static void __binheap_swap_safe(struct binheap *handle,
+ struct binheap_node *a,
+ struct binheap_node *b)
+{
+ swap(a->data, b->data);
+ __update_ref(a, b);
+
+ if((a->parent != NULL) && (a->parent == b->parent)) {
+ /* special case: shared parent */
+ swap(a->parent->left, a->parent->right);
+ }
+ else {
+ /* Update pointers to swap parents. */
+
+ if(a->parent) {
+ if(a == a->parent->left) {
+ a->parent->left = b;
+ }
+ else {
+ a->parent->right = b;
+ }
+ }
+
+ if(b->parent) {
+ if(b == b->parent->left) {
+ b->parent->left = a;
+ }
+ else {
+ b->parent->right = a;
+ }
+ }
+
+ swap(a->parent, b->parent);
+ }
+
+ /* swap children */
+
+ if(a->left) {
+ a->left->parent = b;
+
+ if(a->right) {
+ a->right->parent = b;
+ }
+ }
+
+ if(b->left) {
+ b->left->parent = a;
+
+ if(b->right) {
+ b->right->parent = a;
+ }
+ }
+
+ swap(a->left, b->left);
+ swap(a->right, b->right);
+
+
+ /* update next/last/root pointers */
+
+ if(a == handle->next) {
+ handle->next = b;
+ }
+ else if(b == handle->next) {
+ handle->next = a;
+ }
+
+ if(a == handle->last) {
+ handle->last = b;
+ }
+ else if(b == handle->last) {
+ handle->last = a;
+ }
+
+ if(a == handle->root) {
+ handle->root = b;
+ }
+ else if(b == handle->root) {
+ handle->root = a;
+ }
+}
+
+
+/**
+ * Update the pointer to the last node in the complete binary tree.
+ * Called internally after the root node has been deleted.
+ */
+static void __binheap_update_last(struct binheap *handle)
+{
+ struct binheap_node *temp = handle->last;
+
+ /* find a "bend" in the tree. */
+ while(temp->parent && (temp == temp->parent->left)) {
+ temp = temp->parent;
+ }
+
+ /* step over to sibling if we're not at root */
+ if(temp->parent != NULL) {
+ temp = temp->parent->left;
+ }
+
+ /* now travel right as far as possible. */
+ while(temp->right != NULL) {
+ temp = temp->right;
+ }
+
+ /* take one step to the left if we're not at the bottom-most level. */
+ if(temp->left != NULL) {
+ temp = temp->left;
+ }
+
+ handle->last = temp;
+}
+
+
+/**
+ * Update the pointer to the node that will take the next inserted node.
+ * Called internally after a node has been inserted.
+ */
+static void __binheap_update_next(struct binheap *handle)
+{
+ struct binheap_node *temp = handle->next;
+
+ /* find a "bend" in the tree. */
+ while(temp->parent && (temp == temp->parent->right)) {
+ temp = temp->parent;
+ }
+
+ /* step over to sibling if we're not at root */
+ if(temp->parent != NULL) {
+ temp = temp->parent->right;
+ }
+
+ /* now travel left as far as possible. */
+ while(temp->left != NULL) {
+ temp = temp->left;
+ }
+
+ handle->next = temp;
+}
+
+
+
+/* bubble node up towards root */
+static void __binheap_bubble_up(struct binheap *handle,
+ struct binheap_node *node)
+{
+ /* let BINHEAP_POISON data bubble to the top */
+
+ while((node->parent != NULL) &&
+ ((node->data == BINHEAP_POISON) ||
+ handle->compare(node, node->parent))) {
+ __binheap_swap(node->parent, node);
+ node = node->parent;
+ }
+}
+
+
+/* bubble node down, swapping with min-child */
+static void __binheap_bubble_down(struct binheap *handle)
+{
+ struct binheap_node *node = handle->root;
+
+ while(node->left != NULL) {
+ if(node->right && handle->compare(node->right, node->left)) {
+ if(handle->compare(node->right, node)) {
+ __binheap_swap(node, node->right);
+ node = node->right;
+ }
+ else {
+ break;
+ }
+ }
+ else {
+ if(handle->compare(node->left, node)) {
+ __binheap_swap(node, node->left);
+ node = node->left;
+ }
+ else {
+ break;
+ }
+ }
+ }
+}
+
+
+void __binheap_add(struct binheap_node *new_node,
+ struct binheap *handle,
+ void *data)
+{
+ new_node->data = data;
+ new_node->ref = new_node;
+ new_node->ref_ptr = &(new_node->ref);
+
+ if(!binheap_empty(handle)) {
+ /* insert left side first */
+ if(handle->next->left == NULL) {
+ handle->next->left = new_node;
+ new_node->parent = handle->next;
+ new_node->left = NULL;
+ new_node->right = NULL;
+
+ handle->last = new_node;
+
+ __binheap_bubble_up(handle, new_node);
+ }
+ else {
+ /* left occupied. insert right. */
+ handle->next->right = new_node;
+ new_node->parent = handle->next;
+ new_node->left = NULL;
+ new_node->right = NULL;
+
+ handle->last = new_node;
+
+ __binheap_update_next(handle);
+ __binheap_bubble_up(handle, new_node);
+ }
+ }
+ else {
+ /* first node in heap */
+
+ new_node->parent = NULL;
+ new_node->left = NULL;
+ new_node->right = NULL;
+
+ handle->root = new_node;
+ handle->next = new_node;
+ handle->last = new_node;
+ }
+}
+
+
+/**
+ * Removes the root node from the heap. The node is removed after coalescing
+ * the binheap_node with its original data pointer at the root of the tree.
+ *
+ * The 'last' node in the tree is then swapped up to the root and bubbled
+ * down.
+ */
+void __binheap_delete_root(struct binheap *handle,
+ struct binheap_node *container)
+{
+ struct binheap_node *root = handle->root;
+
+ if(root != container) {
+ /* coalesce */
+ __binheap_swap_safe(handle, root, container);
+ root = container;
+ }
+
+ if(handle->last != root) {
+ /* swap 'last' node up to root and bubble it down. */
+
+ struct binheap_node *to_move = handle->last;
+
+ if(to_move->parent != root) {
+ handle->next = to_move->parent;
+
+ if(handle->next->right == to_move) {
+ /* disconnect from parent */
+ to_move->parent->right = NULL;
+ handle->last = handle->next->left;
+ }
+ else {
+ /* find new 'last' before we disconnect */
+ __binheap_update_last(handle);
+
+ /* disconnect from parent */
+ to_move->parent->left = NULL;
+ }
+ }
+ else {
+ /* 'last' is direct child of root */
+
+ handle->next = to_move;
+
+ if(to_move == to_move->parent->right) {
+ to_move->parent->right = NULL;
+ handle->last = to_move->parent->left;
+ }
+ else {
+ to_move->parent->left = NULL;
+ handle->last = to_move;
+ }
+ }
+ to_move->parent = NULL;
+
+ /* reconnect as root. We can't just swap data ptrs since root node
+ * may be freed after this function returns.
+ */
+ to_move->left = root->left;
+ to_move->right = root->right;
+ if(to_move->left != NULL) {
+ to_move->left->parent = to_move;
+ }
+ if(to_move->right != NULL) {
+ to_move->right->parent = to_move;
+ }
+
+ handle->root = to_move;
+
+ /* bubble down */
+ __binheap_bubble_down(handle);
+ }
+ else {
+ /* removing last node in tree */
+ handle->root = NULL;
+ handle->next = NULL;
+ handle->last = NULL;
+ }
+
+ /* mark as removed */
+ container->parent = BINHEAP_POISON;
+}
+
+
+/**
+ * Delete an arbitrary node: bubble the node to delete up to the root,
+ * and then delete the root.
+ */
+void __binheap_delete(struct binheap_node *node_to_delete,
+ struct binheap *handle)
+{
+ struct binheap_node *target = node_to_delete->ref;
+ void *temp_data = target->data;
+
+	/* temporarily poison the data pointer so the node bubbles up to the top. */
+ target->data = BINHEAP_POISON;
+
+ __binheap_bubble_up(handle, target);
+ __binheap_delete_root(handle, node_to_delete);
+
+ node_to_delete->data = temp_data; /* restore node data pointer */
+}
+
+
+/**
+ * Bubble up a node whose pointer has decreased in value.
+ */
+void __binheap_decrease(struct binheap_node *orig_node,
+ struct binheap *handle)
+{
+ struct binheap_node *target = orig_node->ref;
+
+ __binheap_bubble_up(handle, target);
+}
+
diff --git a/litmus/budget.c b/litmus/budget.c
new file mode 100644
index 0000000..f7712be
--- /dev/null
+++ b/litmus/budget.c
@@ -0,0 +1,113 @@
+#include <linux/sched.h>
+#include <linux/percpu.h>
+#include <linux/hrtimer.h>
+
+#include <litmus/litmus.h>
+#include <litmus/preempt.h>
+
+#include <litmus/budget.h>
+
+struct enforcement_timer {
+ /* The enforcement timer is used to accurately police
+ * slice budgets. */
+ struct hrtimer timer;
+ int armed;
+};
+
+DEFINE_PER_CPU(struct enforcement_timer, budget_timer);
+
+static enum hrtimer_restart on_enforcement_timeout(struct hrtimer *timer)
+{
+ struct enforcement_timer* et = container_of(timer,
+ struct enforcement_timer,
+ timer);
+ unsigned long flags;
+
+ local_irq_save(flags);
+ TRACE("enforcement timer fired.\n");
+ et->armed = 0;
+ /* activate scheduler */
+ litmus_reschedule_local();
+ local_irq_restore(flags);
+
+ return HRTIMER_NORESTART;
+}
+
+/* assumes called with IRQs off */
+static void cancel_enforcement_timer(struct enforcement_timer* et)
+{
+ int ret;
+
+ TRACE("cancelling enforcement timer.\n");
+
+ /* Since interrupts are disabled and et->armed is only
+ * modified locally, we do not need any locks.
+ */
+
+ if (et->armed) {
+ ret = hrtimer_try_to_cancel(&et->timer);
+ /* Should never be inactive. */
+ BUG_ON(ret == 0);
+ /* Should never be running concurrently. */
+ BUG_ON(ret == -1);
+
+ et->armed = 0;
+ }
+}
+
+/* assumes called with IRQs off */
+static void arm_enforcement_timer(struct enforcement_timer* et,
+ struct task_struct* t)
+{
+ lt_t when_to_fire;
+ TRACE_TASK(t, "arming enforcement timer.\n");
+
+ /* Calling this when there is no budget left for the task
+ * makes no sense, unless the task is non-preemptive. */
+ BUG_ON(budget_exhausted(t) && (!is_np(t)));
+
+ /* __hrtimer_start_range_ns() cancels the timer
+ * anyway, so we don't have to check whether it is still armed */
+
+ if (likely(!is_np(t))) {
+ when_to_fire = litmus_clock() + budget_remaining(t);
+ __hrtimer_start_range_ns(&et->timer,
+ ns_to_ktime(when_to_fire),
+ 0 /* delta */,
+ HRTIMER_MODE_ABS_PINNED,
+ 0 /* no wakeup */);
+ et->armed = 1;
+ }
+}
+
+
+/* expects to be called with IRQs off */
+void update_enforcement_timer(struct task_struct* t)
+{
+ struct enforcement_timer* et = &__get_cpu_var(budget_timer);
+
+ if (t && budget_precisely_enforced(t)) {
+ /* Make sure we call into the scheduler when this budget
+ * expires. */
+ arm_enforcement_timer(et, t);
+ } else if (et->armed) {
+ /* Make sure we don't cause unnecessary interrupts. */
+ cancel_enforcement_timer(et);
+ }
+}
+
+
+static int __init init_budget_enforcement(void)
+{
+ int cpu;
+ struct enforcement_timer* et;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ et = &per_cpu(budget_timer, cpu);
+ hrtimer_init(&et->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ et->timer.function = on_enforcement_timeout;
+ }
+ return 0;
+}
+
+module_init(init_budget_enforcement);
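For context, plugins that support precise budget enforcement are expected to call update_enforcement_timer() from their scheduling path with interrupts off; a minimal sketch (the callback name is an assumption, not part of the patch):

    #include <linux/sched.h>
    #include <litmus/budget.h>

    /* Sketch only: called with IRQs off after the next task has been chosen. */
    static void example_finish_scheduling(struct task_struct *next)
    {
            /* arms the per-CPU timer for precisely enforced budgets,
             * cancels a stale timer otherwise */
            update_enforcement_timer(next);
    }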
diff --git a/litmus/clustered.c b/litmus/clustered.c
new file mode 100644
index 0000000..6fe1b51
--- /dev/null
+++ b/litmus/clustered.c
@@ -0,0 +1,111 @@
+#include <linux/gfp.h>
+#include <linux/cpumask.h>
+#include <linux/list.h>
+
+#include <litmus/clustered.h>
+
+#ifndef CONFIG_X86
+/* fake get_shared_cpu_map() on non-x86 architectures */
+
+int get_shared_cpu_map(cpumask_var_t mask, unsigned int cpu, int index)
+{
+ if (index != 1)
+ return 1;
+ else {
+ /* Fake L1: CPU is all by itself. */
+ cpumask_clear(mask);
+ cpumask_set_cpu(cpu, mask);
+ return 0;
+ }
+}
+
+#endif
+
+int get_cluster_size(enum cache_level level)
+{
+ cpumask_var_t mask;
+ int ok;
+ int num_cpus;
+
+ if (level == GLOBAL_CLUSTER)
+ return num_online_cpus();
+ else {
+ if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+ return -ENOMEM;
+ /* assumes CPU 0 is representative of all CPUs */
+ ok = get_shared_cpu_map(mask, 0, level);
+ /* ok == 0 means we got the map; otherwise it's an invalid cache level */
+ if (ok == 0)
+ num_cpus = cpumask_weight(mask);
+ free_cpumask_var(mask);
+
+ if (ok == 0)
+ return num_cpus;
+ else
+ return -EINVAL;
+ }
+}
+
+int assign_cpus_to_clusters(enum cache_level level,
+ struct scheduling_cluster* clusters[],
+ unsigned int num_clusters,
+ struct cluster_cpu* cpus[],
+ unsigned int num_cpus)
+{
+ cpumask_var_t mask;
+ unsigned int i, free_cluster = 0, low_cpu;
+ int err = 0;
+
+ if (!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ /* clear cluster pointers */
+ for (i = 0; i < num_cpus; i++) {
+ cpus[i]->id = i;
+ cpus[i]->cluster = NULL;
+ }
+
+ /* initialize clusters */
+ for (i = 0; i < num_clusters; i++) {
+ clusters[i]->id = i;
+ INIT_LIST_HEAD(&clusters[i]->cpus);
+ }
+
+	/* Assign each CPU. Two assumptions are made:
+ * 1) The index of a cpu in cpus corresponds to its processor id (i.e., the index in a cpu mask).
+ * 2) All cpus that belong to some cluster are online.
+ */
+ for_each_online_cpu(i) {
+ /* get lowest-id CPU in cluster */
+ if (level != GLOBAL_CLUSTER) {
+ err = get_shared_cpu_map(mask, cpus[i]->id, level);
+ if (err != 0) {
+ /* ugh... wrong cache level? Either caller screwed up
+ * or the CPU topology is weird. */
+ printk(KERN_ERR "Could not set up clusters for L%d sharing (max: L%d).\n",
+ level, err);
+ err = -EINVAL;
+ goto out;
+ }
+ low_cpu = cpumask_first(mask);
+ } else
+ low_cpu = 0;
+ if (low_cpu == i) {
+ /* caller must provide an appropriate number of clusters */
+ BUG_ON(free_cluster >= num_clusters);
+
+ /* create new cluster */
+ cpus[i]->cluster = clusters[free_cluster++];
+ } else {
+ /* low_cpu points to the right cluster
+ * Assumption: low_cpu is actually online and was processed earlier. */
+ cpus[i]->cluster = cpus[low_cpu]->cluster;
+ }
+ /* enqueue in cpus list */
+ list_add_tail(&cpus[i]->cluster_list, &cpus[i]->cluster->cpus);
+		printk(KERN_INFO "Assigning CPU%u to cluster %u.\n", i, cpus[i]->cluster->id);
+ }
+out:
+ free_cpumask_var(mask);
+ return err;
+}
diff --git a/litmus/ctrldev.c b/litmus/ctrldev.c
new file mode 100644
index 0000000..41919b2
--- /dev/null
+++ b/litmus/ctrldev.c
@@ -0,0 +1,160 @@
+#include <linux/sched.h>
+#include <linux/mm.h>
+#include <linux/fs.h>
+#include <linux/miscdevice.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+
+/* only one page for now, but we might want to add a RO version at some point */
+
+#define CTRL_NAME "litmus/ctrl"
+
+/* allocate t->rt_param.ctrl_page*/
+static int alloc_ctrl_page(struct task_struct *t)
+{
+ int err = 0;
+
+ /* only allocate if the task doesn't have one yet */
+ if (!tsk_rt(t)->ctrl_page) {
+ tsk_rt(t)->ctrl_page = (void*) get_zeroed_page(GFP_KERNEL);
+ if (!tsk_rt(t)->ctrl_page)
+ err = -ENOMEM;
+ /* will get de-allocated in task teardown */
+ TRACE_TASK(t, "%s ctrl_page = %p\n", __FUNCTION__,
+ tsk_rt(t)->ctrl_page);
+ }
+ return err;
+}
+
+static int map_ctrl_page(struct task_struct *t, struct vm_area_struct* vma)
+{
+ int err;
+
+ struct page* ctrl = virt_to_page(tsk_rt(t)->ctrl_page);
+
+ TRACE_CUR(CTRL_NAME
+ ": mapping %p (pfn:%lx) to 0x%lx (prot:%lx)\n",
+		tsk_rt(t)->ctrl_page, page_to_pfn(ctrl), vma->vm_start,
+ vma->vm_page_prot);
+
+ /* Map it into the vma. */
+ err = vm_insert_page(vma, vma->vm_start, ctrl);
+
+ if (err)
+ TRACE_CUR(CTRL_NAME ": vm_insert_page() failed (%d)\n", err);
+
+ return err;
+}
+
+static void litmus_ctrl_vm_close(struct vm_area_struct* vma)
+{
+ TRACE_CUR("%s flags=0x%x prot=0x%x\n", __FUNCTION__,
+ vma->vm_flags, vma->vm_page_prot);
+
+ TRACE_CUR(CTRL_NAME
+ ": %p:%p vma:%p vma->vm_private_data:%p closed.\n",
+ (void*) vma->vm_start, (void*) vma->vm_end, vma,
+ vma->vm_private_data);
+}
+
+static int litmus_ctrl_vm_fault(struct vm_area_struct* vma,
+ struct vm_fault* vmf)
+{
+ TRACE_CUR("%s flags=0x%x (off:%ld)\n", __FUNCTION__,
+ vma->vm_flags, vmf->pgoff);
+
+ /* This function should never be called, since all pages should have
+ * been mapped by mmap() already. */
+ WARN_ONCE(1, "Page faults should be impossible in the control page\n");
+
+ return VM_FAULT_SIGBUS;
+}
+
+static struct vm_operations_struct litmus_ctrl_vm_ops = {
+ .close = litmus_ctrl_vm_close,
+ .fault = litmus_ctrl_vm_fault,
+};
+
+static int litmus_ctrl_mmap(struct file* filp, struct vm_area_struct* vma)
+{
+ int err = 0;
+
+ /* first make sure mapper knows what he's doing */
+
+ /* you can only get one page */
+ if (vma->vm_end - vma->vm_start != PAGE_SIZE)
+ return -EINVAL;
+
+ /* you can only map the "first" page */
+ if (vma->vm_pgoff != 0)
+ return -EINVAL;
+
+ /* you can't share it with anyone */
+ if (vma->vm_flags & (VM_MAYSHARE | VM_SHARED))
+ return -EINVAL;
+
+ vma->vm_ops = &litmus_ctrl_vm_ops;
+ /* This mapping should not be kept across forks,
+ * cannot be expanded, and is not a "normal" page. */
+ vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND | VM_IO;
+
+ /* We don't want the first write access to trigger a "minor" page fault
+ * to mark the page as dirty. This is transient, private memory, we
+ * don't care if it was touched or not. __S011 means RW access, but not
+ * execute, and avoids copy-on-write behavior.
+ * See protection_map in mmap.c. */
+ vma->vm_page_prot = __S011;
+
+ err = alloc_ctrl_page(current);
+ if (!err)
+ err = map_ctrl_page(current, vma);
+
+ TRACE_CUR("%s flags=0x%x prot=0x%lx\n",
+ __FUNCTION__, vma->vm_flags, vma->vm_page_prot);
+
+ return err;
+}
+
+static struct file_operations litmus_ctrl_fops = {
+ .owner = THIS_MODULE,
+ .mmap = litmus_ctrl_mmap,
+};
+
+static struct miscdevice litmus_ctrl_dev = {
+ .name = CTRL_NAME,
+ .minor = MISC_DYNAMIC_MINOR,
+ .fops = &litmus_ctrl_fops,
+};
+
+static int __init init_litmus_ctrl_dev(void)
+{
+ int err;
+
+ BUILD_BUG_ON(sizeof(struct control_page) > PAGE_SIZE);
+
+ BUILD_BUG_ON(sizeof(union np_flag) != sizeof(uint64_t));
+
+ BUILD_BUG_ON(offsetof(struct control_page, sched.raw)
+ != LITMUS_CP_OFFSET_SCHED);
+ BUILD_BUG_ON(offsetof(struct control_page, irq_count)
+ != LITMUS_CP_OFFSET_IRQ_COUNT);
+ BUILD_BUG_ON(offsetof(struct control_page, ts_syscall_start)
+ != LITMUS_CP_OFFSET_TS_SC_START);
+ BUILD_BUG_ON(offsetof(struct control_page, irq_syscall_start)
+ != LITMUS_CP_OFFSET_IRQ_SC_START);
+
+ printk("Initializing LITMUS^RT control device.\n");
+ err = misc_register(&litmus_ctrl_dev);
+ if (err)
+ printk("Could not allocate %s device (%d).\n", CTRL_NAME, err);
+ return err;
+}
+
+static void __exit exit_litmus_ctrl_dev(void)
+{
+ misc_deregister(&litmus_ctrl_dev);
+}
+
+module_init(init_litmus_ctrl_dev);
+module_exit(exit_litmus_ctrl_dev);
diff --git a/litmus/edf_common.c b/litmus/edf_common.c
new file mode 100644
index 0000000..5aca293
--- /dev/null
+++ b/litmus/edf_common.c
@@ -0,0 +1,200 @@
+/*
+ * kernel/edf_common.c
+ *
+ * Common functions for EDF based scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/edf_common.h>
+
+#ifdef CONFIG_EDF_TIE_BREAK_LATENESS_NORM
+#include <litmus/fpmath.h>
+#endif
+
+#ifdef CONFIG_EDF_TIE_BREAK_HASH
+#include <linux/hash.h>
+static inline long edf_hash(struct task_struct *t)
+{
+ /* pid is 32 bits, so normally we would shove that into the
+	 * upper 32 bits and put the job number in the bottom 32 bits
+	 * and hash the 64-bit number with hash_64(). Sadly,
+	 * in testing, hash_64() doesn't distribute keys where the
+ * upper bits are close together (as would be the case with
+ * pids) and job numbers are equal (as would be the case with
+ * synchronous task sets with all relative deadlines equal).
+ *
+ * A 2006 Linux patch proposed the following solution
+ * (but for some reason it wasn't accepted...).
+ *
+ * At least this workaround works for 32-bit systems as well.
+ */
+ return hash_32(hash_32((u32)tsk_rt(t)->job_params.job_no, 32) ^ t->pid, 32);
+}
+#endif
+
+
+/* edf_higher_prio - returns true if first has a higher EDF priority
+ * than second. Deadline ties are broken by PID.
+ *
+ * both first and second may be NULL
+ */
+int edf_higher_prio(struct task_struct* first,
+ struct task_struct* second)
+{
+ struct task_struct *first_task = first;
+ struct task_struct *second_task = second;
+
+ /* There is no point in comparing a task to itself. */
+ if (first && first == second) {
+ TRACE_TASK(first,
+ "WARNING: pointless edf priority comparison.\n");
+ return 0;
+ }
+
+
+ /* check for NULL tasks */
+ if (!first || !second)
+ return first && !second;
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+ /* Check for inherited priorities. Change task
+ * used for comparison in such a case.
+ */
+ if (unlikely(first->rt_param.inh_task))
+ first_task = first->rt_param.inh_task;
+ if (unlikely(second->rt_param.inh_task))
+ second_task = second->rt_param.inh_task;
+
+ /* Check for priority boosting. Tie-break by start of boosting.
+ */
+ if (unlikely(is_priority_boosted(first_task))) {
+ /* first_task is boosted, how about second_task? */
+ if (!is_priority_boosted(second_task) ||
+ lt_before(get_boost_start(first_task),
+ get_boost_start(second_task)))
+ return 1;
+ else
+ return 0;
+ } else if (unlikely(is_priority_boosted(second_task)))
+		/* second_task is boosted, first is not */
+ return 0;
+
+#endif
+
+ if (earlier_deadline(first_task, second_task)) {
+ return 1;
+ }
+ else if (get_deadline(first_task) == get_deadline(second_task)) {
+		/* Need to tie break. Each method below sets pid_break to 1 if
+		 * the tie could not be resolved and we should fall back to
+		 * the PID-based tie-break.
+		 */
+ int pid_break;
+
+
+#if defined(CONFIG_EDF_TIE_BREAK_LATENESS)
+ /* Tie break by lateness. Jobs with greater lateness get
+ * priority. This should spread tardiness across all tasks,
+ * especially in task sets where all tasks have the same
+ * period and relative deadlines.
+ */
+ if (get_lateness(first_task) > get_lateness(second_task)) {
+ return 1;
+ }
+ pid_break = (get_lateness(first_task) == get_lateness(second_task));
+
+
+#elif defined(CONFIG_EDF_TIE_BREAK_LATENESS_NORM)
+ /* Tie break by lateness, normalized by relative deadline. Jobs with
+ * greater normalized lateness get priority.
+ *
+ * Note: Considered using the algebraically equivalent
+ * lateness(first)*relative_deadline(second) >
+ lateness(second)*relative_deadline(first)
+ * to avoid fixed-point math, but values are prone to overflow if inputs
+ * are on the order of several seconds, even in 64-bit.
+ */
+ fp_t fnorm = _frac(get_lateness(first_task),
+ get_rt_relative_deadline(first_task));
+ fp_t snorm = _frac(get_lateness(second_task),
+ get_rt_relative_deadline(second_task));
+ if (_gt(fnorm, snorm)) {
+ return 1;
+ }
+ pid_break = _eq(fnorm, snorm);
+
+
+#elif defined(CONFIG_EDF_TIE_BREAK_HASH)
+		/* Tie break by comparing hashes of the (pid, job#) tuple. There should be
+ * a 50% chance that first_task has a higher priority than second_task.
+ */
+ long fhash = edf_hash(first_task);
+ long shash = edf_hash(second_task);
+ if (fhash < shash) {
+ return 1;
+ }
+ pid_break = (fhash == shash);
+#else
+
+
+ /* CONFIG_EDF_PID_TIE_BREAK */
+		pid_break = 1; /* fall through to tie-break by pid */
+#endif
+
+ /* Tie break by pid */
+ if(pid_break) {
+ if (first_task->pid < second_task->pid) {
+ return 1;
+ }
+ else if (first_task->pid == second_task->pid) {
+ /* If the PIDs are the same then the task with the
+ * inherited priority wins.
+ */
+ if (!second->rt_param.inh_task) {
+ return 1;
+ }
+ }
+ }
+ }
+ return 0; /* fall-through. prio(second_task) > prio(first_task) */
+}
+
+int edf_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+ return edf_higher_prio(bheap2task(a), bheap2task(b));
+}
+
+void edf_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+ release_jobs_t release)
+{
+ rt_domain_init(rt, edf_ready_order, resched, release);
+}
+
+/* need_to_preempt - check whether the task t needs to be preempted
+ * call only with irqs disabled and with ready_lock acquired
+ * THIS DOES NOT TAKE NON-PREEMPTIVE SECTIONS INTO ACCOUNT!
+ */
+int edf_preemption_needed(rt_domain_t* rt, struct task_struct *t)
+{
+ /* we need the read lock for edf_ready_queue */
+ /* no need to preempt if there is nothing pending */
+ if (!__jobs_pending(rt))
+ return 0;
+ /* we need to reschedule if t doesn't exist */
+ if (!t)
+ return 1;
+
+ /* NOTE: We cannot check for non-preemptibility since we
+ * don't know what address space we're currently in.
+ */
+
+ /* make sure to get non-rt stuff out of the way */
+ return !is_realtime(t) || edf_higher_prio(__next_ready(rt), t);
+}
diff --git a/litmus/fdso.c b/litmus/fdso.c
new file mode 100644
index 0000000..250377d
--- /dev/null
+++ b/litmus/fdso.c
@@ -0,0 +1,305 @@
+/* fdso.c - file descriptor attached shared objects
+ *
+ * (c) 2007 B. Brandenburg, LITMUS^RT project
+ *
+ * Notes:
+ * - objects descriptor (OD) tables are not cloned during a fork.
+ * - objects are created on-demand, and freed after the last reference
+ * is dropped.
+ * - for now, object types are hard coded.
+ * - As long as we have live objects, we keep a reference to the inode.
+ */
+
+#include <linux/errno.h>
+#include <linux/sched.h>
+#include <linux/mutex.h>
+#include <linux/file.h>
+#include <asm/uaccess.h>
+
+#include <litmus/fdso.h>
+
+extern struct fdso_ops generic_lock_ops;
+
+static const struct fdso_ops* fdso_ops[] = {
+ &generic_lock_ops, /* FMLP_SEM */
+ &generic_lock_ops, /* SRP_SEM */
+ &generic_lock_ops, /* MPCP_SEM */
+ &generic_lock_ops, /* MPCP_VS_SEM */
+ &generic_lock_ops, /* DPCP_SEM */
+ &generic_lock_ops, /* PCP_SEM */
+};
+
+static int fdso_create(void** obj_ref, obj_type_t type, void* __user config)
+{
+ if (fdso_ops[type]->create)
+ return fdso_ops[type]->create(obj_ref, type, config);
+ else
+ return -EINVAL;
+}
+
+static void fdso_destroy(obj_type_t type, void* obj)
+{
+ fdso_ops[type]->destroy(type, obj);
+}
+
+static int fdso_open(struct od_table_entry* entry, void* __user config)
+{
+ if (fdso_ops[entry->obj->type]->open)
+ return fdso_ops[entry->obj->type]->open(entry, config);
+ else
+ return 0;
+}
+
+static int fdso_close(struct od_table_entry* entry)
+{
+ if (fdso_ops[entry->obj->type]->close)
+ return fdso_ops[entry->obj->type]->close(entry);
+ else
+ return 0;
+}
+
+/* inode must be locked already */
+static int alloc_inode_obj(struct inode_obj_id** obj_ref,
+ struct inode* inode,
+ obj_type_t type,
+ unsigned int id,
+ void* __user config)
+{
+ struct inode_obj_id* obj;
+ void* raw_obj;
+ int err;
+
+ obj = kmalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj) {
+ return -ENOMEM;
+ }
+
+ err = fdso_create(&raw_obj, type, config);
+ if (err != 0) {
+ kfree(obj);
+ return err;
+ }
+
+ INIT_LIST_HEAD(&obj->list);
+ atomic_set(&obj->count, 1);
+ obj->type = type;
+ obj->id = id;
+ obj->obj = raw_obj;
+ obj->inode = inode;
+
+ list_add(&obj->list, &inode->i_obj_list);
+ atomic_inc(&inode->i_count);
+
+ printk(KERN_DEBUG "alloc_inode_obj(%p, %d, %d): object created\n", inode, type, id);
+
+ *obj_ref = obj;
+ return 0;
+}
+
+/* inode must be locked already */
+static struct inode_obj_id* get_inode_obj(struct inode* inode,
+ obj_type_t type,
+ unsigned int id)
+{
+ struct list_head* pos;
+ struct inode_obj_id* obj = NULL;
+
+ list_for_each(pos, &inode->i_obj_list) {
+ obj = list_entry(pos, struct inode_obj_id, list);
+ if (obj->id == id && obj->type == type) {
+ atomic_inc(&obj->count);
+ return obj;
+ }
+ }
+ printk(KERN_DEBUG "get_inode_obj(%p, %d, %d): couldn't find object\n", inode, type, id);
+ return NULL;
+}
+
+
+static void put_inode_obj(struct inode_obj_id* obj)
+{
+ struct inode* inode;
+ int let_go = 0;
+
+ inode = obj->inode;
+ if (atomic_dec_and_test(&obj->count)) {
+
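+		/* Re-check under the inode mutex: a concurrent
+		 * get_inode_obj() (which runs with the mutex held) may have
+		 * taken a new reference since the count dropped to zero. */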
+ mutex_lock(&inode->i_obj_mutex);
+ /* no new references can be obtained */
+ if (!atomic_read(&obj->count)) {
+ list_del(&obj->list);
+ fdso_destroy(obj->type, obj->obj);
+ kfree(obj);
+ let_go = 1;
+ }
+ mutex_unlock(&inode->i_obj_mutex);
+ if (let_go)
+ iput(inode);
+ }
+}
+
+static struct od_table_entry* get_od_entry(struct task_struct* t)
+{
+ struct od_table_entry* table;
+ int i;
+
+
+ table = t->od_table;
+ if (!table) {
+ table = kzalloc(sizeof(*table) * MAX_OBJECT_DESCRIPTORS,
+ GFP_KERNEL);
+ t->od_table = table;
+ }
+
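+	/* Hand out the first unused descriptor slot; NULL if the table is
+	 * full or could not be allocated. */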
+ for (i = 0; table && i < MAX_OBJECT_DESCRIPTORS; i++)
+ if (!table[i].used) {
+ table[i].used = 1;
+ return table + i;
+ }
+ return NULL;
+}
+
+static int put_od_entry(struct od_table_entry* od)
+{
+ put_inode_obj(od->obj);
+ od->used = 0;
+ return 0;
+}
+
+static long close_od_entry(struct od_table_entry *od)
+{
+ long ret;
+
+ /* Give the class a chance to reject the close. */
+ ret = fdso_close(od);
+ if (ret == 0)
+ ret = put_od_entry(od);
+
+ return ret;
+}
+
+void exit_od_table(struct task_struct* t)
+{
+ int i;
+
+ if (t->od_table) {
+ for (i = 0; i < MAX_OBJECT_DESCRIPTORS; i++)
+ if (t->od_table[i].used)
+ close_od_entry(t->od_table + i);
+ kfree(t->od_table);
+ t->od_table = NULL;
+ }
+}
+
+static int do_sys_od_open(struct file* file, obj_type_t type, int id,
+ void* __user config)
+{
+ int idx = 0, err = 0;
+ struct inode* inode;
+ struct inode_obj_id* obj = NULL;
+ struct od_table_entry* entry;
+
+ inode = file->f_dentry->d_inode;
+
+ entry = get_od_entry(current);
+ if (!entry)
+ return -ENOMEM;
+
+ mutex_lock(&inode->i_obj_mutex);
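+	/* Look up an existing object with this (type, id) on the inode;
+	 * create and enqueue a new one if none exists yet. */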
+ obj = get_inode_obj(inode, type, id);
+ if (!obj)
+ err = alloc_inode_obj(&obj, inode, type, id, config);
+ if (err != 0) {
+ obj = NULL;
+ idx = err;
+ entry->used = 0;
+ } else {
+ entry->obj = obj;
+ entry->class = fdso_ops[type];
+ idx = entry - current->od_table;
+ }
+
+ mutex_unlock(&inode->i_obj_mutex);
+
+ /* open only if creation succeeded */
+ if (!err)
+ err = fdso_open(entry, config);
+ if (err < 0) {
+ /* The class rejected the open call.
+ * We need to clean up and tell user space.
+ */
+ if (obj)
+ put_od_entry(entry);
+ idx = err;
+ }
+
+ return idx;
+}
+
+
+struct od_table_entry* get_entry_for_od(int od)
+{
+ struct task_struct *t = current;
+
+ if (!t->od_table)
+ return NULL;
+ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
+ return NULL;
+ if (!t->od_table[od].used)
+ return NULL;
+ return t->od_table + od;
+}
+
+
+asmlinkage long sys_od_open(int fd, int type, int obj_id, void* __user config)
+{
+ int ret = 0;
+ struct file* file;
+
+ /*
+ 1) get file from fd, get inode from file
+ 2) lock inode
+ 3) try to lookup object
+ 4) if not present create and enqueue object, inc inode refcnt
+ 5) increment refcnt of object
+ 6) alloc od_table_entry, setup ptrs
+ 7) unlock inode
+ 8) return offset in od_table as OD
+ */
+
+ if (type < MIN_OBJ_TYPE || type > MAX_OBJ_TYPE) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ file = fget(fd);
+ if (!file) {
+ ret = -EBADF;
+ goto out;
+ }
+
+ ret = do_sys_od_open(file, type, obj_id, config);
+
+ fput(file);
+
+out:
+ return ret;
+}
+
+
+asmlinkage long sys_od_close(int od)
+{
+ int ret = -EINVAL;
+ struct task_struct *t = current;
+
+ if (od < 0 || od >= MAX_OBJECT_DESCRIPTORS)
+ return ret;
+
+ if (!t->od_table || !t->od_table[od].used)
+ return ret;
+
+
+ ret = close_od_entry(t->od_table + od);
+
+ return ret;
+}
diff --git a/litmus/fp_common.c b/litmus/fp_common.c
new file mode 100644
index 0000000..964a472
--- /dev/null
+++ b/litmus/fp_common.c
@@ -0,0 +1,119 @@
+/*
+ * litmus/fp_common.c
+ *
+ * Common functions for fixed-priority scheduler.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/fp_common.h>
+
+/* fp_higher_prio - returns true if first has a higher static priority
+ * than second. Ties are broken by PID.
+ *
+ * both first and second may be NULL
+ */
+int fp_higher_prio(struct task_struct* first,
+ struct task_struct* second)
+{
+ struct task_struct *first_task = first;
+ struct task_struct *second_task = second;
+
+ /* There is no point in comparing a task to itself. */
+ if (unlikely(first && first == second)) {
+ TRACE_TASK(first,
+ "WARNING: pointless FP priority comparison.\n");
+ return 0;
+ }
+
+
+ /* check for NULL tasks */
+ if (!first || !second)
+ return first && !second;
+
+ if (!is_realtime(second_task))
+ return 1;
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+ /* Check for inherited priorities. Change task
+ * used for comparison in such a case.
+ */
+ if (unlikely(first->rt_param.inh_task))
+ first_task = first->rt_param.inh_task;
+ if (unlikely(second->rt_param.inh_task))
+ second_task = second->rt_param.inh_task;
+
+ /* Check for priority boosting. Tie-break by start of boosting.
+ */
+ if (unlikely(is_priority_boosted(first_task))) {
+ /* first_task is boosted, how about second_task? */
+ if (is_priority_boosted(second_task))
+ /* break by priority point */
+ return lt_before(get_boost_start(first_task),
+ get_boost_start(second_task));
+ else
+ /* priority boosting wins. */
+ return 1;
+ } else if (unlikely(is_priority_boosted(second_task)))
+		/* second_task is boosted, first is not */
+ return 0;
+
+#endif
+
+ /* Comparisons to itself are not expected; priority inheritance
+ * should also not cause this to happen. */
+ BUG_ON(first_task == second_task);
+
+ if (get_priority(first_task) < get_priority(second_task))
+ return 1;
+ else if (get_priority(first_task) == get_priority(second_task))
+ /* Break by PID. */
+ return first_task->pid < second_task->pid;
+ else
+ return 0;
+}
+
+int fp_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+ return fp_higher_prio(bheap2task(a), bheap2task(b));
+}
+
+void fp_domain_init(rt_domain_t* rt, check_resched_needed_t resched,
+ release_jobs_t release)
+{
+ rt_domain_init(rt, fp_ready_order, resched, release);
+}
+
+/* need_to_preempt - check whether the task t needs to be preempted
+ */
+int fp_preemption_needed(struct fp_prio_queue *q, struct task_struct *t)
+{
+ struct task_struct *pending;
+
+ pending = fp_prio_peek(q);
+
+ if (!pending)
+ return 0;
+ if (!t)
+ return 1;
+
+ /* make sure to get non-rt stuff out of the way */
+ return !is_realtime(t) || fp_higher_prio(pending, t);
+}
+
+void fp_prio_queue_init(struct fp_prio_queue* q)
+{
+ int i;
+
+ for (i = 0; i < FP_PRIO_BIT_WORDS; i++)
+ q->bitmask[i] = 0;
+ for (i = 0; i < LITMUS_MAX_PRIORITY; i++)
+ bheap_init(&q->queue[i]);
+}
diff --git a/litmus/ft_event.c b/litmus/ft_event.c
new file mode 100644
index 0000000..399a07b
--- /dev/null
+++ b/litmus/ft_event.c
@@ -0,0 +1,43 @@
+#include <linux/types.h>
+
+#include <litmus/feather_trace.h>
+
+#if !defined(CONFIG_ARCH_HAS_FEATHER_TRACE) || defined(CONFIG_DEBUG_RODATA)
+/* provide dummy implementation */
+
+int ft_events[MAX_EVENTS];
+
+int ft_enable_event(unsigned long id)
+{
+ if (id < MAX_EVENTS) {
+ ft_events[id]++;
+ return 1;
+ } else
+ return 0;
+}
+
+int ft_disable_event(unsigned long id)
+{
+ if (id < MAX_EVENTS && ft_events[id]) {
+ ft_events[id]--;
+ return 1;
+ } else
+ return 0;
+}
+
+int ft_disable_all_events(void)
+{
+ int i;
+
+ for (i = 0; i < MAX_EVENTS; i++)
+ ft_events[i] = 0;
+
+ return MAX_EVENTS;
+}
+
+int ft_is_event_enabled(unsigned long id)
+{
+ return id < MAX_EVENTS && ft_events[id];
+}
+
+#endif
diff --git a/litmus/ftdev.c b/litmus/ftdev.c
new file mode 100644
index 0000000..99bc39f
--- /dev/null
+++ b/litmus/ftdev.c
@@ -0,0 +1,446 @@
+#include <linux/sched.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/cdev.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/device.h>
+
+#include <litmus/litmus.h>
+#include <litmus/feather_trace.h>
+#include <litmus/ftdev.h>
+
+struct ft_buffer* alloc_ft_buffer(unsigned int count, size_t size)
+{
+ struct ft_buffer* buf;
+ size_t total = (size + 1) * count;
+ char* mem;
+ int order = 0, pages = 1;
+
+ buf = kmalloc(sizeof(*buf), GFP_KERNEL);
+ if (!buf)
+ return NULL;
+
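+	/* Round the total size up to whole pages and find the smallest
+	 * power-of-two page order that covers it. */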
+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
+ while (pages < total) {
+ order++;
+ pages *= 2;
+ }
+
+ mem = (char*) __get_free_pages(GFP_KERNEL, order);
+ if (!mem) {
+ kfree(buf);
+ return NULL;
+ }
+
+ if (!init_ft_buffer(buf, count, size,
+ mem + (count * size), /* markers at the end */
+ mem)) { /* buffer objects */
+ free_pages((unsigned long) mem, order);
+ kfree(buf);
+ return NULL;
+ }
+ return buf;
+}
+
+void free_ft_buffer(struct ft_buffer* buf)
+{
+ int order = 0, pages = 1;
+ size_t total;
+
+ if (buf) {
+ total = (buf->slot_size + 1) * buf->slot_count;
+ total = (total / PAGE_SIZE) + (total % PAGE_SIZE != 0);
+ while (pages < total) {
+ order++;
+ pages *= 2;
+ }
+ free_pages((unsigned long) buf->buffer_mem, order);
+ kfree(buf);
+ }
+}
+
+struct ftdev_event {
+ int id;
+ struct ftdev_event* next;
+};
+
+static int activate(struct ftdev_event** chain, int id)
+{
+ struct ftdev_event* ev = kmalloc(sizeof(*ev), GFP_KERNEL);
+ if (ev) {
+ printk(KERN_INFO
+ "Enabling feather-trace event %d.\n", (int) id);
+ ft_enable_event(id);
+ ev->id = id;
+ ev->next = *chain;
+ *chain = ev;
+ }
+ return ev ? 0 : -ENOMEM;
+}
+
+static void deactivate(struct ftdev_event** chain, int id)
+{
+ struct ftdev_event **cur = chain;
+ struct ftdev_event *nxt;
+ while (*cur) {
+ if ((*cur)->id == id) {
+ nxt = (*cur)->next;
+ kfree(*cur);
+ *cur = nxt;
+ printk(KERN_INFO
+ "Disabling feather-trace event %d.\n", (int) id);
+ ft_disable_event(id);
+ break;
+ }
+ cur = &(*cur)->next;
+ }
+}
+
+static int ftdev_open(struct inode *in, struct file *filp)
+{
+ struct ftdev* ftdev;
+ struct ftdev_minor* ftdm;
+ unsigned int buf_idx = iminor(in);
+ int err = 0;
+
+ ftdev = container_of(in->i_cdev, struct ftdev, cdev);
+
+ if (buf_idx >= ftdev->minor_cnt) {
+ err = -ENODEV;
+ goto out;
+ }
+ if (ftdev->can_open && (err = ftdev->can_open(ftdev, buf_idx)))
+ goto out;
+
+ ftdm = ftdev->minor + buf_idx;
+ ftdm->ftdev = ftdev;
+ filp->private_data = ftdm;
+
+ if (mutex_lock_interruptible(&ftdm->lock)) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
+
+ if (!ftdm->readers && ftdev->alloc)
+ err = ftdev->alloc(ftdev, buf_idx);
+ if (0 == err)
+ ftdm->readers++;
+
+ mutex_unlock(&ftdm->lock);
+out:
+ return err;
+}
+
+static int ftdev_release(struct inode *in, struct file *filp)
+{
+ struct ftdev* ftdev;
+ struct ftdev_minor* ftdm;
+ unsigned int buf_idx = iminor(in);
+ int err = 0;
+
+ ftdev = container_of(in->i_cdev, struct ftdev, cdev);
+
+ if (buf_idx >= ftdev->minor_cnt) {
+ err = -ENODEV;
+ goto out;
+ }
+ ftdm = ftdev->minor + buf_idx;
+
+ if (mutex_lock_interruptible(&ftdm->lock)) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
+
+ if (ftdm->readers == 1) {
+ while (ftdm->events)
+ deactivate(&ftdm->events, ftdm->events->id);
+
+ /* wait for any pending events to complete */
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ schedule_timeout(HZ);
+
+ printk(KERN_ALERT "Failed trace writes: %u\n",
+ ftdm->buf->failed_writes);
+
+ if (ftdev->free)
+ ftdev->free(ftdev, buf_idx);
+ }
+
+ ftdm->readers--;
+ mutex_unlock(&ftdm->lock);
+out:
+ return err;
+}
+
+/* based on ft_buffer_read
+ * @returns < 0 : page fault
+ * = 0 : no data available
+ * = 1 : one slot copied
+ */
+static int ft_buffer_copy_to_user(struct ft_buffer* buf, char __user *dest)
+{
+ unsigned int idx;
+ int err = 0;
+ if (buf->free_count != buf->slot_count) {
+ /* data available */
+ idx = buf->read_idx % buf->slot_count;
+ if (buf->slots[idx] == SLOT_READY) {
+ err = copy_to_user(dest, ((char*) buf->buffer_mem) +
+ idx * buf->slot_size,
+ buf->slot_size);
+ if (err == 0) {
+ /* copy ok */
+ buf->slots[idx] = SLOT_FREE;
+ buf->read_idx++;
+ fetch_and_inc(&buf->free_count);
+ err = 1;
+ }
+ }
+ }
+ return err;
+}
+
+static ssize_t ftdev_read(struct file *filp,
+ char __user *to, size_t len, loff_t *f_pos)
+{
+ /* we ignore f_pos, this is strictly sequential */
+
+ ssize_t err = 0;
+ size_t chunk;
+ int copied;
+ struct ftdev_minor* ftdm = filp->private_data;
+
+ if (mutex_lock_interruptible(&ftdm->lock)) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
+
+
+ chunk = ftdm->buf->slot_size;
+ while (len >= chunk) {
+ copied = ft_buffer_copy_to_user(ftdm->buf, to);
+ if (copied == 1) {
+ len -= chunk;
+ to += chunk;
+ err += chunk;
+ } else if (err == 0 && copied == 0 && ftdm->events) {
+ /* Only wait if there are any events enabled and only
+ * if we haven't copied some data yet. We cannot wait
+ * here with copied data because that data would get
+ * lost if the task is interrupted (e.g., killed).
+ */
+ mutex_unlock(&ftdm->lock);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ schedule_timeout(50);
+
+ if (signal_pending(current)) {
+ if (err == 0)
+ /* nothing read yet, signal problem */
+ err = -ERESTARTSYS;
+ goto out;
+ }
+ if (mutex_lock_interruptible(&ftdm->lock)) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
+ } else if (copied < 0) {
+ /* page fault */
+ err = copied;
+ break;
+ } else
+ /* nothing left to get, return to user space */
+ break;
+ }
+ mutex_unlock(&ftdm->lock);
+out:
+ return err;
+}
+
+static long ftdev_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+{
+ long err = -ENOIOCTLCMD;
+ struct ftdev_minor* ftdm = filp->private_data;
+
+ if (mutex_lock_interruptible(&ftdm->lock)) {
+ err = -ERESTARTSYS;
+ goto out;
+ }
+
+ /* FIXME: check id against list of acceptable events */
+
+ switch (cmd) {
+ case FTDEV_ENABLE_CMD:
+ if (activate(&ftdm->events, arg))
+ err = -ENOMEM;
+ else
+ err = 0;
+ break;
+
+ case FTDEV_DISABLE_CMD:
+ deactivate(&ftdm->events, arg);
+ err = 0;
+ break;
+
+ default:
+ printk(KERN_DEBUG "ftdev: strange ioctl (%u, %lu)\n", cmd, arg);
+	}
+
+ mutex_unlock(&ftdm->lock);
+out:
+ return err;
+}
+
+static ssize_t ftdev_write(struct file *filp, const char __user *from,
+ size_t len, loff_t *f_pos)
+{
+ struct ftdev_minor* ftdm = filp->private_data;
+ ssize_t err = -EINVAL;
+ struct ftdev* ftdev = ftdm->ftdev;
+
+ /* dispatch write to buffer-specific code, if available */
+ if (ftdev->write)
+ err = ftdev->write(ftdm->buf, len, from);
+
+ return err;
+}
+
+struct file_operations ftdev_fops = {
+ .owner = THIS_MODULE,
+ .open = ftdev_open,
+ .release = ftdev_release,
+ .write = ftdev_write,
+ .read = ftdev_read,
+ .unlocked_ioctl = ftdev_ioctl,
+};
+
+int ftdev_init( struct ftdev* ftdev, struct module* owner,
+ const int minor_cnt, const char* name)
+{
+ int i, err;
+
+ BUG_ON(minor_cnt < 1);
+
+ cdev_init(&ftdev->cdev, &ftdev_fops);
+ ftdev->name = name;
+ ftdev->minor_cnt = minor_cnt;
+ ftdev->cdev.owner = owner;
+ ftdev->cdev.ops = &ftdev_fops;
+ ftdev->alloc = NULL;
+ ftdev->free = NULL;
+ ftdev->can_open = NULL;
+ ftdev->write = NULL;
+
+ ftdev->minor = kcalloc(ftdev->minor_cnt, sizeof(*ftdev->minor),
+ GFP_KERNEL);
+ if (!ftdev->minor) {
+ printk(KERN_WARNING "ftdev(%s): Could not allocate memory\n",
+ ftdev->name);
+ err = -ENOMEM;
+ goto err_out;
+ }
+
+ for (i = 0; i < ftdev->minor_cnt; i++) {
+ mutex_init(&ftdev->minor[i].lock);
+ ftdev->minor[i].readers = 0;
+ ftdev->minor[i].buf = NULL;
+ ftdev->minor[i].events = NULL;
+ }
+
+ ftdev->class = class_create(owner, ftdev->name);
+ if (IS_ERR(ftdev->class)) {
+ err = PTR_ERR(ftdev->class);
+ printk(KERN_WARNING "ftdev(%s): "
+ "Could not create device class.\n", ftdev->name);
+ goto err_dealloc;
+ }
+
+ return 0;
+
+err_dealloc:
+ kfree(ftdev->minor);
+err_out:
+ return err;
+}
+
+/*
+ * Destroy minor devices up to, but not including, up_to.
+ */
+static void ftdev_device_destroy(struct ftdev* ftdev, unsigned int up_to)
+{
+ dev_t minor_cntr;
+
+ if (up_to < 1)
+ up_to = (ftdev->minor_cnt < 1) ? 0 : ftdev->minor_cnt;
+
+ for (minor_cntr = 0; minor_cntr < up_to; ++minor_cntr)
+ device_destroy(ftdev->class, MKDEV(ftdev->major, minor_cntr));
+}
+
+void ftdev_exit(struct ftdev* ftdev)
+{
+ printk("ftdev(%s): Exiting\n", ftdev->name);
+ ftdev_device_destroy(ftdev, -1);
+ cdev_del(&ftdev->cdev);
+ unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
+ class_destroy(ftdev->class);
+ kfree(ftdev->minor);
+}
+
+int register_ftdev(struct ftdev* ftdev)
+{
+ struct device **device;
+ dev_t trace_dev_tmp, minor_cntr;
+ int err;
+
+ err = alloc_chrdev_region(&trace_dev_tmp, 0, ftdev->minor_cnt,
+ ftdev->name);
+ if (err) {
+ printk(KERN_WARNING "ftdev(%s): "
+ "Could not allocate char. device region (%d minors)\n",
+ ftdev->name, ftdev->minor_cnt);
+ goto err_out;
+ }
+
+ ftdev->major = MAJOR(trace_dev_tmp);
+
+ err = cdev_add(&ftdev->cdev, trace_dev_tmp, ftdev->minor_cnt);
+ if (err) {
+ printk(KERN_WARNING "ftdev(%s): "
+ "Could not add cdev for major %u with %u minor(s).\n",
+ ftdev->name, ftdev->major, ftdev->minor_cnt);
+ goto err_unregister;
+ }
+
+ /* create the minor device(s) */
+ for (minor_cntr = 0; minor_cntr < ftdev->minor_cnt; ++minor_cntr)
+ {
+ trace_dev_tmp = MKDEV(ftdev->major, minor_cntr);
+ device = &ftdev->minor[minor_cntr].device;
+
+ *device = device_create(ftdev->class, NULL, trace_dev_tmp, NULL,
+ "litmus/%s%d", ftdev->name, minor_cntr);
+ if (IS_ERR(*device)) {
+ err = PTR_ERR(*device);
+ printk(KERN_WARNING "ftdev(%s): "
+ "Could not create device major/minor number "
+ "%u/%u\n", ftdev->name, ftdev->major,
+ minor_cntr);
+ printk(KERN_WARNING "ftdev(%s): "
+ "will attempt deletion of allocated devices.\n",
+ ftdev->name);
+ goto err_minors;
+ }
+ }
+
+ return 0;
+
+err_minors:
+ ftdev_device_destroy(ftdev, minor_cntr);
+ cdev_del(&ftdev->cdev);
+err_unregister:
+ unregister_chrdev_region(MKDEV(ftdev->major, 0), ftdev->minor_cnt);
+err_out:
+ return err;
+}
diff --git a/litmus/jobs.c b/litmus/jobs.c
new file mode 100644
index 0000000..13a4ed4
--- /dev/null
+++ b/litmus/jobs.c
@@ -0,0 +1,57 @@
+/* litmus/jobs.c - common job control code
+ */
+
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+
+static inline void setup_release(struct task_struct *t, lt_t release)
+{
+ /* prepare next release */
+ t->rt_param.job_params.release = release;
+ t->rt_param.job_params.deadline = release + get_rt_relative_deadline(t);
+ t->rt_param.job_params.exec_time = 0;
+
+ /* update job sequence number */
+ t->rt_param.job_params.job_no++;
+
+ /* don't confuse Linux */
+ t->rt.time_slice = 1;
+}
+
+void prepare_for_next_period(struct task_struct *t)
+{
+ BUG_ON(!t);
+
+ /* Record lateness before we set up the next job's
+ * release and deadline. Lateness may be negative.
+ */
+ t->rt_param.job_params.lateness =
+ (long long)litmus_clock() -
+ (long long)t->rt_param.job_params.deadline;
+
+ setup_release(t, get_release(t) + get_rt_period(t));
+}
+
+void release_at(struct task_struct *t, lt_t start)
+{
+ BUG_ON(!t);
+ setup_release(t, start);
+ tsk_rt(t)->completed = 0;
+}
+
+
+/*
+ * Deactivate current task until the beginning of the next period.
+ */
+long complete_job(void)
+{
+	/* Mark that we do not execute anymore */
+ tsk_rt(current)->completed = 1;
+ /* call schedule, this will return when a new job arrives
+ * it also takes care of preparing for the next release
+ */
+ schedule();
+ return 0;
+}
diff --git a/litmus/litmus.c b/litmus/litmus.c
new file mode 100644
index 0000000..9c6b738
--- /dev/null
+++ b/litmus/litmus.c
@@ -0,0 +1,593 @@
+/*
+ * litmus.c -- Implementation of the LITMUS syscalls,
+ *             the LITMUS initialization code,
+ *             and the procfs interface.
+ */
+#include <asm/uaccess.h>
+#include <linux/uaccess.h>
+#include <linux/sysrq.h>
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/reboot.h>
+
+#include <litmus/litmus.h>
+#include <litmus/bheap.h>
+#include <litmus/trace.h>
+#include <litmus/rt_domain.h>
+#include <litmus/litmus_proc.h>
+#include <litmus/sched_trace.h>
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+#include <litmus/affinity.h>
+#endif
+
+/* Number of RT tasks that exist in the system */
+atomic_t rt_task_count = ATOMIC_INIT(0);
+static DEFINE_RAW_SPINLOCK(task_transition_lock);
+/* synchronize plugin switching */
+atomic_t cannot_use_plugin = ATOMIC_INIT(0);
+
+/* Give log messages sequential IDs. */
+atomic_t __log_seq_no = ATOMIC_INIT(0);
+
+#ifdef CONFIG_RELEASE_MASTER
+/* current master CPU for handling timer IRQs */
+atomic_t release_master_cpu = ATOMIC_INIT(NO_CPU);
+#endif
+
+static struct kmem_cache * bheap_node_cache;
+extern struct kmem_cache * release_heap_cache;
+
+struct bheap_node* bheap_node_alloc(int gfp_flags)
+{
+ return kmem_cache_alloc(bheap_node_cache, gfp_flags);
+}
+
+void bheap_node_free(struct bheap_node* hn)
+{
+ kmem_cache_free(bheap_node_cache, hn);
+}
+
+struct release_heap* release_heap_alloc(int gfp_flags);
+void release_heap_free(struct release_heap* rh);
+
+/*
+ * sys_set_rt_task_param
+ * @pid: PID of the task whose scheduling parameters are to be changed
+ * @param: New real-time extension parameters such as the execution cost and
+ *         period
+ * Syscall for manipulating a task's RT extension params
+ * Returns EINVAL if param is NULL, pid is negative, or the parameters
+ *                are rejected (e.g., period or execution cost <= 0,
+ *                density > 1, invalid class or budget policy).
+ *         EFAULT if copying param from user space fails.
+ *         ESRCH  if pid does not correspond to a valid task.
+ *         EBUSY  if pid already refers to a real-time task.
+ *         0      on success.
+ *
+ * Only non-real-time tasks may be configured with this system call
+ * to avoid races with the scheduler. In practice, this means that a
+ * task's parameters must be set _before_ calling sys_prepare_rt_task()
+ *
+ * find_task_by_vpid() assumes that we are in the same namespace of the
+ * target.
+ */
+asmlinkage long sys_set_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+ struct rt_task tp;
+ struct task_struct *target;
+ int retval = -EINVAL;
+
+ printk("Setting up rt task parameters for process %d.\n", pid);
+
+ if (pid < 0 || param == 0) {
+ goto out;
+ }
+ if (copy_from_user(&tp, param, sizeof(tp))) {
+ retval = -EFAULT;
+ goto out;
+ }
+
+ /* Task search and manipulation must be protected */
+ read_lock_irq(&tasklist_lock);
+ if (!(target = find_task_by_vpid(pid))) {
+ retval = -ESRCH;
+ goto out_unlock;
+ }
+
+ if (is_realtime(target)) {
+ /* The task is already a real-time task.
+		 * We cannot allow parameter changes at this point.
+ */
+ retval = -EBUSY;
+ goto out_unlock;
+ }
+
+ /* set relative deadline to be implicit if left unspecified */
+ if (tp.relative_deadline == 0)
+ tp.relative_deadline = tp.period;
+
+ if (tp.exec_cost <= 0)
+ goto out_unlock;
+ if (tp.period <= 0)
+ goto out_unlock;
+ if (!cpu_online(tp.cpu))
+ goto out_unlock;
+ if (min(tp.relative_deadline, tp.period) < tp.exec_cost) /*density check*/
+ {
+ printk(KERN_INFO "litmus: real-time task %d rejected "
+ "because task density > 1.0\n", pid);
+ goto out_unlock;
+ }
+ if (tp.cls != RT_CLASS_HARD &&
+ tp.cls != RT_CLASS_SOFT &&
+ tp.cls != RT_CLASS_BEST_EFFORT)
+ {
+ printk(KERN_INFO "litmus: real-time task %d rejected "
+ "because its class is invalid\n", pid);
+ goto out_unlock;
+ }
+ if (tp.budget_policy != NO_ENFORCEMENT &&
+ tp.budget_policy != QUANTUM_ENFORCEMENT &&
+ tp.budget_policy != PRECISE_ENFORCEMENT)
+ {
+ printk(KERN_INFO "litmus: real-time task %d rejected "
+ "because unsupported budget enforcement policy "
+ "specified (%d)\n",
+ pid, tp.budget_policy);
+ goto out_unlock;
+ }
+
+ target->rt_param.task_params = tp;
+
+ retval = 0;
+ out_unlock:
+ read_unlock_irq(&tasklist_lock);
+ out:
+ return retval;
+}
+
+/*
+ * Getter of task's RT params
+ * returns EINVAL if param or pid is NULL
+ * returns ESRCH if pid does not correspond to a valid task
+ * returns EFAULT if copying of parameters has failed.
+ *
+ * find_task_by_vpid() assumes that we are in the same namespace of the
+ * target.
+ */
+asmlinkage long sys_get_rt_task_param(pid_t pid, struct rt_task __user * param)
+{
+ int retval = -EINVAL;
+ struct task_struct *source;
+ struct rt_task lp;
+ if (param == 0 || pid < 0)
+ goto out;
+ read_lock(&tasklist_lock);
+ if (!(source = find_task_by_vpid(pid))) {
+ retval = -ESRCH;
+ goto out_unlock;
+ }
+ lp = source->rt_param.task_params;
+ read_unlock(&tasklist_lock);
+ /* Do copying outside the lock */
+ retval =
+ copy_to_user(param, &lp, sizeof(lp)) ? -EFAULT : 0;
+ return retval;
+ out_unlock:
+ read_unlock(&tasklist_lock);
+ out:
+ return retval;
+
+}
+
+/*
+ * This is the crucial function for the periodic task implementation.
+ * It checks if the task is periodic, checks if such kind of sleep
+ * is permitted, and calls the plugin-specific sleep, which puts the
+ * task into a wait array.
+ * returns 0 on successful wakeup
+ * returns EPERM if current conditions do not permit such sleep
+ * returns EINVAL if current task is not able to go to sleep
+ */
+asmlinkage long sys_complete_job(void)
+{
+ int retval = -EPERM;
+ if (!is_realtime(current)) {
+ retval = -EINVAL;
+ goto out;
+ }
+ /* Task with negative or zero period cannot sleep */
+ if (get_rt_period(current) <= 0) {
+ retval = -EINVAL;
+ goto out;
+ }
+ /* The plugin has to put the task into an
+ * appropriate queue and call schedule
+ */
+ retval = litmus->complete_job();
+ out:
+ return retval;
+}
+
+/* This is an "improved" version of sys_complete_job that
+ * addresses the problem of unintentionally missing a job after
+ * an overrun.
+ *
+ * returns 0 on successful wakeup
+ * returns EPERM if current conditions do not permit such sleep
+ * returns EINVAL if current task is not able to go to sleep
+ */
+asmlinkage long sys_wait_for_job_release(unsigned int job)
+{
+ int retval = -EPERM;
+ if (!is_realtime(current)) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ /* Task with negative or zero period cannot sleep */
+ if (get_rt_period(current) <= 0) {
+ retval = -EINVAL;
+ goto out;
+ }
+
+ retval = 0;
+
+ /* first wait until we have "reached" the desired job
+ *
+ * This implementation has at least two problems:
+ *
+ * 1) It doesn't gracefully handle the wrap around of
+ * job_no. Since LITMUS is a prototype, this is not much
+ * of a problem right now.
+ *
+ * 2) It is theoretically racy if a job release occurs
+ * between checking job_no and calling sleep_next_period().
+	 * A proper solution would require adding another callback
+ * in the plugin structure and testing the condition with
+ * interrupts disabled.
+ *
+ * FIXME: At least problem 2 should be taken care of eventually.
+ */
+ while (!retval && job > current->rt_param.job_params.job_no)
+ /* If the last job overran then job <= job_no and we
+ * don't send the task to sleep.
+ */
+ retval = litmus->complete_job();
+ out:
+ return retval;
+}
+
+/* This is a helper syscall to query the current job sequence number.
+ *
+ * returns 0 on successful query
+ * returns EPERM if task is not a real-time task.
+ * returns EFAULT if &job is not a valid pointer.
+ */
+asmlinkage long sys_query_job_no(unsigned int __user *job)
+{
+ int retval = -EPERM;
+ if (is_realtime(current))
+ retval = put_user(current->rt_param.job_params.job_no, job);
+
+ return retval;
+}
+
+/* sys_null_call() is only used for determining raw system call
+ * overheads (kernel entry, kernel exit). It has no useful side effects.
+ * If ts is non-NULL, then the current Feather-Trace time is recorded.
+ */
+asmlinkage long sys_null_call(cycles_t __user *ts)
+{
+ long ret = 0;
+ cycles_t now;
+
+ if (ts) {
+ now = get_cycles();
+ ret = put_user(now, ts);
+ }
+
+ return ret;
+}
+
+/* p is a real-time task. Re-init its state as a best-effort task. */
+static void reinit_litmus_state(struct task_struct* p, int restore)
+{
+ struct rt_task user_config = {};
+ void* ctrl_page = NULL;
+
+ if (restore) {
+		/* Save the user-space provided configuration data
+		 * and the allocated control page. */
+ user_config = p->rt_param.task_params;
+ ctrl_page = p->rt_param.ctrl_page;
+ }
+
+ /* We probably should not be inheriting any task's priority
+ * at this point in time.
+ */
+ WARN_ON(p->rt_param.inh_task);
+
+ /* Cleanup everything else. */
+ memset(&p->rt_param, 0, sizeof(p->rt_param));
+
+ /* Restore preserved fields. */
+ if (restore) {
+ p->rt_param.task_params = user_config;
+ p->rt_param.ctrl_page = ctrl_page;
+ }
+}
+
+long litmus_admit_task(struct task_struct* tsk)
+{
+ long retval = 0;
+ unsigned long flags;
+
+ BUG_ON(is_realtime(tsk));
+
+ if (get_rt_relative_deadline(tsk) == 0 ||
+ get_exec_cost(tsk) >
+ min(get_rt_relative_deadline(tsk), get_rt_period(tsk)) ) {
+ TRACE_TASK(tsk,
+ "litmus admit: invalid task parameters "
+ "(e = %lu, p = %lu, d = %lu)\n",
+ get_exec_cost(tsk), get_rt_period(tsk),
+ get_rt_relative_deadline(tsk));
+ retval = -EINVAL;
+ goto out;
+ }
+
+ if (!cpu_online(get_partition(tsk))) {
+ TRACE_TASK(tsk, "litmus admit: cpu %d is not online\n",
+ get_partition(tsk));
+ retval = -EINVAL;
+ goto out;
+ }
+
+ INIT_LIST_HEAD(&tsk_rt(tsk)->list);
+
+ /* avoid scheduler plugin changing underneath us */
+ raw_spin_lock_irqsave(&task_transition_lock, flags);
+
+ /* allocate heap node for this task */
+ tsk_rt(tsk)->heap_node = bheap_node_alloc(GFP_ATOMIC);
+ tsk_rt(tsk)->rel_heap = release_heap_alloc(GFP_ATOMIC);
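+	/* GFP_ATOMIC: task_transition_lock is held with interrupts off. */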
+
+ if (!tsk_rt(tsk)->heap_node || !tsk_rt(tsk)->rel_heap) {
+ printk(KERN_WARNING "litmus: no more heap node memory!?\n");
+
+ bheap_node_free(tsk_rt(tsk)->heap_node);
+ release_heap_free(tsk_rt(tsk)->rel_heap);
+
+ retval = -ENOMEM;
+ goto out_unlock;
+ } else {
+ bheap_node_init(&tsk_rt(tsk)->heap_node, tsk);
+ }
+
+ retval = litmus->admit_task(tsk);
+
+ if (!retval) {
+ sched_trace_task_name(tsk);
+ sched_trace_task_param(tsk);
+ atomic_inc(&rt_task_count);
+ }
+
+out_unlock:
+ raw_spin_unlock_irqrestore(&task_transition_lock, flags);
+out:
+ return retval;
+}
+
+void litmus_exit_task(struct task_struct* tsk)
+{
+ if (is_realtime(tsk)) {
+ sched_trace_task_completion(tsk, 1);
+
+ litmus->task_exit(tsk);
+
+ BUG_ON(bheap_node_in_heap(tsk_rt(tsk)->heap_node));
+ bheap_node_free(tsk_rt(tsk)->heap_node);
+ release_heap_free(tsk_rt(tsk)->rel_heap);
+
+ atomic_dec(&rt_task_count);
+ reinit_litmus_state(tsk, 1);
+ }
+}
+
+/* IPI callback to synchronize plugin switching */
+static void synch_on_plugin_switch(void* info)
+{
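+	/* Each CPU announces itself by incrementing the counter and then
+	 * spins until switch_sched_plugin() resets it to zero. */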
+ atomic_inc(&cannot_use_plugin);
+ while (atomic_read(&cannot_use_plugin) > 0)
+ cpu_relax();
+}
+
+/* Switching a plugin in use is tricky.
+ * We must watch out that no real-time tasks exist
+ * (and that none are created in parallel) and that the plugin is not
+ * currently in use on any processor (in theory).
+ */
+int switch_sched_plugin(struct sched_plugin* plugin)
+{
+ unsigned long flags;
+ int ret = 0;
+
+ BUG_ON(!plugin);
+
+ /* forbid other cpus to use the plugin */
+ atomic_set(&cannot_use_plugin, 1);
+ /* send IPI to force other CPUs to synch with us */
+ smp_call_function(synch_on_plugin_switch, NULL, 0);
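+	/* wait == 0: the IPI handler spins until we reset the counter, so
+	 * waiting for it to return here would deadlock. */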
+
+ /* wait until all other CPUs have started synch */
+ while (atomic_read(&cannot_use_plugin) < num_online_cpus())
+ cpu_relax();
+
+ /* stop task transitions */
+ raw_spin_lock_irqsave(&task_transition_lock, flags);
+
+ /* don't switch if there are active real-time tasks */
+ if (atomic_read(&rt_task_count) == 0) {
+ ret = litmus->deactivate_plugin();
+ if (0 != ret)
+ goto out;
+ ret = plugin->activate_plugin();
+ if (0 != ret) {
+ printk(KERN_INFO "Can't activate %s (%d).\n",
+ plugin->plugin_name, ret);
+ plugin = &linux_sched_plugin;
+ }
+ printk(KERN_INFO "Switching to LITMUS^RT plugin %s.\n", plugin->plugin_name);
+ litmus = plugin;
+ } else
+ ret = -EBUSY;
+out:
+ raw_spin_unlock_irqrestore(&task_transition_lock, flags);
+ atomic_set(&cannot_use_plugin, 0);
+ return ret;
+}
+
+/* Called upon fork.
+ * p is the newly forked task.
+ */
+void litmus_fork(struct task_struct* p)
+{
+ if (is_realtime(p)) {
+ /* clean out any litmus related state, don't preserve anything */
+ reinit_litmus_state(p, 0);
+ /* Don't let the child be a real-time task. */
+ p->sched_reset_on_fork = 1;
+ } else
+ /* non-rt tasks might have ctrl_page set */
+ tsk_rt(p)->ctrl_page = NULL;
+
+ /* od tables are never inherited across a fork */
+ p->od_table = NULL;
+}
+
+/* Called upon execve().
+ * current is doing the exec.
+ * Don't let address space specific stuff leak.
+ */
+void litmus_exec(void)
+{
+ struct task_struct* p = current;
+
+ if (is_realtime(p)) {
+ WARN_ON(p->rt_param.inh_task);
+ if (tsk_rt(p)->ctrl_page) {
+ free_page((unsigned long) tsk_rt(p)->ctrl_page);
+ tsk_rt(p)->ctrl_page = NULL;
+ }
+ }
+}
+
+void exit_litmus(struct task_struct *dead_tsk)
+{
+ /* We also allow non-RT tasks to
+ * allocate control pages to allow
+ * measurements with non-RT tasks.
+ * So check if we need to free the page
+ * in any case.
+ */
+ if (tsk_rt(dead_tsk)->ctrl_page) {
+ TRACE_TASK(dead_tsk,
+ "freeing ctrl_page %p\n",
+ tsk_rt(dead_tsk)->ctrl_page);
+ free_page((unsigned long) tsk_rt(dead_tsk)->ctrl_page);
+ }
+
+ /* main cleanup only for RT tasks */
+ if (is_realtime(dead_tsk))
+ litmus_exit_task(dead_tsk);
+}
+
+
+#ifdef CONFIG_MAGIC_SYSRQ
+int sys_kill(int pid, int sig);
+
+static void sysrq_handle_kill_rt_tasks(int key)
+{
+ struct task_struct *t;
+ read_lock(&tasklist_lock);
+ for_each_process(t) {
+ if (is_realtime(t)) {
+ sys_kill(t->pid, SIGKILL);
+ }
+ }
+ read_unlock(&tasklist_lock);
+}
+
+static struct sysrq_key_op sysrq_kill_rt_tasks_op = {
+ .handler = sysrq_handle_kill_rt_tasks,
+ .help_msg = "quit-rt-tasks(X)",
+ .action_msg = "sent SIGKILL to all LITMUS^RT real-time tasks",
+};
+#endif
+
+extern struct sched_plugin linux_sched_plugin;
+
+static int litmus_shutdown_nb(struct notifier_block *unused1,
+ unsigned long unused2, void *unused3)
+{
+ /* Attempt to switch back to regular Linux scheduling.
+ * Forces the active plugin to clean up.
+ */
+ if (litmus != &linux_sched_plugin) {
+ int ret = switch_sched_plugin(&linux_sched_plugin);
+ if (ret) {
+ printk("Auto-shutdown of active Litmus plugin failed.\n");
+ }
+ }
+ return NOTIFY_DONE;
+}
+
+static struct notifier_block shutdown_notifier = {
+ .notifier_call = litmus_shutdown_nb,
+};
+
+static int __init _init_litmus(void)
+{
+ /* Common initializers,
+ * mode change lock is used to enforce single mode change
+ * operation.
+ */
+ printk("Starting LITMUS^RT kernel\n");
+
+ register_sched_plugin(&linux_sched_plugin);
+
+ bheap_node_cache = KMEM_CACHE(bheap_node, SLAB_PANIC);
+ release_heap_cache = KMEM_CACHE(release_heap, SLAB_PANIC);
+
+#ifdef CONFIG_MAGIC_SYSRQ
+ /* offer some debugging help */
+ if (!register_sysrq_key('x', &sysrq_kill_rt_tasks_op))
+ printk("Registered kill rt tasks magic sysrq.\n");
+ else
+ printk("Could not register kill rt tasks magic sysrq.\n");
+#endif
+
+ init_litmus_proc();
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+ init_topology();
+#endif
+
+ register_reboot_notifier(&shutdown_notifier);
+
+ return 0;
+}
+
+static void _exit_litmus(void)
+{
+ unregister_reboot_notifier(&shutdown_notifier);
+
+ exit_litmus_proc();
+ kmem_cache_destroy(bheap_node_cache);
+ kmem_cache_destroy(release_heap_cache);
+}
+
+module_init(_init_litmus);
+module_exit(_exit_litmus);
diff --git a/litmus/litmus_proc.c b/litmus/litmus_proc.c
new file mode 100644
index 0000000..4bf725a
--- /dev/null
+++ b/litmus/litmus_proc.c
@@ -0,0 +1,347 @@
+/*
+ * litmus_proc.c -- Implementation of the /proc/litmus directory tree.
+ */
+
+#include <linux/sched.h>
+#include <linux/uaccess.h>
+
+#include <litmus/litmus.h>
+#include <litmus/litmus_proc.h>
+
+#include <litmus/clustered.h>
+
+/* in litmus/litmus.c */
+extern atomic_t rt_task_count;
+
+static struct proc_dir_entry *litmus_dir = NULL,
+ *curr_file = NULL,
+ *stat_file = NULL,
+ *plugs_dir = NULL,
+#ifdef CONFIG_RELEASE_MASTER
+ *release_master_file = NULL,
+#endif
+ *plugs_file = NULL;
+
+/* in litmus/sync.c */
+int count_tasks_waiting_for_release(void);
+
+static int proc_read_stats(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ int len;
+
+ len = snprintf(page, PAGE_SIZE,
+ "real-time tasks = %d\n"
+ "ready for release = %d\n",
+ atomic_read(&rt_task_count),
+ count_tasks_waiting_for_release());
+ return len;
+}
+
+static int proc_read_plugins(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ int len;
+
+ len = print_sched_plugins(page, PAGE_SIZE);
+ return len;
+}
+
+static int proc_read_curr(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ int len;
+
+ len = snprintf(page, PAGE_SIZE, "%s\n", litmus->plugin_name);
+ return len;
+}
+
+/* in litmus/litmus.c */
+int switch_sched_plugin(struct sched_plugin*);
+
+static int proc_write_curr(struct file *file,
+ const char *buffer,
+ unsigned long count,
+ void *data)
+{
+ int len, ret;
+ char name[65];
+ struct sched_plugin* found;
+
+ len = copy_and_chomp(name, sizeof(name), buffer, count);
+ if (len < 0)
+ return len;
+
+ found = find_sched_plugin(name);
+
+ if (found) {
+ ret = switch_sched_plugin(found);
+ if (ret != 0)
+ printk(KERN_INFO "Could not switch plugin: %d\n", ret);
+ } else
+ printk(KERN_INFO "Plugin '%s' is unknown.\n", name);
+
+ return len;
+}
+
+#ifdef CONFIG_RELEASE_MASTER
+static int proc_read_release_master(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ int len, master;
+ master = atomic_read(&release_master_cpu);
+ if (master == NO_CPU)
+ len = snprintf(page, PAGE_SIZE, "NO_CPU\n");
+ else
+ len = snprintf(page, PAGE_SIZE, "%d\n", master);
+ return len;
+}
+
+static int proc_write_release_master(struct file *file,
+ const char *buffer,
+ unsigned long count,
+ void *data)
+{
+ int cpu, err, len, online = 0;
+ char msg[64];
+
+ len = copy_and_chomp(msg, sizeof(msg), buffer, count);
+
+ if (len < 0)
+ return len;
+
+ if (strcmp(msg, "NO_CPU") == 0)
+ atomic_set(&release_master_cpu, NO_CPU);
+ else {
+ err = sscanf(msg, "%d", &cpu);
+ if (err == 1 && cpu >= 0 && (online = cpu_online(cpu))) {
+ atomic_set(&release_master_cpu, cpu);
+ } else {
+ TRACE("invalid release master: '%s' "
+ "(err:%d cpu:%d online:%d)\n",
+ msg, err, cpu, online);
+ len = -EINVAL;
+ }
+ }
+ return len;
+}
+#endif
+
+int __init init_litmus_proc(void)
+{
+ litmus_dir = proc_mkdir("litmus", NULL);
+ if (!litmus_dir) {
+ printk(KERN_ERR "Could not allocate LITMUS^RT procfs entry.\n");
+ return -ENOMEM;
+ }
+
+ curr_file = create_proc_entry("active_plugin",
+ 0644, litmus_dir);
+ if (!curr_file) {
+ printk(KERN_ERR "Could not allocate active_plugin "
+ "procfs entry.\n");
+ return -ENOMEM;
+ }
+ curr_file->read_proc = proc_read_curr;
+ curr_file->write_proc = proc_write_curr;
+
+#ifdef CONFIG_RELEASE_MASTER
+ release_master_file = create_proc_entry("release_master",
+ 0644, litmus_dir);
+ if (!release_master_file) {
+ printk(KERN_ERR "Could not allocate release_master "
+ "procfs entry.\n");
+ return -ENOMEM;
+ }
+ release_master_file->read_proc = proc_read_release_master;
+ release_master_file->write_proc = proc_write_release_master;
+#endif
+
+ stat_file = create_proc_read_entry("stats", 0444, litmus_dir,
+ proc_read_stats, NULL);
+
+ plugs_dir = proc_mkdir("plugins", litmus_dir);
+ if (!plugs_dir){
+ printk(KERN_ERR "Could not allocate plugins directory "
+ "procfs entry.\n");
+ return -ENOMEM;
+ }
+
+ plugs_file = create_proc_read_entry("loaded", 0444, plugs_dir,
+ proc_read_plugins, NULL);
+
+ return 0;
+}
+
+void exit_litmus_proc(void)
+{
+ if (plugs_file)
+ remove_proc_entry("loaded", plugs_dir);
+ if (plugs_dir)
+ remove_proc_entry("plugins", litmus_dir);
+ if (stat_file)
+ remove_proc_entry("stats", litmus_dir);
+ if (curr_file)
+ remove_proc_entry("active_plugin", litmus_dir);
+#ifdef CONFIG_RELEASE_MASTER
+ if (release_master_file)
+ remove_proc_entry("release_master", litmus_dir);
+#endif
+ if (litmus_dir)
+ remove_proc_entry("litmus", NULL);
+}
+
+long make_plugin_proc_dir(struct sched_plugin* plugin,
+ struct proc_dir_entry** pde_in)
+{
+ struct proc_dir_entry *pde_new = NULL;
+ long rv;
+
+ if (!plugin || !plugin->plugin_name){
+ printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
+ __func__);
+ rv = -EINVAL;
+ goto out_no_pde;
+ }
+
+ if (!plugs_dir){
+ printk(KERN_ERR "Could not make plugin sub-directory, because "
+ "/proc/litmus/plugins does not exist.\n");
+ rv = -ENOENT;
+ goto out_no_pde;
+ }
+
+ pde_new = proc_mkdir(plugin->plugin_name, plugs_dir);
+ if (!pde_new){
+ printk(KERN_ERR "Could not make plugin sub-directory: "
+				"out of memory?\n");
+ rv = -ENOMEM;
+ goto out_no_pde;
+ }
+
+ rv = 0;
+ *pde_in = pde_new;
+ goto out_ok;
+
+out_no_pde:
+ *pde_in = NULL;
+out_ok:
+ return rv;
+}
+
+void remove_plugin_proc_dir(struct sched_plugin* plugin)
+{
+ if (!plugin || !plugin->plugin_name){
+ printk(KERN_ERR "Invalid plugin struct passed to %s.\n",
+ __func__);
+ return;
+ }
+ remove_proc_entry(plugin->plugin_name, plugs_dir);
+}
+
+
+
+/* misc. I/O helper functions */
+
+int copy_and_chomp(char *kbuf, unsigned long ksize,
+ __user const char* ubuf, unsigned long ulength)
+{
+ /* caller must provide buffer space */
+ BUG_ON(!ksize);
+
+ ksize--; /* leave space for null byte */
+
+ if (ksize > ulength)
+ ksize = ulength;
+
+ if(copy_from_user(kbuf, ubuf, ksize))
+ return -EFAULT;
+
+ kbuf[ksize] = '\0';
+
+ /* chomp kbuf */
+ if (ksize > 0 && kbuf[ksize - 1] == '\n')
+ kbuf[ksize - 1] = '\0';
+
+ return ksize;
+}
+
+/* helper functions for clustered plugins */
+static const char* cache_level_names[] = {
+ "ALL",
+ "L1",
+ "L2",
+ "L3",
+};
+
+int parse_cache_level(const char *cache_name, enum cache_level *level)
+{
+ int err = -EINVAL;
+ int i;
+ /* do a quick and dirty comparison to find the cluster size */
+ for (i = GLOBAL_CLUSTER; i <= L3_CLUSTER; i++)
+ if (!strcmp(cache_name, cache_level_names[i])) {
+ *level = (enum cache_level) i;
+ err = 0;
+ break;
+ }
+ return err;
+}
+
+const char* cache_level_name(enum cache_level level)
+{
+ int idx = level;
+
+ if (idx >= GLOBAL_CLUSTER && idx <= L3_CLUSTER)
+ return cache_level_names[idx];
+ else
+ return "INVALID";
+}
+
+
+/* proc file interface to configure the cluster size */
+static int proc_read_cluster_size(char *page, char **start,
+ off_t off, int count,
+ int *eof, void *data)
+{
+ return snprintf(page, PAGE_SIZE, "%s\n",
+			cache_level_name(*((enum cache_level*) data)));
+}
+
+static int proc_write_cluster_size(struct file *file,
+ const char *buffer,
+ unsigned long count,
+ void *data)
+{
+ int len;
+ char cache_name[8];
+
+ len = copy_and_chomp(cache_name, sizeof(cache_name), buffer, count);
+
+ if (len > 0 && parse_cache_level(cache_name, (enum cache_level*) data))
+ printk(KERN_INFO "Cluster '%s' is unknown.\n", cache_name);
+
+ return len;
+}
+
+struct proc_dir_entry* create_cluster_file(struct proc_dir_entry* parent,
+ enum cache_level* level)
+{
+ struct proc_dir_entry* cluster_file;
+
+ cluster_file = create_proc_entry("cluster", 0644, parent);
+ if (!cluster_file) {
+ printk(KERN_ERR "Could not allocate %s/cluster "
+ "procfs entry.\n", parent->name);
+ } else {
+ cluster_file->read_proc = proc_read_cluster_size;
+ cluster_file->write_proc = proc_write_cluster_size;
+ cluster_file->data = level;
+ }
+ return cluster_file;
+}
+
diff --git a/litmus/locking.c b/litmus/locking.c
new file mode 100644
index 0000000..43d9aec
--- /dev/null
+++ b/litmus/locking.c
@@ -0,0 +1,188 @@
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/fdso.h>
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <linux/sched.h>
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/trace.h>
+#include <litmus/wait.h>
+
+static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg);
+static int open_generic_lock(struct od_table_entry* entry, void* __user arg);
+static int close_generic_lock(struct od_table_entry* entry);
+static void destroy_generic_lock(obj_type_t type, void* sem);
+
+struct fdso_ops generic_lock_ops = {
+ .create = create_generic_lock,
+ .open = open_generic_lock,
+ .close = close_generic_lock,
+ .destroy = destroy_generic_lock
+};
+
+static inline bool is_lock(struct od_table_entry* entry)
+{
+ return entry->class == &generic_lock_ops;
+}
+
+static inline struct litmus_lock* get_lock(struct od_table_entry* entry)
+{
+ BUG_ON(!is_lock(entry));
+ return (struct litmus_lock*) entry->obj->obj;
+}
+
+static int create_generic_lock(void** obj_ref, obj_type_t type, void* __user arg)
+{
+ struct litmus_lock* lock;
+ int err;
+
+ err = litmus->allocate_lock(&lock, type, arg);
+ if (err == 0)
+ *obj_ref = lock;
+ return err;
+}
+
+static int open_generic_lock(struct od_table_entry* entry, void* __user arg)
+{
+ struct litmus_lock* lock = get_lock(entry);
+ if (lock->ops->open)
+ return lock->ops->open(lock, arg);
+ else
+ return 0; /* default: any task can open it */
+}
+
+static int close_generic_lock(struct od_table_entry* entry)
+{
+ struct litmus_lock* lock = get_lock(entry);
+ if (lock->ops->close)
+ return lock->ops->close(lock);
+ else
+ return 0; /* default: closing succeeds */
+}
+
+static void destroy_generic_lock(obj_type_t type, void* obj)
+{
+ struct litmus_lock* lock = (struct litmus_lock*) obj;
+ lock->ops->deallocate(lock);
+}
+
+asmlinkage long sys_litmus_lock(int lock_od)
+{
+ long err = -EINVAL;
+ struct od_table_entry* entry;
+ struct litmus_lock* l;
+
+ TS_SYSCALL_IN_START;
+
+ TS_SYSCALL_IN_END;
+
+ TS_LOCK_START;
+
+ entry = get_entry_for_od(lock_od);
+ if (entry && is_lock(entry)) {
+ l = get_lock(entry);
+ TRACE_CUR("attempts to lock 0x%p\n", l);
+ err = l->ops->lock(l);
+ }
+
+ /* Note: the task may have been suspended or preempted in between! Take
+ * this into account when computing overheads. */
+ TS_LOCK_END;
+
+ TS_SYSCALL_OUT_START;
+
+ return err;
+}
+
+asmlinkage long sys_litmus_unlock(int lock_od)
+{
+ long err = -EINVAL;
+ struct od_table_entry* entry;
+ struct litmus_lock* l;
+
+ TS_SYSCALL_IN_START;
+
+ TS_SYSCALL_IN_END;
+
+ TS_UNLOCK_START;
+
+ entry = get_entry_for_od(lock_od);
+ if (entry && is_lock(entry)) {
+ l = get_lock(entry);
+ TRACE_CUR("attempts to unlock 0x%p\n", l);
+ err = l->ops->unlock(l);
+ }
+
+ /* Note: the task may have been preempted in between! Take this into
+ * account when computing overheads. */
+ TS_UNLOCK_END;
+
+ TS_SYSCALL_OUT_START;
+
+ return err;
+}
+
+struct task_struct* __waitqueue_remove_first(wait_queue_head_t *wq)
+{
+ wait_queue_t* q;
+ struct task_struct* t = NULL;
+
+ if (waitqueue_active(wq)) {
+ q = list_entry(wq->task_list.next,
+ wait_queue_t, task_list);
+ t = (struct task_struct*) q->private;
+ __remove_wait_queue(wq, q);
+ }
+ return(t);
+}
+
+unsigned int __add_wait_queue_prio_exclusive(
+ wait_queue_head_t* head,
+ prio_wait_queue_t *new)
+{
+ struct list_head *pos;
+ unsigned int passed = 0;
+
+ new->wq.flags |= WQ_FLAG_EXCLUSIVE;
+
+ /* find a spot where the new entry is less than the next */
+ list_for_each(pos, &head->task_list) {
+ prio_wait_queue_t* queued = list_entry(pos, prio_wait_queue_t,
+ wq.task_list);
+
+ if (unlikely(lt_before(new->priority, queued->priority) ||
+ (new->priority == queued->priority &&
+ new->tie_breaker < queued->tie_breaker))) {
+ /* the entry at pos has a lower priority than new: insert new before it */
+ __list_add(&new->wq.task_list, pos->prev, pos);
+ goto out;
+ }
+ passed++;
+ }
+
+ /* if we get to this point, either the list is empty or every queued
+ * element has a priority no lower than new's.
+ * Let's add new to the end. */
+ list_add_tail(&new->wq.task_list, &head->task_list);
+out:
+ return passed;
+}
+
+
+#else
+
+struct fdso_ops generic_lock_ops = {};
+
+asmlinkage long sys_litmus_lock(int sem_od)
+{
+ return -ENOSYS;
+}
+
+asmlinkage long sys_litmus_unlock(int sem_od)
+{
+ return -ENOSYS;
+}
+
+#endif
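To make the generic glue above concrete, the following is a rough sketch of what a plugin-provided lock object could look like: a simple FIFO mutex built on a wait queue, of the kind a plugin's allocate_lock() callback might hand back. It is illustrative only. The embedding of struct litmus_lock, the struct litmus_lock_ops name, and the 'ops' field are assumptions based on how lock->ops is dereferenced above (and on include/litmus/locking.h in the diffstat); the usual <litmus/locking.h>, <linux/wait.h>, and <linux/slab.h> includes are assumed, and a production implementation would additionally handle spurious wakeups and priority inheritance.

/* hypothetical lock object; the generic part must come with an 'ops' pointer
 * as used by the fdso glue above */
struct demo_mutex {
        struct litmus_lock litmus_lock; /* generic part used by this file */
        spinlock_t         wait_lock;
        wait_queue_head_t  waiters;     /* FIFO queue of blocked tasks */
        struct task_struct *owner;
};

static inline struct demo_mutex *demo_from_lock(struct litmus_lock *l)
{
        return container_of(l, struct demo_mutex, litmus_lock);
}

static int demo_lock(struct litmus_lock *l)
{
        struct demo_mutex *m = demo_from_lock(l);
        wait_queue_t wait;
        unsigned long flags;

        spin_lock_irqsave(&m->wait_lock, flags);
        if (m->owner) {
                /* contended: enqueue FIFO and sleep; demo_unlock() transfers
                 * ownership to us before waking us up */
                init_waitqueue_entry(&wait, current);
                set_task_state(current, TASK_UNINTERRUPTIBLE);
                __add_wait_queue_tail_exclusive(&m->waiters, &wait);
                spin_unlock_irqrestore(&m->wait_lock, flags);
                schedule();
        } else {
                m->owner = current;
                spin_unlock_irqrestore(&m->wait_lock, flags);
        }
        return 0;
}

static int demo_unlock(struct litmus_lock *l)
{
        struct demo_mutex *m = demo_from_lock(l);
        struct task_struct *next;
        unsigned long flags;
        int err = 0;

        spin_lock_irqsave(&m->wait_lock, flags);
        if (m->owner != current) {
                err = -EINVAL;
        } else {
                /* hand the lock to the first waiter, if any */
                next = __waitqueue_remove_first(&m->waiters);
                m->owner = next;
                if (next)
                        wake_up_process(next);
        }
        spin_unlock_irqrestore(&m->wait_lock, flags);
        return err;
}

static void demo_deallocate(struct litmus_lock *l)
{
        kfree(demo_from_lock(l));
}

/* consumed by the fdso glue above; .open/.close stay NULL, so the defaults
 * in open_generic_lock()/close_generic_lock() apply */
static struct litmus_lock_ops demo_mutex_ops = {
        .lock       = demo_lock,
        .unlock     = demo_unlock,
        .deallocate = demo_deallocate,
};

static struct litmus_lock *demo_mutex_new(void)
{
        struct demo_mutex *m = kmalloc(sizeof(*m), GFP_KERNEL);
        if (!m)
                return NULL;
        m->owner = NULL;
        spin_lock_init(&m->wait_lock);
        init_waitqueue_head(&m->waiters);
        m->litmus_lock.ops = &demo_mutex_ops;
        return &m->litmus_lock; /* what a plugin's allocate_lock() would return */
}
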
diff --git a/litmus/preempt.c b/litmus/preempt.c
new file mode 100644
index 0000000..6be2f26
--- /dev/null
+++ b/litmus/preempt.c
@@ -0,0 +1,137 @@
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/preempt.h>
+#include <litmus/trace.h>
+
+/* The rescheduling state of each processor.
+ */
+DEFINE_PER_CPU_SHARED_ALIGNED(atomic_t, resched_state);
+
+void sched_state_will_schedule(struct task_struct* tsk)
+{
+ /* Litmus hack: we only care about processor-local invocations of
+ * set_tsk_need_resched(). We can't reliably set the flag remotely
+ * since it might race with other updates to the scheduling state. We
+ * can't rely on the runqueue lock protecting updates to the sched
+ * state since processors do not acquire the runqueue locks for all
+ * updates to the sched state (to avoid acquiring two runqueue locks at
+ * the same time). Further, if tsk is residing on a remote processor,
+ * then that processor doesn't actually know yet that it is going to
+ * reschedule; it still must receive an IPI (unless a local invocation
+ * races).
+ */
+ if (likely(task_cpu(tsk) == smp_processor_id())) {
+ VERIFY_SCHED_STATE(TASK_SCHEDULED | SHOULD_SCHEDULE | TASK_PICKED | WILL_SCHEDULE);
+ if (is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK))
+ set_sched_state(PICKED_WRONG_TASK);
+ else
+ set_sched_state(WILL_SCHEDULE);
+ } else
+ /* Litmus tasks should never be subject to a remote
+ * set_tsk_need_resched(). */
+ BUG_ON(is_realtime(tsk));
+#ifdef CONFIG_PREEMPT_STATE_TRACE
+ TRACE_TASK(tsk, "set_tsk_need_resched() ret:%p\n",
+ __builtin_return_address(0));
+#endif
+}
+
+/* Called by the IPI handler after another CPU called smp_send_resched(). */
+void sched_state_ipi(void)
+{
+ /* If the IPI was slow, we might be in any state right now. The IPI is
+ * only meaningful if we are in SHOULD_SCHEDULE. */
+ if (is_in_sched_state(SHOULD_SCHEDULE)) {
+ /* Cause scheduler to be invoked.
+ * This will cause a transition to WILL_SCHEDULE. */
+ set_tsk_need_resched(current);
+ TRACE_STATE("IPI -> set_tsk_need_resched(%s/%d)\n",
+ current->comm, current->pid);
+ TS_SEND_RESCHED_END;
+ } else {
+ /* ignore */
+ TRACE_STATE("ignoring IPI in state %x (%s)\n",
+ get_sched_state(),
+ sched_state_name(get_sched_state()));
+ }
+}
+
+/* Called by plugins to cause a CPU to reschedule. IMPORTANT: the caller must
+ * hold the lock that is used to serialize scheduling decisions. */
+void litmus_reschedule(int cpu)
+{
+ int picked_transition_ok = 0;
+ int scheduled_transition_ok = 0;
+
+ /* The (remote) CPU could be in any state. */
+
+ /* The critical states are TASK_PICKED and TASK_SCHEDULED, as the CPU
+ * is not aware of the need to reschedule at this point. */
+
+ /* is a context switch in progress? */
+ if (cpu_is_in_sched_state(cpu, TASK_PICKED))
+ picked_transition_ok = sched_state_transition_on(
+ cpu, TASK_PICKED, PICKED_WRONG_TASK);
+
+ if (!picked_transition_ok &&
+ cpu_is_in_sched_state(cpu, TASK_SCHEDULED)) {
+ /* We either raced with the end of the context switch, or the
+ * CPU was in TASK_SCHEDULED anyway. */
+ scheduled_transition_ok = sched_state_transition_on(
+ cpu, TASK_SCHEDULED, SHOULD_SCHEDULE);
+ }
+
+ /* If the CPU was in state TASK_SCHEDULED, then we need to cause the
+ * scheduler to be invoked. */
+ if (scheduled_transition_ok) {
+ if (smp_processor_id() == cpu)
+ set_tsk_need_resched(current);
+ else {
+ TS_SEND_RESCHED_START(cpu);
+ smp_send_reschedule(cpu);
+ }
+ }
+
+ TRACE_STATE("%s picked-ok:%d sched-ok:%d\n",
+ __FUNCTION__,
+ picked_transition_ok,
+ scheduled_transition_ok);
+}
+
+void litmus_reschedule_local(void)
+{
+ if (is_in_sched_state(TASK_PICKED))
+ set_sched_state(PICKED_WRONG_TASK);
+ else if (is_in_sched_state(TASK_SCHEDULED | SHOULD_SCHEDULE)) {
+ set_sched_state(WILL_SCHEDULE);
+ set_tsk_need_resched(current);
+ }
+}
+
+#ifdef CONFIG_DEBUG_KERNEL
+
+void sched_state_plugin_check(void)
+{
+ if (!is_in_sched_state(TASK_PICKED | PICKED_WRONG_TASK)) {
+ TRACE("!!!! plugin did not call sched_state_task_picked()!"
+ "Calling sched_state_task_picked() is mandatory---fix this.\n");
+ set_sched_state(TASK_PICKED);
+ }
+}
+
+#define NAME_CHECK(x) case x: return #x
+const char* sched_state_name(int s)
+{
+ switch (s) {
+ NAME_CHECK(TASK_SCHEDULED);
+ NAME_CHECK(SHOULD_SCHEDULE);
+ NAME_CHECK(WILL_SCHEDULE);
+ NAME_CHECK(TASK_PICKED);
+ NAME_CHECK(PICKED_WRONG_TASK);
+ default:
+ return "UNKNOWN";
+ };
+}
+
+#endif
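The state machine above implies a small contract for plugins: every schedule() implementation must report its decision via sched_state_task_picked(), and cross-CPU preemptions should be requested with litmus_reschedule() while the lock that serializes the plugin's scheduling decisions is held. A condensed sketch of that calling pattern follows; the plugin name, lock, and policy function are placeholders and not part of this patch.

static DEFINE_RAW_SPINLOCK(demo_lock); /* serializes demo scheduling decisions */

/* placeholder policy: a real plugin consults its rt_domain here */
static struct task_struct *demo_pick_next_task(void)
{
        return NULL;
}

static struct task_struct *demo_schedule(struct task_struct *prev)
{
        struct task_struct *next;

        raw_spin_lock(&demo_lock);
        next = demo_pick_next_task();

        /* mandatory: otherwise sched_state_plugin_check() complains and
         * forces the state back to TASK_PICKED */
        sched_state_task_picked();

        raw_spin_unlock(&demo_lock);
        return next;
}

/* e.g. on a job release that beats what 'cpu' currently runs;
 * called with demo_lock held, as litmus_reschedule() requires */
static void demo_preempt_cpu(int cpu)
{
        litmus_reschedule(cpu);
}
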
diff --git a/litmus/rt_domain.c b/litmus/rt_domain.c
new file mode 100644
index 0000000..1683d38
--- /dev/null
+++ b/litmus/rt_domain.c
@@ -0,0 +1,349 @@
+/*
+ * litmus/rt_domain.c
+ *
+ * LITMUS real-time infrastructure. This file contains the
+ * functions that manipulate RT domains. RT domains are an abstraction
+ * of a ready queue and a release queue.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/rt_domain.h>
+
+#include <litmus/trace.h>
+
+#include <litmus/bheap.h>
+
+/* Uncomment when debugging timer races... */
+#if 0
+#define VTRACE_TASK TRACE_TASK
+#define VTRACE TRACE
+#else
+#define VTRACE_TASK(t, fmt, args...) /* shut up */
+#define VTRACE(fmt, args...) /* be quiet already */
+#endif
+
+static int dummy_resched(rt_domain_t *rt)
+{
+ return 0;
+}
+
+static int dummy_order(struct bheap_node* a, struct bheap_node* b)
+{
+ return 0;
+}
+
+/* default implementation: use default lock */
+static void default_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+ merge_ready(rt, tasks);
+}
+
+static unsigned int time2slot(lt_t time)
+{
+ return (unsigned int) time2quanta(time, FLOOR) % RELEASE_QUEUE_SLOTS;
+}
+
+static enum hrtimer_restart on_release_timer(struct hrtimer *timer)
+{
+ unsigned long flags;
+ struct release_heap* rh;
+ rh = container_of(timer, struct release_heap, timer);
+
+ TS_RELEASE_LATENCY(rh->release_time);
+
+ VTRACE("on_release_timer(0x%p) starts.\n", timer);
+
+ TS_RELEASE_START;
+
+
+ raw_spin_lock_irqsave(&rh->dom->release_lock, flags);
+ VTRACE("CB has the release_lock 0x%p\n", &rh->dom->release_lock);
+ /* remove from release queue */
+ list_del(&rh->list);
+ raw_spin_unlock_irqrestore(&rh->dom->release_lock, flags);
+ VTRACE("CB returned release_lock 0x%p\n", &rh->dom->release_lock);
+
+ /* call release callback */
+ rh->dom->release_jobs(rh->dom, &rh->heap);
+ /* WARNING: rh can be referenced from other CPUs from now on. */
+
+ TS_RELEASE_END;
+
+ VTRACE("on_release_timer(0x%p) ends.\n", timer);
+
+ return HRTIMER_NORESTART;
+}
+
+/* allocated in litmus.c */
+struct kmem_cache * release_heap_cache;
+
+struct release_heap* release_heap_alloc(int gfp_flags)
+{
+ struct release_heap* rh;
+ rh = kmem_cache_alloc(release_heap_cache, gfp_flags);
+ if (rh) {
+ /* initialize timer */
+ hrtimer_init(&rh->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ rh->timer.function = on_release_timer;
+ }
+ return rh;
+}
+
+void release_heap_free(struct release_heap* rh)
+{
+ /* make sure timer is no longer in use */
+ hrtimer_cancel(&rh->timer);
+ kmem_cache_free(release_heap_cache, rh);
+}
+
+/* Caller must hold release lock.
+ * Will return heap for given time. If no such heap exists prior to
+ * the invocation it will be created.
+ */
+static struct release_heap* get_release_heap(rt_domain_t *rt,
+ struct task_struct* t,
+ int use_task_heap)
+{
+ struct list_head* pos;
+ struct release_heap* heap = NULL;
+ struct release_heap* rh;
+ lt_t release_time = get_release(t);
+ unsigned int slot = time2slot(release_time);
+
+ /* initialize pos for the case that the list is empty */
+ pos = rt->release_queue.slot[slot].next;
+ list_for_each(pos, &rt->release_queue.slot[slot]) {
+ rh = list_entry(pos, struct release_heap, list);
+ if (release_time == rh->release_time) {
+ /* perfect match -- this happens on hyperperiod
+ * boundaries
+ */
+ heap = rh;
+ break;
+ } else if (lt_before(release_time, rh->release_time)) {
+ /* we need to insert a new node since rh is
+ * already in the future
+ */
+ break;
+ }
+ }
+ if (!heap && use_task_heap) {
+ /* use pre-allocated release heap */
+ rh = tsk_rt(t)->rel_heap;
+
+ rh->dom = rt;
+ rh->release_time = release_time;
+
+ /* add to release queue */
+ list_add(&rh->list, pos->prev);
+ heap = rh;
+ }
+ return heap;
+}
+
+static void reinit_release_heap(struct task_struct* t)
+{
+ struct release_heap* rh;
+
+ /* use pre-allocated release heap */
+ rh = tsk_rt(t)->rel_heap;
+
+ /* Make sure it is safe to use. The timer callback could still
+ * be executing on another CPU; hrtimer_cancel() will wait
+ * until the timer callback has completed. However, under no
+ * circumstances should the timer be active (= yet to be
+ * triggered).
+ *
+ * WARNING: If the CPU still holds the release_lock at this point,
+ * deadlock may occur!
+ */
+ BUG_ON(hrtimer_cancel(&rh->timer));
+
+ /* initialize */
+ bheap_init(&rh->heap);
+#ifdef CONFIG_RELEASE_MASTER
+ atomic_set(&rh->info.state, HRTIMER_START_ON_INACTIVE);
+#endif
+}
+/* arm_release_timer() - start local release timer or trigger
+ * remote timer (pull timer)
+ *
+ * Called by add_release() with:
+ * - tobe_lock taken
+ * - IRQ disabled
+ */
+#ifdef CONFIG_RELEASE_MASTER
+#define arm_release_timer(t) arm_release_timer_on((t), NO_CPU)
+static void arm_release_timer_on(rt_domain_t *_rt , int target_cpu)
+#else
+static void arm_release_timer(rt_domain_t *_rt)
+#endif
+{
+ rt_domain_t *rt = _rt;
+ struct list_head list;
+ struct list_head *pos, *safe;
+ struct task_struct* t;
+ struct release_heap* rh;
+
+ VTRACE("arm_release_timer() at %llu\n", litmus_clock());
+ list_replace_init(&rt->tobe_released, &list);
+
+ list_for_each_safe(pos, safe, &list) {
+ /* pick task off the work list */
+ t = list_entry(pos, struct task_struct, rt_param.list);
+ sched_trace_task_release(t);
+ list_del(pos);
+
+ /* put into release heap while holding release_lock */
+ raw_spin_lock(&rt->release_lock);
+ VTRACE_TASK(t, "I have the release_lock 0x%p\n", &rt->release_lock);
+
+ rh = get_release_heap(rt, t, 0);
+ if (!rh) {
+ /* need to use our own, but drop lock first */
+ raw_spin_unlock(&rt->release_lock);
+ VTRACE_TASK(t, "Dropped release_lock 0x%p\n",
+ &rt->release_lock);
+
+ reinit_release_heap(t);
+ VTRACE_TASK(t, "release_heap ready\n");
+
+ raw_spin_lock(&rt->release_lock);
+ VTRACE_TASK(t, "Re-acquired release_lock 0x%p\n",
+ &rt->release_lock);
+
+ rh = get_release_heap(rt, t, 1);
+ }
+ bheap_insert(rt->order, &rh->heap, tsk_rt(t)->heap_node);
+ VTRACE_TASK(t, "arm_release_timer(): added to release heap\n");
+
+ raw_spin_unlock(&rt->release_lock);
+ VTRACE_TASK(t, "Returned the release_lock 0x%p\n", &rt->release_lock);
+
+ /* To avoid arming the timer multiple times, we only let the
+ * owner do the arming (which is the "first" task to reference
+ * this release_heap anyway).
+ */
+ if (rh == tsk_rt(t)->rel_heap) {
+ VTRACE_TASK(t, "arming timer 0x%p\n", &rh->timer);
+ /* we cannot arm the timer using hrtimer_start()
+ * as it may deadlock on rq->lock
+ *
+ * PINNED mode is ok on both local and remote CPU
+ */
+#ifdef CONFIG_RELEASE_MASTER
+ if (rt->release_master == NO_CPU &&
+ target_cpu == NO_CPU)
+#endif
+ __hrtimer_start_range_ns(&rh->timer,
+ ns_to_ktime(rh->release_time),
+ 0, HRTIMER_MODE_ABS_PINNED, 0);
+#ifdef CONFIG_RELEASE_MASTER
+ else
+ hrtimer_start_on(
+ /* target_cpu overrides release master */
+ (target_cpu != NO_CPU ?
+ target_cpu : rt->release_master),
+ &rh->info, &rh->timer,
+ ns_to_ktime(rh->release_time),
+ HRTIMER_MODE_ABS_PINNED);
+#endif
+ } else
+ VTRACE_TASK(t, "0x%p is not my timer\n", &rh->timer);
+ }
+}
+
+void rt_domain_init(rt_domain_t *rt,
+ bheap_prio_t order,
+ check_resched_needed_t check,
+ release_jobs_t release
+ )
+{
+ int i;
+
+ BUG_ON(!rt);
+ if (!check)
+ check = dummy_resched;
+ if (!release)
+ release = default_release_jobs;
+ if (!order)
+ order = dummy_order;
+
+#ifdef CONFIG_RELEASE_MASTER
+ rt->release_master = NO_CPU;
+#endif
+
+ bheap_init(&rt->ready_queue);
+ INIT_LIST_HEAD(&rt->tobe_released);
+ for (i = 0; i < RELEASE_QUEUE_SLOTS; i++)
+ INIT_LIST_HEAD(&rt->release_queue.slot[i]);
+
+ raw_spin_lock_init(&rt->ready_lock);
+ raw_spin_lock_init(&rt->release_lock);
+ raw_spin_lock_init(&rt->tobe_lock);
+
+ rt->check_resched = check;
+ rt->release_jobs = release;
+ rt->order = order;
+}
+
+/* add_ready - add a real-time task to the rt ready queue. It must be runnable.
+ * @new: the newly released task
+ */
+void __add_ready(rt_domain_t* rt, struct task_struct *new)
+{
+ TRACE("rt: adding %s/%d (%llu, %llu, %llu) rel=%llu "
+ "to ready queue at %llu\n",
+ new->comm, new->pid,
+ get_exec_cost(new), get_rt_period(new), get_rt_relative_deadline(new),
+ get_release(new), litmus_clock());
+
+ BUG_ON(bheap_node_in_heap(tsk_rt(new)->heap_node));
+
+ bheap_insert(rt->order, &rt->ready_queue, tsk_rt(new)->heap_node);
+ rt->check_resched(rt);
+}
+
+/* merge_ready - Add a sorted set of tasks to the rt ready queue. They must be runnable.
+ * @tasks - the newly released tasks
+ */
+void __merge_ready(rt_domain_t* rt, struct bheap* tasks)
+{
+ bheap_union(rt->order, &rt->ready_queue, tasks);
+ rt->check_resched(rt);
+}
+
+
+#ifdef CONFIG_RELEASE_MASTER
+void __add_release_on(rt_domain_t* rt, struct task_struct *task,
+ int target_cpu)
+{
+ TRACE_TASK(task, "add_release_on(), rel=%llu, target=%d\n",
+ get_release(task), target_cpu);
+ list_add(&tsk_rt(task)->list, &rt->tobe_released);
+ task->rt_param.domain = rt;
+
+ arm_release_timer_on(rt, target_cpu);
+}
+#endif
+
+/* add_release - add a real-time task to the rt release queue.
+ * @task: the sleeping task
+ */
+void __add_release(rt_domain_t* rt, struct task_struct *task)
+{
+ TRACE_TASK(task, "add_release(), rel=%llu\n", get_release(task));
+ list_add(&tsk_rt(task)->list, &rt->tobe_released);
+ task->rt_param.domain = rt;
+
+ arm_release_timer(rt);
+}
+
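Taken together, rt_domain_init(), __add_ready()/__add_release(), and the release-timer machinery above are all a plugin needs for a basic ready/release split. Below is a minimal usage sketch; the domain name and callbacks are placeholders (a real plugin would pass its priority order, e.g. an EDF order, instead of NULL), and the locking follows the rules stated above: ready_lock around ready-queue updates, tobe_lock with IRQs off around __add_release().

static rt_domain_t demo_domain;

static int demo_check_resched(rt_domain_t *rt)
{
        /* placeholder: a real plugin compares the head of the ready queue
         * against what is currently scheduled and requests a preemption
         * (e.g. via litmus_reschedule()) when warranted */
        return 0;
}

static void demo_domain_setup(void)
{
        /* NULL order and NULL release callback fall back to dummy_order()
         * and default_release_jobs() defined above */
        rt_domain_init(&demo_domain, NULL, demo_check_resched, NULL);
}

/* queue a job whose release time has already passed */
static void demo_queue_ready(struct task_struct *t)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_domain.ready_lock, flags);
        __add_ready(&demo_domain, t);
        raw_spin_unlock_irqrestore(&demo_domain.ready_lock, flags);
}

/* queue a future release; arm_release_timer() expects tobe_lock + IRQs off */
static void demo_queue_release(struct task_struct *t)
{
        unsigned long flags;

        raw_spin_lock_irqsave(&demo_domain.tobe_lock, flags);
        __add_release(&demo_domain, t);
        raw_spin_unlock_irqrestore(&demo_domain.tobe_lock, flags);
}
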
diff --git a/litmus/sched_cedf.c b/litmus/sched_cedf.c
new file mode 100644
index 0000000..b45b46f
--- /dev/null
+++ b/litmus/sched_cedf.c
@@ -0,0 +1,856 @@
+/*
+ * litmus/sched_cedf.c
+ *
+ * Implementation of the C-EDF scheduling algorithm.
+ *
+ * This implementation is based on G-EDF:
+ * - CPUs are clustered around L2 or L3 caches.
+ * - Cluster topology is automatically detected (this is arch dependent
+ * and currently works only on x86 --- and only with modern CPUs
+ * that export cpuid4 information).
+ * - The plugin _does not_ attempt to put tasks in the right cluster, i.e.,
+ * the programmer needs to be aware of the topology to place tasks
+ * in the desired cluster.
+ * - default clustering is around L2 cache (cache index = 2)
+ * supported clusters are: L1 (private cache: pedf), L2, L3, ALL (all
+ * online_cpus are placed in a single cluster).
+ *
+ * For details on functions, take a look at sched_gsn_edf.c
+ *
+ * Currently, we do not support changes in the number of online cpus.
+ * If the num_online_cpus() dynamically changes, the plugin is broken.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/budget.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/clustered.h>
+
+#include <litmus/bheap.h>
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+#include <litmus/affinity.h>
+#endif
+
+/* to configure the cluster size */
+#include <litmus/litmus_proc.h>
+#include <linux/uaccess.h>
+
+/* Reference configuration variable. Determines which cache level is used to
+ * group CPUs into clusters. GLOBAL_CLUSTER, which is the default, means that
+ * all CPUs form a single cluster (just like GSN-EDF).
+ */
+static enum cache_level cluster_config = GLOBAL_CLUSTER;
+
+struct clusterdomain;
+
+/* cpu_entry_t - maintain the linked and scheduled state
+ *
+ * A cpu also contains a pointer to the cedf_domain_t cluster
+ * that owns it (struct clusterdomain*)
+ */
+typedef struct {
+ int cpu;
+ struct clusterdomain* cluster; /* owning cluster */
+ struct task_struct* linked; /* only RT tasks */
+ struct task_struct* scheduled; /* only RT tasks */
+ atomic_t will_schedule; /* prevent unneeded IPIs */
+ struct bheap_node* hn;
+} cpu_entry_t;
+
+/* one cpu_entry_t per CPU */
+DEFINE_PER_CPU(cpu_entry_t, cedf_cpu_entries);
+
+#define set_will_schedule() \
+ (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 1))
+#define clear_will_schedule() \
+ (atomic_set(&__get_cpu_var(cedf_cpu_entries).will_schedule, 0))
+#define test_will_schedule(cpu) \
+ (atomic_read(&per_cpu(cedf_cpu_entries, cpu).will_schedule))
+
+/*
+ * In C-EDF there is a cedf domain _per_ cluster.
+ * The number of clusters is determined dynamically from the total number
+ * of online CPUs and the configured cluster size.
+ */
+typedef struct clusterdomain {
+ /* rt_domain for this cluster */
+ rt_domain_t domain;
+ /* cpus in this cluster */
+ cpu_entry_t* *cpus;
+ /* map of this cluster cpus */
+ cpumask_var_t cpu_map;
+ /* the cpus queue themselves according to priority in here */
+ struct bheap_node *heap_node;
+ struct bheap cpu_heap;
+ /* lock for this cluster */
+#define cluster_lock domain.ready_lock
+} cedf_domain_t;
+
+/* a cedf_domain per cluster; allocation is done at init/activation time */
+cedf_domain_t *cedf;
+
+#define remote_cluster(cpu) ((cedf_domain_t *) per_cpu(cedf_cpu_entries, cpu).cluster)
+#define task_cpu_cluster(task) remote_cluster(get_partition(task))
+
+/* Uncomment WANT_ALL_SCHED_EVENTS if you want to see all scheduling
+ * decisions in the TRACE() log; uncomment VERBOSE_INIT for verbose
+ * information during the initialization of the plugin (e.g., topology)
+#define WANT_ALL_SCHED_EVENTS
+ */
+#define VERBOSE_INIT
+
+static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
+{
+ cpu_entry_t *a, *b;
+ a = _a->value;
+ b = _b->value;
+ /* Note that a and b are inverted: we want the lowest-priority CPU at
+ * the top of the heap.
+ */
+ return edf_higher_prio(b->linked, a->linked);
+}
+
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ * order in the cpu queue. Caller must hold cedf lock.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+ cedf_domain_t *cluster = entry->cluster;
+
+ if (likely(bheap_node_in_heap(entry->hn)))
+ bheap_delete(cpu_lower_prio,
+ &cluster->cpu_heap,
+ entry->hn);
+
+ bheap_insert(cpu_lower_prio, &cluster->cpu_heap, entry->hn);
+}
+
+/* caller must hold cedf lock */
+static cpu_entry_t* lowest_prio_cpu(cedf_domain_t *cluster)
+{
+ struct bheap_node* hn;
+ hn = bheap_peek(cpu_lower_prio, &cluster->cpu_heap);
+ return hn->value;
+}
+
+
+/* link_task_to_cpu - Update the link of a CPU.
+ * Handles the case where the to-be-linked task is already
+ * scheduled on a different CPU.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+ cpu_entry_t *entry)
+{
+ cpu_entry_t *sched;
+ struct task_struct* tmp;
+ int on_cpu;
+
+ BUG_ON(linked && !is_realtime(linked));
+
+ /* Currently linked task is set to be unlinked. */
+ if (entry->linked) {
+ entry->linked->rt_param.linked_on = NO_CPU;
+ }
+
+ /* Link new task to CPU. */
+ if (linked) {
+ tsk_rt(linked)->completed = 0;
+ /* handle the case that the task is already scheduled somewhere! */
+ on_cpu = linked->rt_param.scheduled_on;
+ if (on_cpu != NO_CPU) {
+ sched = &per_cpu(cedf_cpu_entries, on_cpu);
+ /* this should only happen if not linked already */
+ BUG_ON(sched->linked == linked);
+
+ /* If we are already scheduled on the CPU to which we
+ * wanted to link, we don't need to do the swap --
+ * we just link ourselves to the CPU and depend on
+ * the caller to get things right.
+ */
+ if (entry != sched) {
+ TRACE_TASK(linked,
+ "already scheduled on %d, updating link.\n",
+ sched->cpu);
+ tmp = sched->linked;
+ linked->rt_param.linked_on = sched->cpu;
+ sched->linked = linked;
+ update_cpu_position(sched);
+ linked = tmp;
+ }
+ }
+ if (linked) /* might be NULL due to swap */
+ linked->rt_param.linked_on = entry->cpu;
+ }
+ entry->linked = linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+ if (linked)
+ TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+ else
+ TRACE("NULL linked to %d.\n", entry->cpu);
+#endif
+ update_cpu_position(entry);
+}
+
+/* unlink - Make sure a task is not linked any longer to an entry
+ * where it was linked before. Must hold cedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+ cpu_entry_t *entry;
+
+ if (t->rt_param.linked_on != NO_CPU) {
+ /* unlink */
+ entry = &per_cpu(cedf_cpu_entries, t->rt_param.linked_on);
+ t->rt_param.linked_on = NO_CPU;
+ link_task_to_cpu(NULL, entry);
+ } else if (is_queued(t)) {
+ /* This is an interesting situation: t is scheduled,
+ * but was just recently unlinked. It cannot be
+ * linked anywhere else (because then it would have
+ * been relinked to this CPU), thus it must be in some
+ * queue. We must remove it from the list in this
+ * case.
+ *
+ * In the C-EDF case it should be somewhere in the queue of
+ * its domain, and we can obtain that domain via
+ * task_cpu_cluster().
+ */
+ remove(&(task_cpu_cluster(t))->domain, t);
+ }
+}
+
+
+/* preempt - force a CPU to reschedule
+ */
+static void preempt(cpu_entry_t *entry)
+{
+ preempt_if_preemptable(entry->scheduled, entry->cpu);
+}
+
+/* requeue - Put an unlinked task into its cluster's cedf domain.
+ * Caller must hold cedf_lock.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+ cedf_domain_t *cluster = task_cpu_cluster(task);
+ BUG_ON(!task);
+ /* sanity check before insertion */
+ BUG_ON(is_queued(task));
+
+ if (is_released(task, litmus_clock()))
+ __add_ready(&cluster->domain, task);
+ else {
+ /* it has got to wait */
+ add_release(&cluster->domain, task);
+ }
+}
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+static cpu_entry_t* cedf_get_nearest_available_cpu(
+ cedf_domain_t *cluster, cpu_entry_t *start)
+{
+ cpu_entry_t *affinity;
+
+ get_nearest_available_cpu(affinity, start, cedf_cpu_entries,
+#ifdef CONFIG_RELEASE_MASTER
+ cluster->domain.release_master
+#else
+ NO_CPU
+#endif
+ );
+
+ /* make sure CPU is in our cluster */
+ if (affinity && cpu_isset(affinity->cpu, *cluster->cpu_map))
+ return(affinity);
+ else
+ return(NULL);
+}
+#endif
+
+
+/* check for any necessary preemptions */
+static void check_for_preemptions(cedf_domain_t *cluster)
+{
+ struct task_struct *task;
+ cpu_entry_t *last;
+
+ for(last = lowest_prio_cpu(cluster);
+ edf_preemption_needed(&cluster->domain, last->linked);
+ last = lowest_prio_cpu(cluster)) {
+ /* preemption necessary */
+ task = __take_ready(&cluster->domain);
+ TRACE("check_for_preemptions: attempting to link task %d to %d\n",
+ task->pid, last->cpu);
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+ {
+ cpu_entry_t *affinity =
+ cedf_get_nearest_available_cpu(cluster,
+ &per_cpu(cedf_cpu_entries, task_cpu(task)));
+ if(affinity)
+ last = affinity;
+ else if(requeue_preempted_job(last->linked))
+ requeue(last->linked);
+ }
+#else
+ if (requeue_preempted_job(last->linked))
+ requeue(last->linked);
+#endif
+ link_task_to_cpu(task, last);
+ preempt(last);
+ }
+}
+
+/* cedf_job_arrival: task is either resumed or released */
+static noinline void cedf_job_arrival(struct task_struct* task)
+{
+ cedf_domain_t *cluster = task_cpu_cluster(task);
+ BUG_ON(!task);
+
+ requeue(task);
+ check_for_preemptions(cluster);
+}
+
+static void cedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+ cedf_domain_t* cluster = container_of(rt, cedf_domain_t, domain);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+
+ __merge_ready(&cluster->domain, tasks);
+ check_for_preemptions(cluster);
+
+ raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+}
+
+/* caller holds cedf_lock */
+static noinline void job_completion(struct task_struct *t, int forced)
+{
+ BUG_ON(!t);
+
+ sched_trace_task_completion(t, forced);
+
+ TRACE_TASK(t, "job_completion().\n");
+
+ /* set flags */
+ tsk_rt(t)->completed = 1;
+ /* prepare for next period */
+ prepare_for_next_period(t);
+ if (is_released(t, litmus_clock()))
+ sched_trace_task_release(t);
+ /* unlink */
+ unlink(t);
+ /* requeue
+ * But don't requeue a blocking task. */
+ if (is_running(t))
+ cedf_job_arrival(t);
+}
+
+/* cedf_tick - this function is called for every local timer
+ * interrupt.
+ *
+ * Checks whether the current task's budget has been exhausted and, if so,
+ * triggers a preemption (or requests one if the task is non-preemptable).
+ */
+static void cedf_tick(struct task_struct* t)
+{
+ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
+ if (!is_np(t)) {
+ /* np tasks will be preempted when they become
+ * preemptable again
+ */
+ litmus_reschedule_local();
+ set_will_schedule();
+ TRACE("cedf_scheduler_tick: "
+ "%d is preemptable "
+ " => FORCE_RESCHED\n", t->pid);
+ } else if (is_user_np(t)) {
+ TRACE("cedf_scheduler_tick: "
+ "%d is non-preemptable, "
+ "preemption delayed.\n", t->pid);
+ request_exit_np(t);
+ }
+ }
+}
+
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include: a scheduler_tick() determined that
+ * a reschedule was necessary, sys_exit_np() was called, some Linux subsystem
+ * determined so, or even (in the worst case) there is a bug hidden somewhere.
+ * Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ * - !is_running(scheduled) // the job blocks
+ * - scheduled->timeslice == 0 // the job completed (forcefully)
+ * - is_completed() // the job completed (by syscall)
+ * - linked != scheduled // we need to reschedule (for any reason)
+ * - is_np(scheduled) // rescheduling must be delayed,
+ * sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* cedf_schedule(struct task_struct * prev)
+{
+ cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
+ cedf_domain_t *cluster = entry->cluster;
+ int out_of_time, sleep, preempt, np, exists, blocks;
+ struct task_struct* next = NULL;
+
+#ifdef CONFIG_RELEASE_MASTER
+ /* Bail out early if we are the release master.
+ * The release master never schedules any real-time tasks.
+ */
+ if (unlikely(cluster->domain.release_master == entry->cpu)) {
+ sched_state_task_picked();
+ return NULL;
+ }
+#endif
+
+ raw_spin_lock(&cluster->cluster_lock);
+ clear_will_schedule();
+
+ /* sanity checking */
+ BUG_ON(entry->scheduled && entry->scheduled != prev);
+ BUG_ON(entry->scheduled && !is_realtime(prev));
+ BUG_ON(is_realtime(prev) && !entry->scheduled);
+
+ /* (0) Determine state */
+ exists = entry->scheduled != NULL;
+ blocks = exists && !is_running(entry->scheduled);
+ out_of_time = exists &&
+ budget_enforced(entry->scheduled) &&
+ budget_exhausted(entry->scheduled);
+ np = exists && is_np(entry->scheduled);
+ sleep = exists && is_completed(entry->scheduled);
+ preempt = entry->scheduled != entry->linked;
+
+#ifdef WANT_ALL_SCHED_EVENTS
+ TRACE_TASK(prev, "invoked cedf_schedule.\n");
+#endif
+
+ if (exists)
+ TRACE_TASK(prev,
+ "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+ "state:%d sig:%d\n",
+ blocks, out_of_time, np, sleep, preempt,
+ prev->state, signal_pending(prev));
+ if (entry->linked && preempt)
+ TRACE_TASK(prev, "will be preempted by %s/%d\n",
+ entry->linked->comm, entry->linked->pid);
+
+
+ /* If a task blocks we have no choice but to reschedule.
+ */
+ if (blocks)
+ unlink(entry->scheduled);
+
+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
+ * We need to make sure to update the link structure anyway in case
+ * that we are still linked. Multiple calls to request_exit_np() don't
+ * hurt.
+ */
+ if (np && (out_of_time || preempt || sleep)) {
+ unlink(entry->scheduled);
+ request_exit_np(entry->scheduled);
+ }
+
+ /* Any task that is preemptable and either exhausts its execution
+ * budget or wants to sleep completes. We may have to reschedule after
+ * this. Don't do a job completion if we block (can't have timers running
+ * for blocked jobs).
+ */
+ if (!np && (out_of_time || sleep) && !blocks)
+ job_completion(entry->scheduled, !sleep);
+
+ /* Link pending task if we became unlinked.
+ */
+ if (!entry->linked)
+ link_task_to_cpu(__take_ready(&cluster->domain), entry);
+
+ /* The final scheduling decision. Do we need to switch for some reason?
+ * If linked is different from scheduled, then select linked as next.
+ */
+ if ((!np || blocks) &&
+ entry->linked != entry->scheduled) {
+ /* Schedule a linked job? */
+ if (entry->linked) {
+ entry->linked->rt_param.scheduled_on = entry->cpu;
+ next = entry->linked;
+ }
+ if (entry->scheduled) {
+ /* not gonna be scheduled soon */
+ entry->scheduled->rt_param.scheduled_on = NO_CPU;
+ TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+ }
+ } else
+ /* Only override Linux scheduler if we have a real-time task
+ * scheduled that needs to continue.
+ */
+ if (exists)
+ next = prev;
+
+ sched_state_task_picked();
+ raw_spin_unlock(&cluster->cluster_lock);
+
+#ifdef WANT_ALL_SCHED_EVENTS
+ TRACE("cedf_lock released, next=0x%p\n", next);
+
+ if (next)
+ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+ else if (exists && !next)
+ TRACE("becomes idle at %llu.\n", litmus_clock());
+#endif
+
+
+ return next;
+}
+
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void cedf_finish_switch(struct task_struct *prev)
+{
+ cpu_entry_t* entry = &__get_cpu_var(cedf_cpu_entries);
+
+ entry->scheduled = is_realtime(current) ? current : NULL;
+#ifdef WANT_ALL_SCHED_EVENTS
+ TRACE_TASK(prev, "switched away from\n");
+#endif
+}
+
+
+/* Prepare a task for running in RT mode
+ */
+static void cedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+ unsigned long flags;
+ cpu_entry_t* entry;
+ cedf_domain_t* cluster;
+
+ TRACE("gsn edf: task new %d\n", t->pid);
+
+ /* the cluster doesn't change even if t is running */
+ cluster = task_cpu_cluster(t);
+
+ raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+
+ /* setup job params */
+ release_at(t, litmus_clock());
+
+ if (running) {
+ entry = &per_cpu(cedf_cpu_entries, task_cpu(t));
+ BUG_ON(entry->scheduled);
+
+#ifdef CONFIG_RELEASE_MASTER
+ if (entry->cpu != cluster->domain.release_master) {
+#endif
+ entry->scheduled = t;
+ tsk_rt(t)->scheduled_on = task_cpu(t);
+#ifdef CONFIG_RELEASE_MASTER
+ } else {
+ /* do not schedule on release master */
+ preempt(entry); /* force resched */
+ tsk_rt(t)->scheduled_on = NO_CPU;
+ }
+#endif
+ } else {
+ t->rt_param.scheduled_on = NO_CPU;
+ }
+ t->rt_param.linked_on = NO_CPU;
+
+ cedf_job_arrival(t);
+ raw_spin_unlock_irqrestore(&(cluster->cluster_lock), flags);
+}
+
+static void cedf_task_wake_up(struct task_struct *task)
+{
+ unsigned long flags;
+ lt_t now;
+ cedf_domain_t *cluster;
+
+ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+
+ cluster = task_cpu_cluster(task);
+
+ raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+ now = litmus_clock();
+ if (is_tardy(task, now)) {
+ /* new sporadic release */
+ release_at(task, now);
+ sched_trace_task_release(task);
+ }
+ else {
+ if (task->rt.time_slice) {
+ /* came back in time before deadline
+ */
+ tsk_rt(task)->completed = 0;
+ }
+ }
+ cedf_job_arrival(task);
+ raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+}
+
+static void cedf_task_block(struct task_struct *t)
+{
+ unsigned long flags;
+ cedf_domain_t *cluster;
+
+ TRACE_TASK(t, "block at %llu\n", litmus_clock());
+
+ cluster = task_cpu_cluster(t);
+
+ /* unlink if necessary */
+ raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+ unlink(t);
+ raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+
+ BUG_ON(!is_realtime(t));
+}
+
+
+static void cedf_task_exit(struct task_struct * t)
+{
+ unsigned long flags;
+ cedf_domain_t *cluster = task_cpu_cluster(t);
+
+ /* unlink if necessary */
+ raw_spin_lock_irqsave(&cluster->cluster_lock, flags);
+ unlink(t);
+ if (tsk_rt(t)->scheduled_on != NO_CPU) {
+ cpu_entry_t *cpu;
+ cpu = &per_cpu(cedf_cpu_entries, tsk_rt(t)->scheduled_on);
+ cpu->scheduled = NULL;
+ tsk_rt(t)->scheduled_on = NO_CPU;
+ }
+ raw_spin_unlock_irqrestore(&cluster->cluster_lock, flags);
+
+ BUG_ON(!is_realtime(t));
+ TRACE_TASK(t, "RIP\n");
+}
+
+static long cedf_admit_task(struct task_struct* tsk)
+{
+ return task_cpu(tsk) == tsk->rt_param.task_params.cpu ? 0 : -EINVAL;
+}
+
+/* total number of clusters */
+static int num_clusters;
+/* we do not support clusters of different sizes */
+static unsigned int cluster_size;
+
+#ifdef VERBOSE_INIT
+static void print_cluster_topology(cpumask_var_t mask, int cpu)
+{
+ int chk;
+ char buf[255];
+
+ chk = cpulist_scnprintf(buf, 254, mask);
+ buf[chk] = '\0';
+ printk(KERN_INFO "CPU = %d, shared cpu(s) = %s\n", cpu, buf);
+
+}
+#endif
+
+static int clusters_allocated = 0;
+
+static void cleanup_cedf(void)
+{
+ int i;
+
+ if (clusters_allocated) {
+ for (i = 0; i < num_clusters; i++) {
+ kfree(cedf[i].cpus);
+ kfree(cedf[i].heap_node);
+ free_cpumask_var(cedf[i].cpu_map);
+ }
+
+ kfree(cedf);
+ }
+}
+
+static long cedf_activate_plugin(void)
+{
+ int i, j, cpu, ccpu, cpu_count;
+ cpu_entry_t *entry;
+
+ cpumask_var_t mask;
+ int chk = 0;
+
+ /* de-allocate old clusters, if any */
+ cleanup_cedf();
+
+ printk(KERN_INFO "C-EDF: Activate Plugin, cluster configuration = %d\n",
+ cluster_config);
+
+ /* need to get cluster_size first */
+ if(!zalloc_cpumask_var(&mask, GFP_ATOMIC))
+ return -ENOMEM;
+
+ if (unlikely(cluster_config == GLOBAL_CLUSTER)) {
+ cluster_size = num_online_cpus();
+ } else {
+ chk = get_shared_cpu_map(mask, 0, cluster_config);
+ if (chk) {
+ /* if chk != 0 then it is the max allowed index */
+ printk(KERN_INFO "C-EDF: Cluster configuration = %d "
+ "is not supported on this hardware.\n",
+ cluster_config);
+ /* User should notice that the configuration failed, so
+ * let's bail out. */
+ return -EINVAL;
+ }
+
+ cluster_size = cpumask_weight(mask);
+ }
+
+ if ((num_online_cpus() % cluster_size) != 0) {
+ /* this can't be right, some cpus are left out */
+ printk(KERN_ERR "C-EDF: Trying to group %d cpus in %d!\n",
+ num_online_cpus(), cluster_size);
+ return -1;
+ }
+
+ num_clusters = num_online_cpus() / cluster_size;
+ printk(KERN_INFO "C-EDF: %d cluster(s) of size = %d\n",
+ num_clusters, cluster_size);
+
+ /* initialize clusters */
+ cedf = kmalloc(num_clusters * sizeof(cedf_domain_t), GFP_ATOMIC);
+ for (i = 0; i < num_clusters; i++) {
+
+ cedf[i].cpus = kmalloc(cluster_size * sizeof(cpu_entry_t),
+ GFP_ATOMIC);
+ cedf[i].heap_node = kmalloc(
+ cluster_size * sizeof(struct bheap_node),
+ GFP_ATOMIC);
+ bheap_init(&(cedf[i].cpu_heap));
+ edf_domain_init(&(cedf[i].domain), NULL, cedf_release_jobs);
+
+ if(!zalloc_cpumask_var(&cedf[i].cpu_map, GFP_ATOMIC))
+ return -ENOMEM;
+#ifdef CONFIG_RELEASE_MASTER
+ cedf[i].domain.release_master = atomic_read(&release_master_cpu);
+#endif
+ }
+
+ /* cycle through clusters and add cpus to them */
+ for (i = 0; i < num_clusters; i++) {
+
+ for_each_online_cpu(cpu) {
+ /* check if the cpu is already in a cluster */
+ for (j = 0; j < num_clusters; j++)
+ if (cpumask_test_cpu(cpu, cedf[j].cpu_map))
+ break;
+ /* if it is in a cluster go to next cpu */
+ if (j < num_clusters &&
+ cpumask_test_cpu(cpu, cedf[j].cpu_map))
+ continue;
+
+ /* this cpu isn't in any cluster */
+ /* get the shared cpus */
+ if (unlikely(cluster_config == GLOBAL_CLUSTER))
+ cpumask_copy(mask, cpu_online_mask);
+ else
+ get_shared_cpu_map(mask, cpu, cluster_config);
+
+ cpumask_copy(cedf[i].cpu_map, mask);
+#ifdef VERBOSE_INIT
+ print_cluster_topology(mask, cpu);
+#endif
+ /* add cpus to current cluster and init cpu_entry_t */
+ cpu_count = 0;
+ for_each_cpu(ccpu, cedf[i].cpu_map) {
+
+ entry = &per_cpu(cedf_cpu_entries, ccpu);
+ cedf[i].cpus[cpu_count] = entry;
+ atomic_set(&entry->will_schedule, 0);
+ entry->cpu = ccpu;
+ entry->cluster = &cedf[i];
+ entry->hn = &(cedf[i].heap_node[cpu_count]);
+ bheap_node_init(&entry->hn, entry);
+
+ cpu_count++;
+
+ entry->linked = NULL;
+ entry->scheduled = NULL;
+#ifdef CONFIG_RELEASE_MASTER
+ /* only add CPUs that should schedule jobs */
+ if (entry->cpu != entry->cluster->domain.release_master)
+#endif
+ update_cpu_position(entry);
+ }
+ /* done with this cluster */
+ break;
+ }
+ }
+
+ free_cpumask_var(mask);
+ clusters_allocated = 1;
+ return 0;
+}
+
+/* Plugin object */
+static struct sched_plugin cedf_plugin __cacheline_aligned_in_smp = {
+ .plugin_name = "C-EDF",
+ .finish_switch = cedf_finish_switch,
+ .tick = cedf_tick,
+ .task_new = cedf_task_new,
+ .complete_job = complete_job,
+ .task_exit = cedf_task_exit,
+ .schedule = cedf_schedule,
+ .task_wake_up = cedf_task_wake_up,
+ .task_block = cedf_task_block,
+ .admit_task = cedf_admit_task,
+ .activate_plugin = cedf_activate_plugin,
+};
+
+static struct proc_dir_entry *cluster_file = NULL, *cedf_dir = NULL;
+
+static int __init init_cedf(void)
+{
+ int err, fs;
+
+ err = register_sched_plugin(&cedf_plugin);
+ if (!err) {
+ fs = make_plugin_proc_dir(&cedf_plugin, &cedf_dir);
+ if (!fs)
+ cluster_file = create_cluster_file(cedf_dir, &cluster_config);
+ else
+ printk(KERN_ERR "Could not allocate C-EDF procfs dir.\n");
+ }
+ return err;
+}
+
+static void clean_cedf(void)
+{
+ cleanup_cedf();
+ if (cluster_file)
+ remove_proc_entry("cluster", cedf_dir);
+ if (cedf_dir)
+ remove_plugin_proc_dir(&cedf_plugin);
+}
+
+module_init(init_cedf);
+module_exit(clean_cedf);
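Since init_cedf() publishes a 'cluster' file in the plugin's /proc directory, the cluster level can be changed from user space before the plugin is activated. Below is a small user-space sketch; the path is an assumption pieced together from the "/proc/litmus/plugins" message in litmus_proc.c, the plugin name "C-EDF", and the file name "cluster", and is not spelled out verbatim in this patch.

#include <stdio.h>

/* assumed path: <plugins dir>/<plugin_name>/cluster */
#define CEDF_CLUSTER_FILE "/proc/litmus/plugins/C-EDF/cluster"

int main(void)
{
        FILE *f = fopen(CEDF_CLUSTER_FILE, "w");

        if (!f) {
                perror("fopen " CEDF_CLUSTER_FILE);
                return 1;
        }
        /* accepted values: ALL, L1, L2, L3 (see cache_level_names[]) */
        fputs("L3\n", f);
        fclose(f);
        return 0;
}
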
diff --git a/litmus/sched_gsn_edf.c b/litmus/sched_gsn_edf.c
new file mode 100644
index 0000000..b8548b8
--- /dev/null
+++ b/litmus/sched_gsn_edf.c
@@ -0,0 +1,1022 @@
+/*
+ * litmus/sched_gsn_edf.c
+ *
+ * Implementation of the GSN-EDF scheduling algorithm.
+ *
+ * This version uses the simple approach and serializes all scheduling
+ * decisions by the use of a queue lock. This is probably not the
+ * best way to do it, but it should suffice for now.
+ */
+
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+#include <litmus/preempt.h>
+#include <litmus/budget.h>
+
+#include <litmus/bheap.h>
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+#include <litmus/affinity.h>
+#endif
+
+#include <linux/module.h>
+
+/* Overview of GSN-EDF operations.
+ *
+ * For a detailed explanation of GSN-EDF have a look at the FMLP paper. This
+ * description only covers how the individual operations are implemented in
+ * LITMUS.
+ *
+ * link_task_to_cpu(T, cpu) - Low-level operation to update the linkage
+ * structure (NOT the actually scheduled
+ * task). If there is another linked task To
+ * already it will set To->linked_on = NO_CPU
+ * (thereby removing its association with this
+ * CPU). However, it will not requeue the
+ * previously linked task (if any). It will set
+ * T's state to not completed and check whether
+ * it is already running somewhere else. If T
+ * is scheduled somewhere else it will link
+ * it to that CPU instead (and pull the linked
+ * task to cpu). T may be NULL.
+ *
+ * unlink(T) - Unlink removes T from all scheduler data
+ * structures. If it is linked to some CPU it
+ * will link NULL to that CPU. If it is
+ * currently queued in the gsnedf queue it will
+ * be removed from the rt_domain. It is safe to
+ * call unlink(T) if T is not linked. T may not
+ * be NULL.
+ *
+ * requeue(T) - Requeue will insert T into the appropriate
+ * queue. If the system is in real-time mode and
+ * T is released already, it will go into the
+ * ready queue. If the system is not in
+ * real-time mode, T will go into the
+ * release queue. If T's release time is in the
+ * future, it will go into the release
+ * queue. That means that T's release time/job
+ * no/etc. has to be updated before requeue(T) is
+ * called. It is not safe to call requeue(T)
+ * when T is already queued. T may not be NULL.
+ *
+ * gsnedf_job_arrival(T) - This is the catch all function when T enters
+ * the system after either a suspension or at a
+ * job release. It will queue T (which means it
+ * is not safe to call gsnedf_job_arrival(T) if
+ * T is already queued) and then check whether a
+ * preemption is necessary. If a preemption is
+ * necessary it will update the linkage
+ * accordingly and cause scheduled to be called
+ * (either with an IPI or need_resched). It is
+ * safe to call gsnedf_job_arrival(T) if T's
+ * next job has not been actually released yet
+ * (release time in the future). T will be put
+ * on the release queue in that case.
+ *
+ * job_completion(T) - Take care of everything that needs to be done
+ * to prepare T for its next release and place
+ * it in the right queue with
+ * gsnedf_job_arrival().
+ *
+ *
+ * When we know that T is linked to a CPU, then link_task_to_cpu(NULL, CPU) is
+ * equivalent to unlink(T). Note that if you unlink a task from a CPU, none of
+ * these functions will automatically propagate pending tasks from the ready
+ * queue to the CPU as a new linked task. This is the job of the calling
+ * function (by means of __take_ready()).
+ */
+
+
+/* cpu_entry_t - maintain the linked and scheduled state
+ */
+typedef struct {
+ int cpu;
+ struct task_struct* linked; /* only RT tasks */
+ struct task_struct* scheduled; /* only RT tasks */
+ struct bheap_node* hn;
+} cpu_entry_t;
+DEFINE_PER_CPU(cpu_entry_t, gsnedf_cpu_entries);
+
+cpu_entry_t* gsnedf_cpus[NR_CPUS];
+
+/* the cpus queue themselves according to priority in here */
+static struct bheap_node gsnedf_heap_node[NR_CPUS];
+static struct bheap gsnedf_cpu_heap;
+
+static rt_domain_t gsnedf;
+#define gsnedf_lock (gsnedf.ready_lock)
+
+
+/* Uncomment this if you want to see all scheduling decisions in the
+ * TRACE() log.
+#define WANT_ALL_SCHED_EVENTS
+ */
+
+static int cpu_lower_prio(struct bheap_node *_a, struct bheap_node *_b)
+{
+ cpu_entry_t *a, *b;
+ a = _a->value;
+ b = _b->value;
+ /* Note that a and b are inverted: we want the lowest-priority CPU at
+ * the top of the heap.
+ */
+ return edf_higher_prio(b->linked, a->linked);
+}
+
+/* update_cpu_position - Move the cpu entry to the correct place to maintain
+ * order in the cpu queue. Caller must hold gsnedf lock.
+ */
+static void update_cpu_position(cpu_entry_t *entry)
+{
+ if (likely(bheap_node_in_heap(entry->hn)))
+ bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
+ bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap, entry->hn);
+}
+
+/* caller must hold gsnedf lock */
+static cpu_entry_t* lowest_prio_cpu(void)
+{
+ struct bheap_node* hn;
+ hn = bheap_peek(cpu_lower_prio, &gsnedf_cpu_heap);
+ return hn->value;
+}
+
+
+/* link_task_to_cpu - Update the link of a CPU.
+ * Handles the case where the to-be-linked task is already
+ * scheduled on a different CPU.
+ */
+static noinline void link_task_to_cpu(struct task_struct* linked,
+ cpu_entry_t *entry)
+{
+ cpu_entry_t *sched;
+ struct task_struct* tmp;
+ int on_cpu;
+
+ BUG_ON(linked && !is_realtime(linked));
+
+ /* Currently linked task is set to be unlinked. */
+ if (entry->linked) {
+ entry->linked->rt_param.linked_on = NO_CPU;
+ }
+
+ /* Link new task to CPU. */
+ if (linked) {
+ tsk_rt(linked)->completed = 0;
+ /* handle the case that the task is already scheduled somewhere! */
+ on_cpu = linked->rt_param.scheduled_on;
+ if (on_cpu != NO_CPU) {
+ sched = &per_cpu(gsnedf_cpu_entries, on_cpu);
+ /* this should only happen if not linked already */
+ BUG_ON(sched->linked == linked);
+
+ /* If we are already scheduled on the CPU to which we
+ * wanted to link, we don't need to do the swap --
+ * we just link ourselves to the CPU and depend on
+ * the caller to get things right.
+ */
+ if (entry != sched) {
+ TRACE_TASK(linked,
+ "already scheduled on %d, updating link.\n",
+ sched->cpu);
+ tmp = sched->linked;
+ linked->rt_param.linked_on = sched->cpu;
+ sched->linked = linked;
+ update_cpu_position(sched);
+ linked = tmp;
+ }
+ }
+ if (linked) /* might be NULL due to swap */
+ linked->rt_param.linked_on = entry->cpu;
+ }
+ entry->linked = linked;
+#ifdef WANT_ALL_SCHED_EVENTS
+ if (linked)
+ TRACE_TASK(linked, "linked to %d.\n", entry->cpu);
+ else
+ TRACE("NULL linked to %d.\n", entry->cpu);
+#endif
+ update_cpu_position(entry);
+}
+
+/* unlink - Make sure a task is not linked any longer to an entry
+ * where it was linked before. Must hold gsnedf_lock.
+ */
+static noinline void unlink(struct task_struct* t)
+{
+ cpu_entry_t *entry;
+
+ if (t->rt_param.linked_on != NO_CPU) {
+ /* unlink */
+ entry = &per_cpu(gsnedf_cpu_entries, t->rt_param.linked_on);
+ t->rt_param.linked_on = NO_CPU;
+ link_task_to_cpu(NULL, entry);
+ } else if (is_queued(t)) {
+ /* This is an interesting situation: t is scheduled,
+ * but was just recently unlinked. It cannot be
+ * linked anywhere else (because then it would have
+ * been relinked to this CPU), thus it must be in some
+ * queue. We must remove it from the list in this
+ * case.
+ */
+ remove(&gsnedf, t);
+ }
+}
+
+
+/* preempt - force a CPU to reschedule
+ */
+static void preempt(cpu_entry_t *entry)
+{
+ preempt_if_preemptable(entry->scheduled, entry->cpu);
+}
+
+/* requeue - Put an unlinked task into gsn-edf domain.
+ * Caller must hold gsnedf_lock.
+ */
+static noinline void requeue(struct task_struct* task)
+{
+ BUG_ON(!task);
+ /* sanity check before insertion */
+ BUG_ON(is_queued(task));
+
+ if (is_released(task, litmus_clock()))
+ __add_ready(&gsnedf, task);
+ else {
+ /* it has got to wait */
+ add_release(&gsnedf, task);
+ }
+}
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+static cpu_entry_t* gsnedf_get_nearest_available_cpu(cpu_entry_t *start)
+{
+ cpu_entry_t *affinity;
+
+ get_nearest_available_cpu(affinity, start, gsnedf_cpu_entries,
+#ifdef CONFIG_RELEASE_MASTER
+ gsnedf.release_master
+#else
+ NO_CPU
+#endif
+ );
+
+ return(affinity);
+}
+#endif
+
+/* check for any necessary preemptions */
+static void check_for_preemptions(void)
+{
+ struct task_struct *task;
+ cpu_entry_t *last;
+
+ for (last = lowest_prio_cpu();
+ edf_preemption_needed(&gsnedf, last->linked);
+ last = lowest_prio_cpu()) {
+ /* preemption necessary */
+ task = __take_ready(&gsnedf);
+ TRACE("check_for_preemptions: attempting to link task %d to %d\n",
+ task->pid, last->cpu);
+
+#ifdef CONFIG_SCHED_CPU_AFFINITY
+ {
+ cpu_entry_t *affinity =
+ gsnedf_get_nearest_available_cpu(
+ &per_cpu(gsnedf_cpu_entries, task_cpu(task)));
+ if (affinity)
+ last = affinity;
+ else if (requeue_preempted_job(last->linked))
+ requeue(last->linked);
+ }
+#else
+ if (requeue_preempted_job(last->linked))
+ requeue(last->linked);
+#endif
+
+ link_task_to_cpu(task, last);
+ preempt(last);
+ }
+}
+
+/* gsnedf_job_arrival: task is either resumed or released */
+static noinline void gsnedf_job_arrival(struct task_struct* task)
+{
+ BUG_ON(!task);
+
+ requeue(task);
+ check_for_preemptions();
+}
+
+static void gsnedf_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&gsnedf_lock, flags);
+
+ __merge_ready(rt, tasks);
+ check_for_preemptions();
+
+ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+/* caller holds gsnedf_lock */
+static noinline void job_completion(struct task_struct *t, int forced)
+{
+ BUG_ON(!t);
+
+ sched_trace_task_completion(t, forced);
+
+ TRACE_TASK(t, "job_completion().\n");
+
+ /* set flags */
+ tsk_rt(t)->completed = 1;
+ /* prepare for next period */
+ prepare_for_next_period(t);
+ if (is_released(t, litmus_clock()))
+ sched_trace_task_release(t);
+ /* unlink */
+ unlink(t);
+ /* requeue
+ * But don't requeue a blocking task. */
+ if (is_running(t))
+ gsnedf_job_arrival(t);
+}
+
+/* gsnedf_tick - this function is called for every local timer
+ * interrupt.
+ *
+ * Checks whether the current task's budget has been exhausted and, if so,
+ * triggers a preemption (or requests one if the task is non-preemptable).
+ */
+static void gsnedf_tick(struct task_struct* t)
+{
+ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
+ if (!is_np(t)) {
+ /* np tasks will be preempted when they become
+ * preemptable again
+ */
+ litmus_reschedule_local();
+ TRACE("gsnedf_scheduler_tick: "
+ "%d is preemptable "
+ " => FORCE_RESCHED\n", t->pid);
+ } else if (is_user_np(t)) {
+ TRACE("gsnedf_scheduler_tick: "
+ "%d is non-preemptable, "
+ "preemption delayed.\n", t->pid);
+ request_exit_np(t);
+ }
+ }
+}
+
+/* Getting schedule() right is a bit tricky. schedule() may not make any
+ * assumptions on the state of the current task since it may be called for a
+ * number of reasons. The reasons include: a scheduler_tick() determined that
+ * a reschedule was necessary, sys_exit_np() was called, some Linux subsystem
+ * determined so, or even (in the worst case) there is a bug hidden somewhere.
+ * Thus, we must take extreme care to determine what the
+ * current state is.
+ *
+ * The CPU could currently be scheduling a task (or not), be linked (or not).
+ *
+ * The following assertions for the scheduled task could hold:
+ *
+ * - !is_running(scheduled) // the job blocks
+ * - scheduled->timeslice == 0 // the job completed (forcefully)
+ * - is_completed() // the job completed (by syscall)
+ * - linked != scheduled // we need to reschedule (for any reason)
+ * - is_np(scheduled) // rescheduling must be delayed,
+ * sys_exit_np must be requested
+ *
+ * Any of these can occur together.
+ */
+static struct task_struct* gsnedf_schedule(struct task_struct * prev)
+{
+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+ int out_of_time, sleep, preempt, np, exists, blocks;
+ struct task_struct* next = NULL;
+
+#ifdef CONFIG_RELEASE_MASTER
+ /* Bail out early if we are the release master.
+ * The release master never schedules any real-time tasks.
+ */
+ if (unlikely(gsnedf.release_master == entry->cpu)) {
+ sched_state_task_picked();
+ return NULL;
+ }
+#endif
+
+ raw_spin_lock(&gsnedf_lock);
+
+ /* sanity checking */
+ BUG_ON(entry->scheduled && entry->scheduled != prev);
+ BUG_ON(entry->scheduled && !is_realtime(prev));
+ BUG_ON(is_realtime(prev) && !entry->scheduled);
+
+ /* (0) Determine state */
+ exists = entry->scheduled != NULL;
+ blocks = exists && !is_running(entry->scheduled);
+ out_of_time = exists && budget_enforced(entry->scheduled)
+ && budget_exhausted(entry->scheduled);
+ np = exists && is_np(entry->scheduled);
+ sleep = exists && is_completed(entry->scheduled);
+ preempt = entry->scheduled != entry->linked;
+
+#ifdef WANT_ALL_SCHED_EVENTS
+ TRACE_TASK(prev, "invoked gsnedf_schedule.\n");
+#endif
+
+ if (exists)
+ TRACE_TASK(prev,
+ "blocks:%d out_of_time:%d np:%d sleep:%d preempt:%d "
+ "state:%d sig:%d\n",
+ blocks, out_of_time, np, sleep, preempt,
+ prev->state, signal_pending(prev));
+ if (entry->linked && preempt)
+ TRACE_TASK(prev, "will be preempted by %s/%d\n",
+ entry->linked->comm, entry->linked->pid);
+
+
+ /* If a task blocks we have no choice but to reschedule.
+ */
+ if (blocks)
+ unlink(entry->scheduled);
+
+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
+ * We need to make sure to update the link structure anyway in case
+ * that we are still linked. Multiple calls to request_exit_np() don't
+ * hurt.
+ */
+ if (np && (out_of_time || preempt || sleep)) {
+ unlink(entry->scheduled);
+ request_exit_np(entry->scheduled);
+ }
+
+ /* Any task that is preemptable and either exhausts its execution
+ * budget or wants to sleep completes. We may have to reschedule after
+ * this. Don't do a job completion if we block (can't have timers running
+ * for blocked jobs).
+ */
+ if (!np && (out_of_time || sleep) && !blocks)
+ job_completion(entry->scheduled, !sleep);
+
+ /* Link pending task if we became unlinked.
+ */
+ if (!entry->linked)
+ link_task_to_cpu(__take_ready(&gsnedf), entry);
+
+ /* The final scheduling decision. Do we need to switch for some reason?
+ * If linked is different from scheduled, then select linked as next.
+ */
+ if ((!np || blocks) &&
+ entry->linked != entry->scheduled) {
+ /* Schedule a linked job? */
+ if (entry->linked) {
+ entry->linked->rt_param.scheduled_on = entry->cpu;
+ next = entry->linked;
+ TRACE_TASK(next, "scheduled_on = P%d\n", smp_processor_id());
+ }
+ if (entry->scheduled) {
+ /* not gonna be scheduled soon */
+ entry->scheduled->rt_param.scheduled_on = NO_CPU;
+ TRACE_TASK(entry->scheduled, "scheduled_on = NO_CPU\n");
+ }
+ } else
+ /* Only override Linux scheduler if we have a real-time task
+ * scheduled that needs to continue.
+ */
+ if (exists)
+ next = prev;
+
+ sched_state_task_picked();
+
+ raw_spin_unlock(&gsnedf_lock);
+
+#ifdef WANT_ALL_SCHED_EVENTS
+ TRACE("gsnedf_lock released, next=0x%p\n", next);
+
+ if (next)
+ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+ else if (exists && !next)
+ TRACE("becomes idle at %llu.\n", litmus_clock());
+#endif
+
+
+ return next;
+}
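For illustration only (this is not part of the patch), the state flags computed at the top of gsnedf_schedule() can be condensed into a small standalone C sketch. The struct, its field names, and the demo values below are invented for the example; they merely mirror the exists/blocks/out_of_time/np/sleep/preempt conditions and the order in which the function acts on them.

/* Standalone sketch of the gsnedf_schedule() state flags; all names here
 * are illustrative stand-ins, not the kernel's actual data structures. */
#include <stdio.h>
#include <stdbool.h>

struct demo_cpu_state {
	bool has_scheduled;    /* entry->scheduled != NULL */
	bool running;          /* is_running(scheduled) */
	bool budget_exhausted; /* budget_enforced() && budget_exhausted() */
	bool non_preemptive;   /* is_np(scheduled) */
	bool completed;        /* is_completed(scheduled) */
	bool linked_differs;   /* entry->linked != entry->scheduled */
};

static void decide(const struct demo_cpu_state *s)
{
	bool exists  = s->has_scheduled;
	bool blocks  = exists && !s->running;
	bool oot     = exists && s->budget_exhausted;
	bool np      = exists && s->non_preemptive;
	bool sleep   = exists && s->completed;
	bool preempt = s->linked_differs;

	/* Mirrors the ordering of actions in gsnedf_schedule();
	 * a real invocation would recompute the link after each step. */
	if (blocks)
		printf("unlink: job blocked\n");
	if (np && (oot || preempt || sleep))
		printf("request sys_exit_np(): preemption delayed\n");
	if (!np && (oot || sleep) && !blocks)
		printf("job completion (%s)\n", sleep ? "by syscall" : "forced");
	if ((!np || blocks) && preempt)
		printf("switch to linked task\n");
	else if (exists)
		printf("keep running previous task\n");
}

int main(void)
{
	/* Hypothetical scenario: budget exhausted, preemptable, linked differs. */
	struct demo_cpu_state s = {
		.has_scheduled = true, .running = true, .budget_exhausted = true,
		.non_preemptive = false, .completed = false, .linked_differs = true,
	};
	decide(&s);
	return 0;
}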
+
+
+/* _finish_switch - we just finished the switch away from prev
+ */
+static void gsnedf_finish_switch(struct task_struct *prev)
+{
+ cpu_entry_t* entry = &__get_cpu_var(gsnedf_cpu_entries);
+
+ entry->scheduled = is_realtime(current) ? current : NULL;
+#ifdef WANT_ALL_SCHED_EVENTS
+ TRACE_TASK(prev, "switched away from\n");
+#endif
+}
+
+
+/* Prepare a task for running in RT mode
+ */
+static void gsnedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+ unsigned long flags;
+ cpu_entry_t* entry;
+
+ TRACE("gsn edf: task new %d\n", t->pid);
+
+ raw_spin_lock_irqsave(&gsnedf_lock, flags);
+
+ /* setup job params */
+ release_at(t, litmus_clock());
+
+ if (running) {
+ entry = &per_cpu(gsnedf_cpu_entries, task_cpu(t));
+ BUG_ON(entry->scheduled);
+
+#ifdef CONFIG_RELEASE_MASTER
+ if (entry->cpu != gsnedf.release_master) {
+#endif
+ entry->scheduled = t;
+ tsk_rt(t)->scheduled_on = task_cpu(t);
+#ifdef CONFIG_RELEASE_MASTER
+ } else {
+ /* do not schedule on release master */
+ preempt(entry); /* force resched */
+ tsk_rt(t)->scheduled_on = NO_CPU;
+ }
+#endif
+ } else {
+ t->rt_param.scheduled_on = NO_CPU;
+ }
+ t->rt_param.linked_on = NO_CPU;
+
+ gsnedf_job_arrival(t);
+ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+static void gsnedf_task_wake_up(struct task_struct *task)
+{
+ unsigned long flags;
+ lt_t now;
+
+ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+
+ raw_spin_lock_irqsave(&gsnedf_lock, flags);
+ now = litmus_clock();
+ if (is_tardy(task, now)) {
+ /* new sporadic release */
+ release_at(task, now);
+ sched_trace_task_release(task);
+ }
+ else {
+ if (task->rt.time_slice) {
+ /* came back in time before deadline
+ */
+ tsk_rt(task)->completed = 0;
+ }
+ }
+ gsnedf_job_arrival(task);
+ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+}
+
+static void gsnedf_task_block(struct task_struct *t)
+{
+ unsigned long flags;
+
+ TRACE_TASK(t, "block at %llu\n", litmus_clock());
+
+ /* unlink if necessary */
+ raw_spin_lock_irqsave(&gsnedf_lock, flags);
+ unlink(t);
+ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+
+ BUG_ON(!is_realtime(t));
+}
+
+
+static void gsnedf_task_exit(struct task_struct * t)
+{
+ unsigned long flags;
+
+ /* unlink if necessary */
+ raw_spin_lock_irqsave(&gsnedf_lock, flags);
+ unlink(t);
+ if (tsk_rt(t)->scheduled_on != NO_CPU) {
+ gsnedf_cpus[tsk_rt(t)->scheduled_on]->scheduled = NULL;
+ tsk_rt(t)->scheduled_on = NO_CPU;
+ }
+ raw_spin_unlock_irqrestore(&gsnedf_lock, flags);
+
+ BUG_ON(!is_realtime(t));
+ TRACE_TASK(t, "RIP\n");
+}
+
+
+static long gsnedf_admit_task(struct task_struct* tsk)
+{
+ return 0;
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/fdso.h>
+
+/* called with IRQs off */
+static void set_priority_inheritance(struct task_struct* t, struct task_struct* prio_inh)
+{
+ int linked_on;
+ int check_preempt = 0;
+
+ raw_spin_lock(&gsnedf_lock);
+
+ TRACE_TASK(t, "inherits priority from %s/%d\n", prio_inh->comm, prio_inh->pid);
+ tsk_rt(t)->inh_task = prio_inh;
+
+ linked_on = tsk_rt(t)->linked_on;
+
+ /* If it is scheduled, then we need to reorder the CPU heap. */
+ if (linked_on != NO_CPU) {
+ TRACE_TASK(t, "%s: linked on %d\n",
+ __FUNCTION__, linked_on);
+ /* Holder is scheduled; need to re-order CPUs.
+ * We can't use heap_decrease() here since
+ * the cpu_heap is ordered in reverse direction, so
+ * it is actually an increase. */
+ bheap_delete(cpu_lower_prio, &gsnedf_cpu_heap,
+ gsnedf_cpus[linked_on]->hn);
+ bheap_insert(cpu_lower_prio, &gsnedf_cpu_heap,
+ gsnedf_cpus[linked_on]->hn);
+ } else {
+ /* holder may be queued: first stop queue changes */
+ raw_spin_lock(&gsnedf.release_lock);
+ if (is_queued(t)) {
+ TRACE_TASK(t, "%s: is queued\n",
+ __FUNCTION__);
+ /* We need to update the position of holder in some
+			 * heap. Note that this could be a release heap if
+			 * budget enforcement is used and this job overran. */
+ check_preempt =
+ !bheap_decrease(edf_ready_order,
+ tsk_rt(t)->heap_node);
+ } else {
+ /* Nothing to do: if it is not queued and not linked
+ * then it is either sleeping or currently being moved
+ * by other code (e.g., a timer interrupt handler) that
+ * will use the correct priority when enqueuing the
+ * task. */
+ TRACE_TASK(t, "%s: is NOT queued => Done.\n",
+ __FUNCTION__);
+ }
+ raw_spin_unlock(&gsnedf.release_lock);
+
+ /* If holder was enqueued in a release heap, then the following
+ * preemption check is pointless, but we can't easily detect
+ * that case. If you want to fix this, then consider that
+ * simply adding a state flag requires O(n) time to update when
+ * releasing n tasks, which conflicts with the goal to have
+ * O(log n) merges. */
+ if (check_preempt) {
+ /* heap_decrease() hit the top level of the heap: make
+ * sure preemption checks get the right task, not the
+ * potentially stale cache. */
+ bheap_uncache_min(edf_ready_order,
+ &gsnedf.ready_queue);
+ check_for_preemptions();
+ }
+ }
+
+ raw_spin_unlock(&gsnedf_lock);
+}
+
+/* called with IRQs off */
+static void clear_priority_inheritance(struct task_struct* t)
+{
+ raw_spin_lock(&gsnedf_lock);
+
+ /* A job only stops inheriting a priority when it releases a
+ * resource. Thus we can make the following assumption.*/
+ BUG_ON(tsk_rt(t)->scheduled_on == NO_CPU);
+
+ TRACE_TASK(t, "priority restored\n");
+ tsk_rt(t)->inh_task = NULL;
+
+ /* Check if rescheduling is necessary. We can't use heap_decrease()
+ * since the priority was effectively lowered. */
+ unlink(t);
+ gsnedf_job_arrival(t);
+
+ raw_spin_unlock(&gsnedf_lock);
+}
+
+
+/* ******************** FMLP support ********************** */
+
+/* struct for semaphore with priority inheritance */
+struct fmlp_semaphore {
+ struct litmus_lock litmus_lock;
+
+ /* current resource holder */
+ struct task_struct *owner;
+
+ /* highest-priority waiter */
+ struct task_struct *hp_waiter;
+
+ /* FIFO queue of waiting tasks */
+ wait_queue_head_t wait;
+};
+
+static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
+{
+ return container_of(lock, struct fmlp_semaphore, litmus_lock);
+}
+
+/* caller is responsible for locking */
+struct task_struct* find_hp_waiter(struct fmlp_semaphore *sem,
+ struct task_struct* skip)
+{
+ struct list_head *pos;
+ struct task_struct *queued, *found = NULL;
+
+ list_for_each(pos, &sem->wait.task_list) {
+ queued = (struct task_struct*) list_entry(pos, wait_queue_t,
+ task_list)->private;
+
+ /* Compare task prios, find high prio task. */
+ if (queued != skip && edf_higher_prio(queued, found))
+ found = queued;
+ }
+ return found;
+}
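As a standalone sketch (not part of the patch), the scan performed by find_hp_waiter() amounts to a linear pass over the FIFO waiters with an EDF comparison; the array-based queue and the deadline values below are hypothetical stand-ins for the kernel's wait-queue entries.

/* Userspace sketch of the FIFO-queue scan in find_hp_waiter(); the
 * waiter array and the deadline values are made-up example data. */
#include <stdio.h>
#include <stddef.h>

struct waiter { int pid; unsigned long long deadline; };

/* EDF: an earlier absolute deadline means higher priority. */
static int edf_higher_prio(const struct waiter *a, const struct waiter *b)
{
	return a && (!b || a->deadline < b->deadline);
}

static const struct waiter *
find_hp_waiter(const struct waiter *queue, size_t n, const struct waiter *skip)
{
	const struct waiter *found = NULL;
	size_t i;

	for (i = 0; i < n; i++)
		if (&queue[i] != skip && edf_higher_prio(&queue[i], found))
			found = &queue[i];
	return found;
}

int main(void)
{
	struct waiter q[] = { {101, 40}, {102, 25}, {103, 60} };
	const struct waiter *hp = find_hp_waiter(q, 3, NULL);

	printf("highest-priority waiter: pid %d\n", hp->pid);
	return 0;
}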
+
+int gsnedf_fmlp_lock(struct litmus_lock* l)
+{
+ struct task_struct* t = current;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ wait_queue_t wait;
+ unsigned long flags;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (sem->owner) {
+ /* resource is not free => must suspend and wait */
+
+ init_waitqueue_entry(&wait, t);
+
+ /* FIXME: interruptible would be nice some day */
+ set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+ __add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+ /* check if we need to activate priority inheritance */
+ if (edf_higher_prio(t, sem->hp_waiter)) {
+ sem->hp_waiter = t;
+ if (edf_higher_prio(t, sem->owner))
+ set_priority_inheritance(sem->owner, sem->hp_waiter);
+ }
+
+ TS_LOCK_SUSPEND;
+
+ /* release lock before sleeping */
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ /* We depend on the FIFO order. Thus, we don't need to recheck
+ * when we wake up; we are guaranteed to have the lock since
+ * there is only one wake up per release.
+ */
+
+ schedule();
+
+ TS_LOCK_RESUME;
+
+ /* Since we hold the lock, no other task will change
+ * ->owner. We can thus check it without acquiring the spin
+ * lock. */
+ BUG_ON(sem->owner != t);
+ } else {
+ /* it's ours now */
+ sem->owner = t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ }
+
+ return 0;
+}
+
+int gsnedf_fmlp_unlock(struct litmus_lock* l)
+{
+ struct task_struct *t = current, *next;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ unsigned long flags;
+ int err = 0;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (sem->owner != t) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* check if there are jobs waiting for this resource */
+ next = __waitqueue_remove_first(&sem->wait);
+ if (next) {
+		/* next becomes the resource holder */
+ sem->owner = next;
+ TRACE_CUR("lock ownership passed to %s/%d\n", next->comm, next->pid);
+
+ /* determine new hp_waiter if necessary */
+ if (next == sem->hp_waiter) {
+ TRACE_TASK(next, "was highest-prio waiter\n");
+ /* next has the highest priority --- it doesn't need to
+ * inherit. However, we need to make sure that the
+ * next-highest priority in the queue is reflected in
+ * hp_waiter. */
+ sem->hp_waiter = find_hp_waiter(sem, next);
+ if (sem->hp_waiter)
+ TRACE_TASK(sem->hp_waiter, "is new highest-prio waiter\n");
+ else
+ TRACE("no further waiters\n");
+ } else {
+ /* Well, if next is not the highest-priority waiter,
+ * then it ought to inherit the highest-priority
+ * waiter's priority. */
+ set_priority_inheritance(next, sem->hp_waiter);
+ }
+
+ /* wake up next */
+ wake_up_process(next);
+ } else
+ /* becomes available */
+ sem->owner = NULL;
+
+ /* we lose the benefit of priority inheritance (if any) */
+ if (tsk_rt(t)->inh_task)
+ clear_priority_inheritance(t);
+
+out:
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ return err;
+}
+
+int gsnedf_fmlp_close(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ unsigned long flags;
+
+ int owner;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ owner = sem->owner == t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ if (owner)
+ gsnedf_fmlp_unlock(l);
+
+ return 0;
+}
+
+void gsnedf_fmlp_free(struct litmus_lock* lock)
+{
+ kfree(fmlp_from_lock(lock));
+}
+
+static struct litmus_lock_ops gsnedf_fmlp_lock_ops = {
+ .close = gsnedf_fmlp_close,
+ .lock = gsnedf_fmlp_lock,
+ .unlock = gsnedf_fmlp_unlock,
+ .deallocate = gsnedf_fmlp_free,
+};
+
+static struct litmus_lock* gsnedf_new_fmlp(void)
+{
+ struct fmlp_semaphore* sem;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ sem->owner = NULL;
+ sem->hp_waiter = NULL;
+ init_waitqueue_head(&sem->wait);
+ sem->litmus_lock.ops = &gsnedf_fmlp_lock_ops;
+
+ return &sem->litmus_lock;
+}
+
+/* **** lock constructor **** */
+
+
+static long gsnedf_allocate_lock(struct litmus_lock **lock, int type,
+ void* __user unused)
+{
+ int err = -ENXIO;
+
+ /* GSN-EDF currently only supports the FMLP for global resources. */
+ switch (type) {
+
+ case FMLP_SEM:
+ /* Flexible Multiprocessor Locking Protocol */
+ *lock = gsnedf_new_fmlp();
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+
+ };
+
+ return err;
+}
+
+#endif
+
+
+static long gsnedf_activate_plugin(void)
+{
+ int cpu;
+ cpu_entry_t *entry;
+
+ bheap_init(&gsnedf_cpu_heap);
+#ifdef CONFIG_RELEASE_MASTER
+ gsnedf.release_master = atomic_read(&release_master_cpu);
+#endif
+
+ for_each_online_cpu(cpu) {
+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
+ bheap_node_init(&entry->hn, entry);
+ entry->linked = NULL;
+ entry->scheduled = NULL;
+#ifdef CONFIG_RELEASE_MASTER
+ if (cpu != gsnedf.release_master) {
+#endif
+ TRACE("GSN-EDF: Initializing CPU #%d.\n", cpu);
+ update_cpu_position(entry);
+#ifdef CONFIG_RELEASE_MASTER
+ } else {
+ TRACE("GSN-EDF: CPU %d is release master.\n", cpu);
+ }
+#endif
+ }
+ return 0;
+}
+
+/* Plugin object */
+static struct sched_plugin gsn_edf_plugin __cacheline_aligned_in_smp = {
+ .plugin_name = "GSN-EDF",
+ .finish_switch = gsnedf_finish_switch,
+ .tick = gsnedf_tick,
+ .task_new = gsnedf_task_new,
+ .complete_job = complete_job,
+ .task_exit = gsnedf_task_exit,
+ .schedule = gsnedf_schedule,
+ .task_wake_up = gsnedf_task_wake_up,
+ .task_block = gsnedf_task_block,
+ .admit_task = gsnedf_admit_task,
+ .activate_plugin = gsnedf_activate_plugin,
+#ifdef CONFIG_LITMUS_LOCKING
+ .allocate_lock = gsnedf_allocate_lock,
+#endif
+};
+
+
+static int __init init_gsn_edf(void)
+{
+ int cpu;
+ cpu_entry_t *entry;
+
+ bheap_init(&gsnedf_cpu_heap);
+ /* initialize CPU state */
+ for (cpu = 0; cpu < NR_CPUS; cpu++) {
+ entry = &per_cpu(gsnedf_cpu_entries, cpu);
+ gsnedf_cpus[cpu] = entry;
+ entry->cpu = cpu;
+ entry->hn = &gsnedf_heap_node[cpu];
+ bheap_node_init(&entry->hn, entry);
+ }
+ edf_domain_init(&gsnedf, NULL, gsnedf_release_jobs);
+ return register_sched_plugin(&gsn_edf_plugin);
+}
+
+
+module_init(init_gsn_edf);
diff --git a/litmus/sched_litmus.c b/litmus/sched_litmus.c
new file mode 100644
index 0000000..6b32cf0
--- /dev/null
+++ b/litmus/sched_litmus.c
@@ -0,0 +1,330 @@
+/* This file is included from kernel/sched.c */
+
+#include <litmus/litmus.h>
+#include <litmus/budget.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+
+static void update_time_litmus(struct rq *rq, struct task_struct *p)
+{
+ u64 delta = rq->clock - p->se.exec_start;
+ if (unlikely((s64)delta < 0))
+ delta = 0;
+ /* per job counter */
+ p->rt_param.job_params.exec_time += delta;
+ /* task counter */
+ p->se.sum_exec_runtime += delta;
+ /* sched_clock() */
+ p->se.exec_start = rq->clock;
+ cpuacct_charge(p, delta);
+}
+
+static void double_rq_lock(struct rq *rq1, struct rq *rq2);
+static void double_rq_unlock(struct rq *rq1, struct rq *rq2);
+
+/*
+ * litmus_tick gets called by scheduler_tick() with HZ freq
+ * Interrupts are disabled
+ */
+static void litmus_tick(struct rq *rq, struct task_struct *p)
+{
+ TS_PLUGIN_TICK_START;
+
+ if (is_realtime(p))
+ update_time_litmus(rq, p);
+
+ /* plugin tick */
+ litmus->tick(p);
+
+ TS_PLUGIN_TICK_END;
+
+ return;
+}
+
+static struct task_struct *
+litmus_schedule(struct rq *rq, struct task_struct *prev)
+{
+ struct rq* other_rq;
+ struct task_struct *next;
+
+ long was_running;
+ lt_t _maybe_deadlock = 0;
+
+ /* let the plugin schedule */
+ next = litmus->schedule(prev);
+
+ sched_state_plugin_check();
+
+ /* check if a global plugin pulled a task from a different RQ */
+ if (next && task_rq(next) != rq) {
+ /* we need to migrate the task */
+ other_rq = task_rq(next);
+ TRACE_TASK(next, "migrate from %d\n", other_rq->cpu);
+
+ /* while we drop the lock, the prev task could change its
+ * state
+ */
+ was_running = is_running(prev);
+ mb();
+ raw_spin_unlock(&rq->lock);
+
+ /* Don't race with a concurrent switch. This could deadlock in
+ * the case of cross or circular migrations. It's the job of
+ * the plugin to make sure that doesn't happen.
+ */
+ TRACE_TASK(next, "stack_in_use=%d\n",
+ next->rt_param.stack_in_use);
+ if (next->rt_param.stack_in_use != NO_CPU) {
+ TRACE_TASK(next, "waiting to deschedule\n");
+ _maybe_deadlock = litmus_clock();
+ }
+ while (next->rt_param.stack_in_use != NO_CPU) {
+ cpu_relax();
+ mb();
+ if (next->rt_param.stack_in_use == NO_CPU)
+ TRACE_TASK(next,"descheduled. Proceeding.\n");
+
+ if (lt_before(_maybe_deadlock + 10000000,
+ litmus_clock())) {
+ /* We've been spinning for 10ms.
+ * Something can't be right!
+ * Let's abandon the task and bail out; at least
+ * we will have debug info instead of a hard
+ * deadlock.
+ */
+ TRACE_TASK(next,"stack too long in use. "
+ "Deadlock?\n");
+ next = NULL;
+
+ /* bail out */
+ raw_spin_lock(&rq->lock);
+ return next;
+ }
+ }
+#ifdef __ARCH_WANT_UNLOCKED_CTXSW
+ if (next->on_cpu)
+		TRACE_TASK(next, "waiting for !oncpu\n");
+ while (next->on_cpu) {
+ cpu_relax();
+ mb();
+ }
+#endif
+ double_rq_lock(rq, other_rq);
+ mb();
+ if (is_realtime(prev) && is_running(prev) != was_running) {
+ TRACE_TASK(prev,
+ "state changed while we dropped"
+ " the lock: is_running=%d, was_running=%d\n",
+ is_running(prev), was_running);
+ if (is_running(prev) && !was_running) {
+ /* prev task became unblocked
+ * we need to simulate normal sequence of events
+ * to scheduler plugins.
+ */
+ litmus->task_block(prev);
+ litmus->task_wake_up(prev);
+ }
+ }
+
+ set_task_cpu(next, smp_processor_id());
+
+ /* DEBUG: now that we have the lock we need to make sure a
+ * couple of things still hold:
+ * - it is still a real-time task
+ * - it is still runnable (could have been stopped)
+ * If either is violated, then the active plugin is
+ * doing something wrong.
+ */
+ if (!is_realtime(next) || !is_running(next)) {
+ /* BAD BAD BAD */
+ TRACE_TASK(next,"BAD: migration invariant FAILED: "
+ "rt=%d running=%d\n",
+ is_realtime(next),
+ is_running(next));
+ /* drop the task */
+ next = NULL;
+ }
+ /* release the other CPU's runqueue, but keep ours */
+ raw_spin_unlock(&other_rq->lock);
+ }
+ if (next) {
+ next->rt_param.stack_in_use = rq->cpu;
+ next->se.exec_start = rq->clock;
+ }
+
+ update_enforcement_timer(next);
+ return next;
+}
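The bounded spin on stack_in_use above can be illustrated with a small userspace sketch (not part of the patch). The pthread standing in for the remote CPU, the atomic flag, and the 1 ms delay are all assumptions made for the example; only the 10 ms bail-out threshold comes from the code above.

/* Sketch of the bounded spin in litmus_schedule(): wait for the previous
 * owner of the stack to deschedule, but give up after ~10 ms instead of
 * deadlocking. The names and the thread setup are illustrative only. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <unistd.h>

static atomic_int stack_in_use = 1;	/* pretend a remote CPU still owns the stack */

static long long now_ns(void)
{
	struct timespec ts;
	clock_gettime(CLOCK_MONOTONIC, &ts);
	return (long long)ts.tv_sec * 1000000000LL + ts.tv_nsec;
}

static void *remote_cpu(void *arg)
{
	(void)arg;
	usleep(1000);			/* the previous owner deschedules after ~1 ms */
	atomic_store(&stack_in_use, 0);
	return NULL;
}

int main(void)
{
	pthread_t tid;
	long long start = now_ns();

	pthread_create(&tid, NULL, remote_cpu, NULL);
	while (atomic_load(&stack_in_use)) {
		if (now_ns() - start > 10000000LL) {	/* 10 ms, as in the patch */
			puts("stack too long in use -- bailing out (possible deadlock)");
			return 1;
		}
	}
	puts("descheduled, proceeding with the migration");
	pthread_join(tid, NULL);
	return 0;
}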
+
+static void enqueue_task_litmus(struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ if (flags & ENQUEUE_WAKEUP) {
+ sched_trace_task_resume(p);
+ tsk_rt(p)->present = 1;
+ /* LITMUS^RT plugins need to update the state
+ * _before_ making it available in global structures.
+ * Linux gets away with being lazy about the task state
+ * update. We can't do that, hence we update the task
+ * state already here.
+ *
+ * WARNING: this needs to be re-evaluated when porting
+ * to newer kernel versions.
+ */
+ p->state = TASK_RUNNING;
+ litmus->task_wake_up(p);
+
+ rq->litmus.nr_running++;
+ } else
+ TRACE_TASK(p, "ignoring an enqueue, not a wake up.\n");
+}
+
+static void dequeue_task_litmus(struct rq *rq, struct task_struct *p,
+ int flags)
+{
+ if (flags & DEQUEUE_SLEEP) {
+ litmus->task_block(p);
+ tsk_rt(p)->present = 0;
+ sched_trace_task_block(p);
+
+ rq->litmus.nr_running--;
+ } else
+ TRACE_TASK(p, "ignoring a dequeue, not going to sleep.\n");
+}
+
+static void yield_task_litmus(struct rq *rq)
+{
+ TS_SYSCALL_IN_START;
+ TS_SYSCALL_IN_END;
+
+ BUG_ON(rq->curr != current);
+ /* sched_yield() is called to trigger delayed preemptions.
+ * Thus, mark the current task as needing to be rescheduled.
+ * This will cause the scheduler plugin to be invoked, which can
+ * then determine if a preemption is still required.
+ */
+ clear_exit_np(current);
+ litmus_reschedule_local();
+
+ TS_SYSCALL_OUT_START;
+}
+
+/* Plugins are responsible for this.
+ */
+static void check_preempt_curr_litmus(struct rq *rq, struct task_struct *p, int flags)
+{
+}
+
+static void put_prev_task_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void pre_schedule_litmus(struct rq *rq, struct task_struct *prev)
+{
+ update_time_litmus(rq, prev);
+ if (!is_running(prev))
+ tsk_rt(prev)->present = 0;
+}
+
+/* pick_next_task_litmus() - litmus_schedule() function
+ *
+ * return the next task to be scheduled
+ */
+static struct task_struct *pick_next_task_litmus(struct rq *rq)
+{
+ /* get the to-be-switched-out task (prev) */
+ struct task_struct *prev = rq->litmus.prev;
+ struct task_struct *next;
+
+ /* if not called from schedule() but from somewhere
+ * else (e.g., migration), return now!
+ */
+ if(!rq->litmus.prev)
+ return NULL;
+
+ rq->litmus.prev = NULL;
+
+ TS_PLUGIN_SCHED_START;
+ next = litmus_schedule(rq, prev);
+ TS_PLUGIN_SCHED_END;
+
+ return next;
+}
+
+static void task_tick_litmus(struct rq *rq, struct task_struct *p, int queued)
+{
+ /* nothing to do; tick related tasks are done by litmus_tick() */
+ return;
+}
+
+static void switched_to_litmus(struct rq *rq, struct task_struct *p)
+{
+}
+
+static void prio_changed_litmus(struct rq *rq, struct task_struct *p,
+ int oldprio)
+{
+}
+
+unsigned int get_rr_interval_litmus(struct rq *rq, struct task_struct *p)
+{
+ /* return infinity */
+ return 0;
+}
+
+/* This is called when a task becomes a real-time task, either due to a SCHED_*
+ * class transition or due to PI mutex inheritance. We don't handle Linux PI
+ * mutex inheritance yet (and probably never will). Use the LITMUS^RT-provided
+ * synchronization primitives instead.
+ */
+static void set_curr_task_litmus(struct rq *rq)
+{
+ rq->curr->se.exec_start = rq->clock;
+}
+
+
+#ifdef CONFIG_SMP
+/* execve tries to rebalance the task in this scheduling domain.
+ * We don't care about the scheduling domain; this can get called from
+ * exec, fork, and wakeup.
+ */
+static int
+select_task_rq_litmus(struct task_struct *p, int sd_flag, int flags)
+{
+ /* preemption is already disabled.
+ * We don't want to change cpu here
+ */
+ return task_cpu(p);
+}
+#endif
+
+static const struct sched_class litmus_sched_class = {
+	/* Since commit 34f971f6 the stop/migrate worker threads have a class
+	 * of their own, which is the highest-priority class. We don't support
+	 * CPU hotplug or CPU throttling, which allows LITMUS^RT to use up to
+	 * 1.0 of the CPU capacity.
+ */
+ .next = &stop_sched_class,
+ .enqueue_task = enqueue_task_litmus,
+ .dequeue_task = dequeue_task_litmus,
+ .yield_task = yield_task_litmus,
+
+ .check_preempt_curr = check_preempt_curr_litmus,
+
+ .pick_next_task = pick_next_task_litmus,
+ .put_prev_task = put_prev_task_litmus,
+
+#ifdef CONFIG_SMP
+ .select_task_rq = select_task_rq_litmus,
+
+ .pre_schedule = pre_schedule_litmus,
+#endif
+
+ .set_curr_task = set_curr_task_litmus,
+ .task_tick = task_tick_litmus,
+
+ .get_rr_interval = get_rr_interval_litmus,
+
+ .prio_changed = prio_changed_litmus,
+ .switched_to = switched_to_litmus,
+};
diff --git a/litmus/sched_pfair.c b/litmus/sched_pfair.c
new file mode 100644
index 0000000..6a89b00
--- /dev/null
+++ b/litmus/sched_pfair.c
@@ -0,0 +1,1074 @@
+/*
+ * kernel/sched_pfair.c
+ *
+ * Implementation of the PD^2 pfair scheduling algorithm. This
+ * implementation realizes "early releasing," i.e., it is work-conserving.
+ *
+ */
+
+#include <asm/div64.h>
+#include <linux/delay.h>
+#include <linux/module.h>
+#include <linux/spinlock.h>
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/slab.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/rt_domain.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+
+#include <litmus/bheap.h>
+
+/* to configure the cluster size */
+#include <litmus/litmus_proc.h>
+
+#include <litmus/clustered.h>
+
+static enum cache_level pfair_cluster_level = GLOBAL_CLUSTER;
+
+struct subtask {
+ /* measured in quanta relative to job release */
+ quanta_t release;
+ quanta_t deadline;
+ quanta_t overlap; /* called "b bit" by PD^2 */
+ quanta_t group_deadline;
+};
+
+struct pfair_param {
+ quanta_t quanta; /* number of subtasks */
+ quanta_t cur; /* index of current subtask */
+
+ quanta_t release; /* in quanta */
+ quanta_t period; /* in quanta */
+
+ quanta_t last_quantum; /* when scheduled last */
+ int last_cpu; /* where scheduled last */
+
+ struct pfair_cluster* cluster; /* where this task is scheduled */
+
+ struct subtask subtasks[0]; /* allocate together with pfair_param */
+};
+
+#define tsk_pfair(tsk) ((tsk)->rt_param.pfair)
+
+struct pfair_state {
+ struct cluster_cpu topology;
+
+ volatile quanta_t cur_tick; /* updated by the CPU that is advancing
+ * the time */
+ volatile quanta_t local_tick; /* What tick is the local CPU currently
+ * executing? Updated only by the local
+ * CPU. In QEMU, this may lag behind the
+ * current tick. In a real system, with
+ * proper timers and aligned quanta,
+ * that should only be the case for a
+ * very short time after the time
+ * advanced. With staggered quanta, it
+ * will lag for the duration of the
+ * offset.
+ */
+
+ struct task_struct* linked; /* the task that should be executing */
+ struct task_struct* local; /* the local copy of linked */
+ struct task_struct* scheduled; /* what is actually scheduled */
+
+ lt_t offset; /* stagger offset */
+ unsigned int missed_updates;
+ unsigned int missed_quanta;
+};
+
+struct pfair_cluster {
+ struct scheduling_cluster topology;
+
+ /* The "global" time in this cluster. */
+ quanta_t pfair_time; /* the "official" PFAIR clock */
+
+ /* The ready queue for this cluster. */
+ rt_domain_t pfair;
+
+ /* The set of jobs that should have their release enacted at the next
+ * quantum boundary.
+ */
+ struct bheap release_queue;
+ raw_spinlock_t release_lock;
+};
+
+#define RT_F_REQUEUE 0x2
+
+static inline struct pfair_cluster* cpu_cluster(struct pfair_state* state)
+{
+ return container_of(state->topology.cluster, struct pfair_cluster, topology);
+}
+
+static inline int cpu_id(struct pfair_state* state)
+{
+ return state->topology.id;
+}
+
+static inline struct pfair_state* from_cluster_list(struct list_head* pos)
+{
+ return list_entry(pos, struct pfair_state, topology.cluster_list);
+}
+
+static inline struct pfair_cluster* from_domain(rt_domain_t* rt)
+{
+ return container_of(rt, struct pfair_cluster, pfair);
+}
+
+static inline raw_spinlock_t* cluster_lock(struct pfair_cluster* cluster)
+{
+ /* The ready_lock is used to serialize all scheduling events. */
+ return &cluster->pfair.ready_lock;
+}
+
+static inline raw_spinlock_t* cpu_lock(struct pfair_state* state)
+{
+ return cluster_lock(cpu_cluster(state));
+}
+
+DEFINE_PER_CPU(struct pfair_state, pfair_state);
+struct pfair_state **pstate; /* shortcut */
+
+static struct pfair_cluster* pfair_clusters;
+static int num_pfair_clusters;
+
+/* Enable for lots of trace info.
+ * #define PFAIR_DEBUG
+ */
+
+#ifdef PFAIR_DEBUG
+#define PTRACE_TASK(t, f, args...) TRACE_TASK(t, f, ## args)
+#define PTRACE(f, args...) TRACE(f, ## args)
+#else
+#define PTRACE_TASK(t, f, args...)
+#define PTRACE(f, args...)
+#endif
+
+/* gcc will inline all of these accessor functions... */
+static struct subtask* cur_subtask(struct task_struct* t)
+{
+ return tsk_pfair(t)->subtasks + tsk_pfair(t)->cur;
+}
+
+static quanta_t cur_deadline(struct task_struct* t)
+{
+ return cur_subtask(t)->deadline + tsk_pfair(t)->release;
+}
+
+static quanta_t cur_release(struct task_struct* t)
+{
+ /* This is early releasing: only the release of the first subtask
+ * counts. */
+ return tsk_pfair(t)->release;
+}
+
+static quanta_t cur_overlap(struct task_struct* t)
+{
+ return cur_subtask(t)->overlap;
+}
+
+static quanta_t cur_group_deadline(struct task_struct* t)
+{
+ quanta_t gdl = cur_subtask(t)->group_deadline;
+ if (gdl)
+ return gdl + tsk_pfair(t)->release;
+ else
+ return gdl;
+}
+
+
+static int pfair_higher_prio(struct task_struct* first,
+ struct task_struct* second)
+{
+ return /* first task must exist */
+ first && (
+ /* Does the second task exist and is it a real-time task? If
+ * not, the first task (which is a RT task) has higher
+ * priority.
+ */
+ !second || !is_realtime(second) ||
+
+ /* Is the (subtask) deadline of the first task earlier?
+ * Then it has higher priority.
+ */
+ time_before(cur_deadline(first), cur_deadline(second)) ||
+
+ /* Do we have a deadline tie?
+ * Then break by B-bit.
+ */
+ (cur_deadline(first) == cur_deadline(second) &&
+ (cur_overlap(first) > cur_overlap(second) ||
+
+ /* Do we have a B-bit tie?
+ * Then break by group deadline.
+ */
+ (cur_overlap(first) == cur_overlap(second) &&
+ (time_after(cur_group_deadline(first),
+ cur_group_deadline(second)) ||
+
+ /* Do we have a group deadline tie?
+ * Then break by PID, which are unique.
+ */
+ (cur_group_deadline(first) ==
+ cur_group_deadline(second) &&
+ first->pid < second->pid))))));
+}
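For reference, here is a self-contained sketch (not part of the patch) of the PD^2 tie-break cascade implemented by pfair_higher_prio(): the earlier subtask deadline wins, then a set b-bit, then the later group deadline, and finally the lower PID. The struct and the sample values are invented for the example.

/* Illustration of the PD^2 priority comparison; the fields below mirror
 * cur_deadline(), cur_overlap(), cur_group_deadline() and the PID, but the
 * struct and the demo values are hypothetical. */
#include <stdio.h>

struct demo_subtask {
	int pid;
	unsigned long deadline;        /* subtask deadline (quanta) */
	unsigned long b_bit;           /* overlap ("b") bit */
	unsigned long group_deadline;  /* group deadline (quanta) */
};

static int pd2_higher_prio(const struct demo_subtask *a,
			   const struct demo_subtask *b)
{
	if (a->deadline != b->deadline)
		return a->deadline < b->deadline;             /* earlier deadline wins */
	if (a->b_bit != b->b_bit)
		return a->b_bit > b->b_bit;                   /* set b-bit wins */
	if (a->group_deadline != b->group_deadline)
		return a->group_deadline > b->group_deadline; /* later group deadline wins */
	return a->pid < b->pid;                               /* PID as final tie-break */
}

int main(void)
{
	struct demo_subtask x = { 100, 10, 1, 14 };
	struct demo_subtask y = { 200, 10, 1, 12 };

	printf("x has %s priority than y\n",
	       pd2_higher_prio(&x, &y) ? "higher" : "lower");
	return 0;
}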
+
+int pfair_ready_order(struct bheap_node* a, struct bheap_node* b)
+{
+ return pfair_higher_prio(bheap2task(a), bheap2task(b));
+}
+
+static void pfair_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+ struct pfair_cluster* cluster = from_domain(rt);
+ unsigned long flags;
+
+ raw_spin_lock_irqsave(&cluster->release_lock, flags);
+
+ bheap_union(pfair_ready_order, &cluster->release_queue, tasks);
+
+ raw_spin_unlock_irqrestore(&cluster->release_lock, flags);
+}
+
+static void prepare_release(struct task_struct* t, quanta_t at)
+{
+ tsk_pfair(t)->release = at;
+ tsk_pfair(t)->cur = 0;
+}
+
+/* pull released tasks from the release queue */
+static void poll_releases(struct pfair_cluster* cluster)
+{
+ raw_spin_lock(&cluster->release_lock);
+ __merge_ready(&cluster->pfair, &cluster->release_queue);
+ raw_spin_unlock(&cluster->release_lock);
+}
+
+static void check_preempt(struct task_struct* t)
+{
+ int cpu = NO_CPU;
+ if (tsk_rt(t)->linked_on != tsk_rt(t)->scheduled_on &&
+ is_present(t)) {
+ /* the task can be scheduled and
+ * is not scheduled where it ought to be scheduled
+ */
+ cpu = tsk_rt(t)->linked_on != NO_CPU ?
+ tsk_rt(t)->linked_on :
+ tsk_rt(t)->scheduled_on;
+ PTRACE_TASK(t, "linked_on:%d, scheduled_on:%d\n",
+ tsk_rt(t)->linked_on, tsk_rt(t)->scheduled_on);
+ /* preempt */
+ litmus_reschedule(cpu);
+ }
+}
+
+/* caller must hold pfair.ready_lock */
+static void drop_all_references(struct task_struct *t)
+{
+ int cpu;
+ struct pfair_state* s;
+ struct pfair_cluster* cluster;
+ if (bheap_node_in_heap(tsk_rt(t)->heap_node)) {
+ /* It must be in the ready queue; drop references isn't called
+ * when the job is in a release queue. */
+ cluster = tsk_pfair(t)->cluster;
+ bheap_delete(pfair_ready_order, &cluster->pfair.ready_queue,
+ tsk_rt(t)->heap_node);
+ }
+ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+ s = &per_cpu(pfair_state, cpu);
+ if (s->linked == t)
+ s->linked = NULL;
+ if (s->local == t)
+ s->local = NULL;
+ if (s->scheduled == t)
+ s->scheduled = NULL;
+ }
+ /* make sure we don't have a stale linked_on field */
+ tsk_rt(t)->linked_on = NO_CPU;
+}
+
+static void pfair_prepare_next_period(struct task_struct* t)
+{
+ struct pfair_param* p = tsk_pfair(t);
+
+ prepare_for_next_period(t);
+ tsk_rt(t)->completed = 0;
+ p->release += p->period;
+}
+
+/* returns 1 if the task needs to go the release queue */
+static int advance_subtask(quanta_t time, struct task_struct* t, int cpu)
+{
+ struct pfair_param* p = tsk_pfair(t);
+ int to_relq;
+ p->cur = (p->cur + 1) % p->quanta;
+ if (!p->cur) {
+ if (is_present(t)) {
+ /* The job overran; we start a new budget allocation. */
+ pfair_prepare_next_period(t);
+ } else {
+ /* remove task from system until it wakes */
+ drop_all_references(t);
+ tsk_rt(t)->flags = RT_F_REQUEUE;
+ TRACE_TASK(t, "on %d advanced to subtask %lu (not present)\n",
+ cpu, p->cur);
+ return 0;
+ }
+ }
+ to_relq = time_after(cur_release(t), time);
+ TRACE_TASK(t, "on %d advanced to subtask %lu -> to_relq=%d (cur_release:%lu time:%lu)\n",
+ cpu, p->cur, to_relq, cur_release(t), time);
+ return to_relq;
+}
+
+static void advance_subtasks(struct pfair_cluster *cluster, quanta_t time)
+{
+ struct task_struct* l;
+ struct pfair_param* p;
+ struct list_head* pos;
+ struct pfair_state* cpu;
+
+ list_for_each(pos, &cluster->topology.cpus) {
+ cpu = from_cluster_list(pos);
+ l = cpu->linked;
+ cpu->missed_updates += cpu->linked != cpu->local;
+ if (l) {
+ p = tsk_pfair(l);
+ p->last_quantum = time;
+ p->last_cpu = cpu_id(cpu);
+ if (advance_subtask(time, l, cpu_id(cpu))) {
+ //cpu->linked = NULL;
+ PTRACE_TASK(l, "should go to release queue. "
+ "scheduled_on=%d present=%d\n",
+ tsk_rt(l)->scheduled_on,
+ tsk_rt(l)->present);
+ }
+ }
+ }
+}
+
+static int target_cpu(quanta_t time, struct task_struct* t, int default_cpu)
+{
+ int cpu;
+ if (tsk_rt(t)->scheduled_on != NO_CPU) {
+ /* always observe scheduled_on linkage */
+ default_cpu = tsk_rt(t)->scheduled_on;
+ } else if (tsk_pfair(t)->last_quantum == time - 1) {
+ /* back2back quanta */
+ /* Only observe last_quantum if no scheduled_on is in the way.
+ * This should only kick in if a CPU missed quanta, and that
+ * *should* only happen in QEMU.
+ */
+ cpu = tsk_pfair(t)->last_cpu;
+ if (!pstate[cpu]->linked ||
+ tsk_rt(pstate[cpu]->linked)->scheduled_on != cpu) {
+ default_cpu = cpu;
+ }
+ }
+ return default_cpu;
+}
+
+/* returns one if linking was redirected */
+static int pfair_link(quanta_t time, int cpu,
+ struct task_struct* t)
+{
+ int target = target_cpu(time, t, cpu);
+ struct task_struct* prev = pstate[cpu]->linked;
+ struct task_struct* other;
+ struct pfair_cluster* cluster = cpu_cluster(pstate[cpu]);
+
+ if (target != cpu) {
+ BUG_ON(pstate[target]->topology.cluster != pstate[cpu]->topology.cluster);
+ other = pstate[target]->linked;
+ pstate[target]->linked = t;
+ tsk_rt(t)->linked_on = target;
+ if (!other)
+ /* linked ok, but reschedule this CPU */
+ return 1;
+ if (target < cpu) {
+ /* link other to cpu instead */
+ tsk_rt(other)->linked_on = cpu;
+ pstate[cpu]->linked = other;
+ if (prev) {
+ /* prev got pushed back into the ready queue */
+ tsk_rt(prev)->linked_on = NO_CPU;
+ __add_ready(&cluster->pfair, prev);
+ }
+ /* we are done with this cpu */
+ return 0;
+ } else {
+			/* re-add other; its original CPU was not considered yet */
+ tsk_rt(other)->linked_on = NO_CPU;
+ __add_ready(&cluster->pfair, other);
+ /* reschedule this CPU */
+ return 1;
+ }
+ } else {
+ pstate[cpu]->linked = t;
+ tsk_rt(t)->linked_on = cpu;
+ if (prev) {
+ /* prev got pushed back into the ready queue */
+ tsk_rt(prev)->linked_on = NO_CPU;
+ __add_ready(&cluster->pfair, prev);
+ }
+ /* we are done with this CPU */
+ return 0;
+ }
+}
+
+static void schedule_subtasks(struct pfair_cluster *cluster, quanta_t time)
+{
+ int retry;
+ struct list_head *pos;
+ struct pfair_state *cpu_state;
+
+ list_for_each(pos, &cluster->topology.cpus) {
+ cpu_state = from_cluster_list(pos);
+ retry = 1;
+#ifdef CONFIG_RELEASE_MASTER
+ /* skip release master */
+ if (cluster->pfair.release_master == cpu_id(cpu_state))
+ continue;
+#endif
+ while (retry) {
+ if (pfair_higher_prio(__peek_ready(&cluster->pfair),
+ cpu_state->linked))
+ retry = pfair_link(time, cpu_id(cpu_state),
+ __take_ready(&cluster->pfair));
+ else
+ retry = 0;
+ }
+ }
+}
+
+static void schedule_next_quantum(struct pfair_cluster *cluster, quanta_t time)
+{
+ struct pfair_state *cpu;
+ struct list_head* pos;
+
+ /* called with interrupts disabled */
+ PTRACE("--- Q %lu at %llu PRE-SPIN\n",
+ time, litmus_clock());
+ raw_spin_lock(cluster_lock(cluster));
+ PTRACE("<<< Q %lu at %llu\n",
+ time, litmus_clock());
+
+ sched_trace_quantum_boundary();
+
+ advance_subtasks(cluster, time);
+ poll_releases(cluster);
+ schedule_subtasks(cluster, time);
+
+ list_for_each(pos, &cluster->topology.cpus) {
+ cpu = from_cluster_list(pos);
+ if (cpu->linked)
+ PTRACE_TASK(cpu->linked,
+ " linked on %d.\n", cpu_id(cpu));
+ else
+ PTRACE("(null) linked on %d.\n", cpu_id(cpu));
+ }
+ /* We are done. Advance time. */
+ mb();
+ list_for_each(pos, &cluster->topology.cpus) {
+ cpu = from_cluster_list(pos);
+ if (cpu->local_tick != cpu->cur_tick) {
+ TRACE("BAD Quantum not acked on %d "
+ "(l:%lu c:%lu p:%lu)\n",
+ cpu_id(cpu),
+ cpu->local_tick,
+ cpu->cur_tick,
+ cluster->pfair_time);
+ cpu->missed_quanta++;
+ }
+ cpu->cur_tick = time;
+ }
+ PTRACE(">>> Q %lu at %llu\n",
+ time, litmus_clock());
+ raw_spin_unlock(cluster_lock(cluster));
+}
+
+static noinline void wait_for_quantum(quanta_t q, struct pfair_state* state)
+{
+ quanta_t loc;
+
+ goto first; /* skip mb() on first iteration */
+ do {
+ cpu_relax();
+ mb();
+ first: loc = state->cur_tick;
+ /* FIXME: what if loc > cur? */
+ } while (time_before(loc, q));
+ PTRACE("observed cur_tick:%lu >= q:%lu\n",
+ loc, q);
+}
+
+static quanta_t current_quantum(struct pfair_state* state)
+{
+ lt_t t = litmus_clock() - state->offset;
+ return time2quanta(t, FLOOR);
+}
+
+static void catchup_quanta(quanta_t from, quanta_t target,
+ struct pfair_state* state)
+{
+ quanta_t cur = from, time;
+ TRACE("+++< BAD catching up quanta from %lu to %lu\n",
+ from, target);
+ while (time_before(cur, target)) {
+ wait_for_quantum(cur, state);
+ cur++;
+ time = cmpxchg(&cpu_cluster(state)->pfair_time,
+ cur - 1, /* expected */
+ cur /* next */
+ );
+ if (time == cur - 1)
+ schedule_next_quantum(cpu_cluster(state), cur);
+ }
+ TRACE("+++> catching up done\n");
+}
+
+/* pfair_tick - this function is called for every local timer
+ * interrupt.
+ */
+static void pfair_tick(struct task_struct* t)
+{
+ struct pfair_state* state = &__get_cpu_var(pfair_state);
+ quanta_t time, cur;
+ int retry = 10;
+
+ do {
+ cur = current_quantum(state);
+ PTRACE("q %lu at %llu\n", cur, litmus_clock());
+
+ /* Attempt to advance time. First CPU to get here
+ * will prepare the next quantum.
+ */
+ time = cmpxchg(&cpu_cluster(state)->pfair_time,
+ cur - 1, /* expected */
+ cur /* next */
+ );
+ if (time == cur - 1) {
+ /* exchange succeeded */
+ wait_for_quantum(cur - 1, state);
+ schedule_next_quantum(cpu_cluster(state), cur);
+ retry = 0;
+ } else if (time_before(time, cur - 1)) {
+ /* the whole system missed a tick !? */
+ catchup_quanta(time, cur, state);
+ retry--;
+ } else if (time_after(time, cur)) {
+ /* our timer lagging behind!? */
+ TRACE("BAD pfair_time:%lu > cur:%lu\n", time, cur);
+ retry--;
+ } else {
+ /* Some other CPU already started scheduling
+ * this quantum. Let it do its job and then update.
+ */
+ retry = 0;
+ }
+ } while (retry);
+
+ /* Spin locally until time advances. */
+ wait_for_quantum(cur, state);
+
+ /* copy assignment */
+ /* FIXME: what if we race with a future update? Corrupted state? */
+ state->local = state->linked;
+ /* signal that we are done */
+ mb();
+ state->local_tick = state->cur_tick;
+
+ if (state->local != current
+ && (is_realtime(current) || is_present(state->local)))
+ litmus_reschedule_local();
+}
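The time-advancing protocol in pfair_tick() hinges on a single compare-and-swap: whichever CPU first bumps pfair_time from cur - 1 to cur schedules the next quantum, and every other CPU merely observes the new value. The following userspace sketch (not part of the patch, with made-up quantum numbers) illustrates that race using C11 atomics.

/* Sketch of the quantum-advance CAS; the shared counter and the two
 * simulated "CPUs" below are illustrative only. */
#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long pfair_time = 41;	/* hypothetical current quantum */

static void tick(int cpu, unsigned long cur)
{
	unsigned long expected = cur - 1;

	if (atomic_compare_exchange_strong(&pfair_time, &expected, cur))
		printf("CPU %d won the exchange: schedules quantum %lu\n", cpu, cur);
	else
		printf("CPU %d lost (pfair_time already %lu): just follows\n",
		       cpu, expected);
}

int main(void)
{
	tick(0, 42);	/* first CPU to arrive advances time */
	tick(1, 42);	/* second CPU sees time already advanced */
	return 0;
}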
+
+static int safe_to_schedule(struct task_struct* t, int cpu)
+{
+ int where = tsk_rt(t)->scheduled_on;
+ if (where != NO_CPU && where != cpu) {
+ TRACE_TASK(t, "BAD: can't be scheduled on %d, "
+ "scheduled already on %d.\n", cpu, where);
+ return 0;
+ } else
+ return is_present(t) && !is_completed(t);
+}
+
+static struct task_struct* pfair_schedule(struct task_struct * prev)
+{
+ struct pfair_state* state = &__get_cpu_var(pfair_state);
+ struct pfair_cluster* cluster = cpu_cluster(state);
+ int blocks, completion, out_of_time;
+ struct task_struct* next = NULL;
+
+#ifdef CONFIG_RELEASE_MASTER
+ /* Bail out early if we are the release master.
+ * The release master never schedules any real-time tasks.
+ */
+ if (unlikely(cluster->pfair.release_master == cpu_id(state))) {
+ sched_state_task_picked();
+ return NULL;
+ }
+#endif
+
+ raw_spin_lock(cpu_lock(state));
+
+ blocks = is_realtime(prev) && !is_running(prev);
+ completion = is_realtime(prev) && is_completed(prev);
+ out_of_time = is_realtime(prev) && time_after(cur_release(prev),
+ state->local_tick);
+
+ if (is_realtime(prev))
+ PTRACE_TASK(prev, "blocks:%d completion:%d out_of_time:%d\n",
+ blocks, completion, out_of_time);
+
+ if (completion) {
+ sched_trace_task_completion(prev, 0);
+ pfair_prepare_next_period(prev);
+ prepare_release(prev, cur_release(prev));
+ }
+
+ if (!blocks && (completion || out_of_time)) {
+ drop_all_references(prev);
+ sched_trace_task_release(prev);
+ add_release(&cluster->pfair, prev);
+ }
+
+ if (state->local && safe_to_schedule(state->local, cpu_id(state)))
+ next = state->local;
+
+ if (prev != next) {
+ tsk_rt(prev)->scheduled_on = NO_CPU;
+ if (next)
+ tsk_rt(next)->scheduled_on = cpu_id(state);
+ }
+ sched_state_task_picked();
+ raw_spin_unlock(cpu_lock(state));
+
+ if (next)
+ TRACE_TASK(next, "scheduled rel=%lu at %lu (%llu)\n",
+ tsk_pfair(next)->release, cpu_cluster(state)->pfair_time, litmus_clock());
+ else if (is_realtime(prev))
+ TRACE("Becomes idle at %lu (%llu)\n", cpu_cluster(state)->pfair_time, litmus_clock());
+
+ return next;
+}
+
+static void pfair_task_new(struct task_struct * t, int on_rq, int running)
+{
+ unsigned long flags;
+ struct pfair_cluster* cluster;
+
+ TRACE("pfair: task new %d state:%d\n", t->pid, t->state);
+
+ cluster = tsk_pfair(t)->cluster;
+
+ raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+
+ prepare_release(t, cluster->pfair_time + 1);
+
+ t->rt_param.scheduled_on = NO_CPU;
+
+ if (running) {
+#ifdef CONFIG_RELEASE_MASTER
+ if (task_cpu(t) != cluster->pfair.release_master)
+#endif
+ t->rt_param.scheduled_on = task_cpu(t);
+ __add_ready(&cluster->pfair, t);
+ }
+
+ check_preempt(t);
+
+ raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+}
+
+static void pfair_task_wake_up(struct task_struct *t)
+{
+ unsigned long flags;
+ lt_t now;
+ int requeue = 0;
+ struct pfair_cluster* cluster;
+
+ cluster = tsk_pfair(t)->cluster;
+
+ TRACE_TASK(t, "wakes at %llu, release=%lu, pfair_time:%lu\n",
+ litmus_clock(), cur_release(t), cluster->pfair_time);
+
+ raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+
+ /* If a task blocks and wakes before its next job release,
+ * then it may resume if it is currently linked somewhere
+ * (as if it never blocked at all). Otherwise, we have a
+ * new sporadic job release.
+ */
+ requeue = tsk_rt(t)->flags == RT_F_REQUEUE;
+ now = litmus_clock();
+ if (lt_before(get_deadline(t), now)) {
+ TRACE_TASK(t, "sporadic release!\n");
+ release_at(t, now);
+ prepare_release(t, time2quanta(now, CEIL));
+ sched_trace_task_release(t);
+ }
+
+ /* only add to ready queue if the task isn't still linked somewhere */
+ if (requeue) {
+ TRACE_TASK(t, "requeueing required\n");
+ tsk_rt(t)->completed = 0;
+ __add_ready(&cluster->pfair, t);
+ }
+
+ check_preempt(t);
+
+ raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+ TRACE_TASK(t, "wake up done at %llu\n", litmus_clock());
+}
+
+static void pfair_task_block(struct task_struct *t)
+{
+ BUG_ON(!is_realtime(t));
+ TRACE_TASK(t, "blocks at %llu, state:%d\n",
+ litmus_clock(), t->state);
+}
+
+static void pfair_task_exit(struct task_struct * t)
+{
+ unsigned long flags;
+ struct pfair_cluster *cluster;
+
+ BUG_ON(!is_realtime(t));
+
+ cluster = tsk_pfair(t)->cluster;
+
+	/* Remove the task from the release or ready queue, and ensure
+	 * that it is not the scheduled task for ANY CPU. We
+	 * do this blanket check because occasionally, when
+ * tasks exit while blocked, the task_cpu of the task
+ * might not be the same as the CPU that the PFAIR scheduler
+ * has chosen for it.
+ */
+ raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+
+ TRACE_TASK(t, "RIP, state:%d\n", t->state);
+ drop_all_references(t);
+
+ raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+
+ kfree(t->rt_param.pfair);
+ t->rt_param.pfair = NULL;
+}
+
+
+static void pfair_release_at(struct task_struct* task, lt_t start)
+{
+ unsigned long flags;
+ quanta_t release;
+
+ struct pfair_cluster *cluster;
+
+ cluster = tsk_pfair(task)->cluster;
+
+ BUG_ON(!is_realtime(task));
+
+ raw_spin_lock_irqsave(cluster_lock(cluster), flags);
+ release_at(task, start);
+ release = time2quanta(start, CEIL);
+
+ TRACE_TASK(task, "sys release at %lu\n", release);
+
+ drop_all_references(task);
+ prepare_release(task, release);
+ add_release(&cluster->pfair, task);
+
+ raw_spin_unlock_irqrestore(cluster_lock(cluster), flags);
+}
+
+static void init_subtask(struct subtask* sub, unsigned long i,
+ lt_t quanta, lt_t period)
+{
+ /* since i is zero-based, the formulas are shifted by one */
+ lt_t tmp;
+
+ /* release */
+ tmp = period * i;
+ do_div(tmp, quanta); /* floor */
+ sub->release = (quanta_t) tmp;
+
+ /* deadline */
+ tmp = period * (i + 1);
+ if (do_div(tmp, quanta)) /* ceil */
+ tmp++;
+ sub->deadline = (quanta_t) tmp;
+
+ /* next release */
+ tmp = period * (i + 1);
+ do_div(tmp, quanta); /* floor */
+ sub->overlap = sub->deadline - (quanta_t) tmp;
+
+ /* Group deadline.
+ * Based on the formula given in Uma's thesis.
+ */
+ if (2 * quanta >= period) {
+ /* heavy */
+ tmp = (sub->deadline - (i + 1)) * period;
+ if (period > quanta &&
+ do_div(tmp, (period - quanta))) /* ceil */
+ tmp++;
+ sub->group_deadline = (quanta_t) tmp;
+ } else
+ sub->group_deadline = 0;
+}
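To make the formulas in init_subtask() concrete, the following standalone program (not part of the patch) computes the subtask parameters for a hypothetical task with quanta = 3 and period = 5; for these values the task is "heavy", since 2 * quanta >= period.

/* Worked example of the subtask parameters computed by init_subtask();
 * the task parameters (quanta = 3, period = 5) are made up for the demo,
 * the formulas follow the code above. */
#include <stdio.h>

int main(void)
{
	unsigned long quanta = 3, period = 5, i;

	for (i = 0; i < quanta; i++) {
		unsigned long release  = (period * i) / quanta;                    /* floor */
		unsigned long deadline = (period * (i + 1) + quanta - 1) / quanta; /* ceil  */
		unsigned long next_rel = (period * (i + 1)) / quanta;              /* floor */
		unsigned long b_bit    = deadline - next_rel;
		unsigned long gdl      = 0;

		if (2 * quanta >= period && period > quanta)	/* heavy task */
			gdl = ((deadline - (i + 1)) * period + (period - quanta) - 1)
				/ (period - quanta);		/* ceil */

		printf("subtask %lu: rel=%lu dl=%lu b=%lu gdl=%lu\n",
		       i + 1, release, deadline, b_bit, gdl);
	}
	return 0;
}

For quanta = 3 and period = 5 this prints rel/dl/b/gdl of 0/2/1/3, 1/4/1/5, and 3/5/0/5 for the three subtasks, which is the same layout dump_subtasks() would trace for such a task.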
+
+static void dump_subtasks(struct task_struct* t)
+{
+ unsigned long i;
+ for (i = 0; i < t->rt_param.pfair->quanta; i++)
+ TRACE_TASK(t, "SUBTASK %lu: rel=%lu dl=%lu bbit:%lu gdl:%lu\n",
+ i + 1,
+ t->rt_param.pfair->subtasks[i].release,
+ t->rt_param.pfair->subtasks[i].deadline,
+ t->rt_param.pfair->subtasks[i].overlap,
+ t->rt_param.pfair->subtasks[i].group_deadline);
+}
+
+static long pfair_admit_task(struct task_struct* t)
+{
+ lt_t quanta;
+ lt_t period;
+ s64 quantum_length = ktime_to_ns(tick_period);
+ struct pfair_param* param;
+ unsigned long i;
+
+ /* first check that the task is in the right cluster */
+ if (cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]) !=
+ cpu_cluster(pstate[task_cpu(t)]))
+ return -EINVAL;
+
+ if (get_rt_period(t) != get_rt_relative_deadline(t)) {
+ printk(KERN_INFO "%s: Admission rejected. "
+ "Only implicit deadlines are currently supported.\n",
+ litmus->plugin_name);
+ return -EINVAL;
+ }
+
+ /* Pfair is a tick-based method, so the time
+ * of interest is jiffies. Calculate tick-based
+ * times for everything.
+ * (Ceiling of exec cost, floor of period.)
+ */
+
+ quanta = get_exec_cost(t);
+ period = get_rt_period(t);
+
+ quanta = time2quanta(get_exec_cost(t), CEIL);
+
+ if (do_div(period, quantum_length))
+ printk(KERN_WARNING
+ "The period of %s/%d is not a multiple of %llu.\n",
+ t->comm, t->pid, (unsigned long long) quantum_length);
+
+ if (quanta == period) {
+ /* special case: task has weight 1.0 */
+ printk(KERN_INFO
+ "Admitting weight 1.0 task. (%s/%d, %llu, %llu).\n",
+ t->comm, t->pid, quanta, period);
+ quanta = 1;
+ period = 1;
+ }
+
+ param = kmalloc(sizeof(*param) +
+ quanta * sizeof(struct subtask), GFP_ATOMIC);
+
+ if (!param)
+ return -ENOMEM;
+
+ param->quanta = quanta;
+ param->cur = 0;
+ param->release = 0;
+ param->period = period;
+
+ param->cluster = cpu_cluster(pstate[tsk_rt(t)->task_params.cpu]);
+
+ for (i = 0; i < quanta; i++)
+ init_subtask(param->subtasks + i, i, quanta, period);
+
+ if (t->rt_param.pfair)
+ /* get rid of stale allocation */
+ kfree(t->rt_param.pfair);
+
+ t->rt_param.pfair = param;
+
+ /* spew out some debug info */
+ dump_subtasks(t);
+
+ return 0;
+}
+
+static void pfair_init_cluster(struct pfair_cluster* cluster)
+{
+ rt_domain_init(&cluster->pfair, pfair_ready_order, NULL, pfair_release_jobs);
+ bheap_init(&cluster->release_queue);
+ raw_spin_lock_init(&cluster->release_lock);
+ INIT_LIST_HEAD(&cluster->topology.cpus);
+}
+
+static void cleanup_clusters(void)
+{
+ int i;
+
+ if (num_pfair_clusters)
+ kfree(pfair_clusters);
+ pfair_clusters = NULL;
+ num_pfair_clusters = 0;
+
+ /* avoid stale pointers */
+ for (i = 0; i < num_online_cpus(); i++) {
+ pstate[i]->topology.cluster = NULL;
+ printk("P%d missed %u updates and %u quanta.\n", cpu_id(pstate[i]),
+ pstate[i]->missed_updates, pstate[i]->missed_quanta);
+ }
+}
+
+static long pfair_activate_plugin(void)
+{
+ int err, i;
+ struct pfair_state* state;
+ struct pfair_cluster* cluster ;
+ quanta_t now;
+ int cluster_size;
+ struct cluster_cpu* cpus[NR_CPUS];
+ struct scheduling_cluster* clust[NR_CPUS];
+
+ cluster_size = get_cluster_size(pfair_cluster_level);
+
+ if (cluster_size <= 0 || num_online_cpus() % cluster_size != 0)
+ return -EINVAL;
+
+ num_pfair_clusters = num_online_cpus() / cluster_size;
+
+ pfair_clusters = kzalloc(num_pfair_clusters * sizeof(struct pfair_cluster), GFP_ATOMIC);
+ if (!pfair_clusters) {
+ num_pfair_clusters = 0;
+ printk(KERN_ERR "Could not allocate Pfair clusters!\n");
+ return -ENOMEM;
+ }
+
+ state = &__get_cpu_var(pfair_state);
+ now = current_quantum(state);
+ TRACE("Activating PFAIR at q=%lu\n", now);
+
+ for (i = 0; i < num_pfair_clusters; i++) {
+ cluster = &pfair_clusters[i];
+ pfair_init_cluster(cluster);
+ cluster->pfair_time = now;
+ clust[i] = &cluster->topology;
+#ifdef CONFIG_RELEASE_MASTER
+ cluster->pfair.release_master = atomic_read(&release_master_cpu);
+#endif
+ }
+
+ for (i = 0; i < num_online_cpus(); i++) {
+ state = &per_cpu(pfair_state, i);
+ state->cur_tick = now;
+ state->local_tick = now;
+ state->missed_quanta = 0;
+ state->missed_updates = 0;
+ state->offset = cpu_stagger_offset(i);
+ printk(KERN_ERR "cpus[%d] set; %d\n", i, num_online_cpus());
+ cpus[i] = &state->topology;
+ }
+
+ err = assign_cpus_to_clusters(pfair_cluster_level, clust, num_pfair_clusters,
+ cpus, num_online_cpus());
+
+ if (err < 0)
+ cleanup_clusters();
+
+ return err;
+}
+
+static long pfair_deactivate_plugin(void)
+{
+ cleanup_clusters();
+ return 0;
+}
+
+/* Plugin object */
+static struct sched_plugin pfair_plugin __cacheline_aligned_in_smp = {
+ .plugin_name = "PFAIR",
+ .tick = pfair_tick,
+ .task_new = pfair_task_new,
+ .task_exit = pfair_task_exit,
+ .schedule = pfair_schedule,
+ .task_wake_up = pfair_task_wake_up,
+ .task_block = pfair_task_block,
+ .admit_task = pfair_admit_task,
+ .release_at = pfair_release_at,
+ .complete_job = complete_job,
+ .activate_plugin = pfair_activate_plugin,
+ .deactivate_plugin = pfair_deactivate_plugin,
+};
+
+
+static struct proc_dir_entry *cluster_file = NULL, *pfair_dir = NULL;
+
+static int __init init_pfair(void)
+{
+ int cpu, err, fs;
+ struct pfair_state *state;
+
+	/*
+	 * Initialize the pstate shortcut to the per-CPU pfair state.
+	 * There may be a problem here if someone removes a CPU while
+	 * we are doing this initialization (or if CPUs are added or
+	 * removed later), but we don't support CPU hotplug at the
+	 * moment anyway.
+	 */
+ pstate = kmalloc(sizeof(struct pfair_state*) * num_online_cpus(), GFP_KERNEL);
+
+ /* initialize CPU state */
+ for (cpu = 0; cpu < num_online_cpus(); cpu++) {
+ state = &per_cpu(pfair_state, cpu);
+ state->topology.id = cpu;
+ state->cur_tick = 0;
+ state->local_tick = 0;
+ state->linked = NULL;
+ state->local = NULL;
+ state->scheduled = NULL;
+ state->missed_quanta = 0;
+ state->offset = cpu_stagger_offset(cpu);
+ pstate[cpu] = state;
+ }
+
+ pfair_clusters = NULL;
+ num_pfair_clusters = 0;
+
+ err = register_sched_plugin(&pfair_plugin);
+ if (!err) {
+ fs = make_plugin_proc_dir(&pfair_plugin, &pfair_dir);
+ if (!fs)
+ cluster_file = create_cluster_file(pfair_dir, &pfair_cluster_level);
+ else
+ printk(KERN_ERR "Could not allocate PFAIR procfs dir.\n");
+ }
+
+ return err;
+}
+
+static void __exit clean_pfair(void)
+{
+ kfree(pstate);
+
+ if (cluster_file)
+ remove_proc_entry("cluster", pfair_dir);
+ if (pfair_dir)
+ remove_plugin_proc_dir(&pfair_plugin);
+}
+
+module_init(init_pfair);
+module_exit(clean_pfair);
diff --git a/litmus/sched_pfp.c b/litmus/sched_pfp.c
new file mode 100644
index 0000000..91e5239
--- /dev/null
+++ b/litmus/sched_pfp.c
@@ -0,0 +1,1709 @@
+/*
+ * litmus/sched_pfp.c
+ *
+ * Implementation of partitioned fixed-priority scheduling.
+ * Based on PSN-EDF.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/wait.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/fp_common.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+#include <litmus/budget.h>
+
+#include <linux/uaccess.h>
+
+
+typedef struct {
+ rt_domain_t domain;
+ struct fp_prio_queue ready_queue;
+ int cpu;
+ struct task_struct* scheduled; /* only RT tasks */
+/*
+ * scheduling lock slock
+ * protects the domain and serializes scheduling decisions
+ */
+#define slock domain.ready_lock
+
+} pfp_domain_t;
+
+DEFINE_PER_CPU(pfp_domain_t, pfp_domains);
+
+pfp_domain_t* pfp_doms[NR_CPUS];
+
+#define local_pfp (&__get_cpu_var(pfp_domains))
+#define remote_dom(cpu) (&per_cpu(pfp_domains, cpu).domain)
+#define remote_pfp(cpu) (&per_cpu(pfp_domains, cpu))
+#define task_dom(task) remote_dom(get_partition(task))
+#define task_pfp(task) remote_pfp(get_partition(task))
+
+/* we assume the lock is being held */
+static void preempt(pfp_domain_t *pfp)
+{
+ preempt_if_preemptable(pfp->scheduled, pfp->cpu);
+}
+
+static unsigned int priority_index(struct task_struct* t)
+{
+#ifdef CONFIG_LITMUS_LOCKING
+ if (unlikely(t->rt_param.inh_task))
+ /* use effective priority */
+ t = t->rt_param.inh_task;
+
+ if (is_priority_boosted(t)) {
+ /* zero is reserved for priority-boosted tasks */
+ return 0;
+ } else
+#endif
+ return get_priority(t);
+}
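A minimal sketch (not part of the patch) of the idea behind priority_index(): priority-boosted tasks, e.g. lock holders under the FMLP, map to the reserved index 0 and thereby rank ahead of every regular fixed priority. The struct and the priority values below are illustrative only.

/* Illustration of the priority-boosting index; the demo struct stands in
 * for the kernel's task state and is not the real rt_param layout. */
#include <stdio.h>

struct demo_task {
	unsigned int prio;	/* regular fixed priority, 1 = highest */
	int boosted;		/* currently holding a global lock? */
};

static unsigned int priority_index(const struct demo_task *t)
{
	/* index 0 is reserved for priority-boosted tasks */
	return t->boosted ? 0 : t->prio;
}

int main(void)
{
	struct demo_task normal = { 3, 0 };
	struct demo_task holder = { 7, 1 };

	printf("normal task -> index %u, boosted lock holder -> index %u\n",
	       priority_index(&normal), priority_index(&holder));
	return 0;
}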
+
+
+static void pfp_release_jobs(rt_domain_t* rt, struct bheap* tasks)
+{
+ pfp_domain_t *pfp = container_of(rt, pfp_domain_t, domain);
+ unsigned long flags;
+ struct task_struct* t;
+ struct bheap_node* hn;
+
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+
+ while (!bheap_empty(tasks)) {
+ hn = bheap_take(fp_ready_order, tasks);
+ t = bheap2task(hn);
+ TRACE_TASK(t, "released (part:%d prio:%d)\n",
+ get_partition(t), get_priority(t));
+ fp_prio_add(&pfp->ready_queue, t, priority_index(t));
+ }
+
+ /* do we need to preempt? */
+ if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled)) {
+ TRACE_CUR("preempted by new release\n");
+ preempt(pfp);
+ }
+
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+static void pfp_preempt_check(pfp_domain_t *pfp)
+{
+ if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
+ preempt(pfp);
+}
+
+static void pfp_domain_init(pfp_domain_t* pfp,
+ int cpu)
+{
+ fp_domain_init(&pfp->domain, NULL, pfp_release_jobs);
+ pfp->cpu = cpu;
+ pfp->scheduled = NULL;
+ fp_prio_queue_init(&pfp->ready_queue);
+}
+
+static void requeue(struct task_struct* t, pfp_domain_t *pfp)
+{
+ BUG_ON(!is_running(t));
+
+ tsk_rt(t)->completed = 0;
+ if (is_released(t, litmus_clock()))
+ fp_prio_add(&pfp->ready_queue, t, priority_index(t));
+ else
+ add_release(&pfp->domain, t); /* it has got to wait */
+}
+
+static void job_completion(struct task_struct* t, int forced)
+{
+ sched_trace_task_completion(t,forced);
+ TRACE_TASK(t, "job_completion().\n");
+
+ tsk_rt(t)->completed = 1;
+ prepare_for_next_period(t);
+ if (is_released(t, litmus_clock()))
+ sched_trace_task_release(t);
+}
+
+static void pfp_tick(struct task_struct *t)
+{
+ pfp_domain_t *pfp = local_pfp;
+
+ /* Check for inconsistency. We don't need the lock for this since
+ * ->scheduled is only changed in schedule, which obviously is not
+ * executing in parallel on this CPU
+ */
+ BUG_ON(is_realtime(t) && t != pfp->scheduled);
+
+ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
+ if (!is_np(t)) {
+ litmus_reschedule_local();
+ TRACE("pfp_scheduler_tick: "
+ "%d is preemptable "
+ " => FORCE_RESCHED\n", t->pid);
+ } else if (is_user_np(t)) {
+ TRACE("pfp_scheduler_tick: "
+ "%d is non-preemptable, "
+ "preemption delayed.\n", t->pid);
+ request_exit_np(t);
+ }
+ }
+}
+
+static struct task_struct* pfp_schedule(struct task_struct * prev)
+{
+ pfp_domain_t* pfp = local_pfp;
+ struct task_struct* next;
+
+ int out_of_time, sleep, preempt, np, exists, blocks, resched, migrate;
+
+ raw_spin_lock(&pfp->slock);
+
+ /* sanity checking
+ * differently from gedf, when a task exits (dead)
+ * pfp->schedule may be null and prev _is_ realtime
+ */
+ BUG_ON(pfp->scheduled && pfp->scheduled != prev);
+ BUG_ON(pfp->scheduled && !is_realtime(prev));
+
+ /* (0) Determine state */
+ exists = pfp->scheduled != NULL;
+ blocks = exists && !is_running(pfp->scheduled);
+ out_of_time = exists &&
+ budget_enforced(pfp->scheduled) &&
+ budget_exhausted(pfp->scheduled);
+ np = exists && is_np(pfp->scheduled);
+ sleep = exists && is_completed(pfp->scheduled);
+ migrate = exists && get_partition(pfp->scheduled) != pfp->cpu;
+ preempt = migrate || fp_preemption_needed(&pfp->ready_queue, prev);
+
+ /* If we need to preempt do so.
+ * The following checks set resched to 1 in case of special
+ * circumstances.
+ */
+ resched = preempt;
+
+ /* If a task blocks we have no choice but to reschedule.
+ */
+ if (blocks)
+ resched = 1;
+
+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
+ * Multiple calls to request_exit_np() don't hurt.
+ */
+ if (np && (out_of_time || preempt || sleep))
+ request_exit_np(pfp->scheduled);
+
+ /* Any task that is preemptable and either exhausts its execution
+ * budget or wants to sleep completes. We may have to reschedule after
+ * this.
+ */
+ if (!np && (out_of_time || sleep) && !blocks && !migrate) {
+ job_completion(pfp->scheduled, !sleep);
+ resched = 1;
+ }
+
+ /* The final scheduling decision. Do we need to switch for some reason?
+ * Switch if we are in RT mode and have no task or if we need to
+ * resched.
+ */
+ next = NULL;
+ if ((!np || blocks) && (resched || !exists)) {
+ /* When preempting a task that does not block, then
+ * re-insert it into either the ready queue or the
+ * release queue (if it completed). requeue() picks
+ * the appropriate queue.
+ */
+ if (pfp->scheduled && !blocks && !migrate)
+ requeue(pfp->scheduled, pfp);
+ next = fp_prio_take(&pfp->ready_queue);
+ if (next == prev) {
+ struct task_struct *t = fp_prio_peek(&pfp->ready_queue);
+ TRACE_TASK(next, "next==prev sleep=%d oot=%d np=%d preempt=%d migrate=%d "
+ "boost=%d empty=%d prio-idx=%u prio=%u\n",
+ sleep, out_of_time, np, preempt, migrate,
+ is_priority_boosted(next),
+ t == NULL,
+ priority_index(next),
+ get_priority(next));
+ if (t)
+ TRACE_TASK(t, "waiter boost=%d prio-idx=%u prio=%u\n",
+ is_priority_boosted(t),
+ priority_index(t),
+ get_priority(t));
+ }
+ /* If preempt is set, we should not see the same task again. */
+ BUG_ON(preempt && next == prev);
+ /* Similarly, if preempt is set, then next may not be NULL,
+ * unless it's a migration. */
+ BUG_ON(preempt && !migrate && next == NULL);
+ } else
+ /* Only override Linux scheduler if we have a real-time task
+ * scheduled that needs to continue.
+ */
+ if (exists)
+ next = prev;
+
+ if (next) {
+ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+ tsk_rt(next)->completed = 0;
+ } else {
+ TRACE("becoming idle at %llu\n", litmus_clock());
+ }
+
+ pfp->scheduled = next;
+ sched_state_task_picked();
+ raw_spin_unlock(&pfp->slock);
+
+ return next;
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+/* prev is no longer scheduled --- see if it needs to migrate */
+static void pfp_finish_switch(struct task_struct *prev)
+{
+ pfp_domain_t *to;
+
+ if (is_realtime(prev) &&
+ is_running(prev) &&
+ get_partition(prev) != smp_processor_id()) {
+ TRACE_TASK(prev, "needs to migrate from P%d to P%d\n",
+ smp_processor_id(), get_partition(prev));
+
+ to = task_pfp(prev);
+
+ raw_spin_lock(&to->slock);
+
+ TRACE_TASK(prev, "adding to queue on P%d\n", to->cpu);
+ requeue(prev, to);
+ if (fp_preemption_needed(&to->ready_queue, to->scheduled))
+ preempt(to);
+
+ raw_spin_unlock(&to->slock);
+
+ }
+}
+
+#endif
+
+/* Prepare a task for running in RT mode
+ */
+static void pfp_task_new(struct task_struct * t, int on_rq, int running)
+{
+ pfp_domain_t* pfp = task_pfp(t);
+ unsigned long flags;
+
+ TRACE_TASK(t, "P-FP: task new, cpu = %d\n",
+ t->rt_param.task_params.cpu);
+
+ /* setup job parameters */
+ release_at(t, litmus_clock());
+
+ /* The task must either be running or queued; otherwise the signal
+ * code will try to wake it up with fatal consequences.
+ */
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+ if (running) {
+ /* there shouldn't be anything else running at the time */
+ BUG_ON(pfp->scheduled);
+ pfp->scheduled = t;
+ } else {
+ requeue(t, pfp);
+ /* maybe we have to reschedule */
+ pfp_preempt_check(pfp);
+ }
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+static void pfp_task_wake_up(struct task_struct *task)
+{
+ unsigned long flags;
+ pfp_domain_t* pfp = task_pfp(task);
+ lt_t now;
+
+ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+
+#ifdef CONFIG_LITMUS_LOCKING
+ /* Should only be queued when processing a fake-wake up due to a
+ * migration-related state change. */
+ if (unlikely(is_queued(task))) {
+ TRACE_TASK(task, "WARNING: waking task still queued. Is this right?\n");
+ goto out_unlock;
+ }
+#else
+ BUG_ON(is_queued(task));
+#endif
+ now = litmus_clock();
+ if (is_tardy(task, now)
+#ifdef CONFIG_LITMUS_LOCKING
+ /* We need to take suspensions because of semaphores into
+ * account! If a job resumes after being suspended due to acquiring
+ * a semaphore, it should never be treated as a new job release.
+ */
+ && !is_priority_boosted(task)
+#endif
+ ) {
+ /* new sporadic release */
+ release_at(task, now);
+ sched_trace_task_release(task);
+ }
+
+ /* Only add to ready queue if it is not the currently-scheduled
+ * task. This could be the case if a task was woken up concurrently
+ * on a remote CPU before the executing CPU got around to actually
+ * de-scheduling the task, i.e., wake_up() raced with schedule()
+ * and won. Also, don't requeue if it is still queued, which can
+ * happen under the DPCP due to wake-ups racing with migrations.
+ */
+ if (pfp->scheduled != task) {
+ requeue(task, pfp);
+ pfp_preempt_check(pfp);
+ }
+
+#ifdef CONFIG_LITMUS_LOCKING
+out_unlock:
+#endif
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+ TRACE_TASK(task, "wake up done\n");
+}
+
+static void pfp_task_block(struct task_struct *t)
+{
+ /* only running tasks can block, thus t is in no queue */
+ TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
+
+ BUG_ON(!is_realtime(t));
+
+ /* If this task blocked normally, it shouldn't be queued. The exception is
+ * if this is a simulated block()/wakeup() pair from the pull-migration code path.
+ * This should only happen if the DPCP is being used.
+ */
+#ifdef CONFIG_LITMUS_LOCKING
+ if (unlikely(is_queued(t)))
+ TRACE_TASK(t, "WARNING: blocking task still queued. Is this right?\n");
+#else
+ BUG_ON(is_queued(t));
+#endif
+}
+
+static void pfp_task_exit(struct task_struct * t)
+{
+ unsigned long flags;
+ pfp_domain_t* pfp = task_pfp(t);
+ rt_domain_t* dom;
+
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+ if (is_queued(t)) {
+ BUG(); /* This currently doesn't work. */
+ /* dequeue */
+ dom = task_dom(t);
+ remove(dom, t);
+ }
+ if (pfp->scheduled == t) {
+ pfp->scheduled = NULL;
+ preempt(pfp);
+ }
+ TRACE_TASK(t, "RIP, now reschedule\n");
+
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/fdso.h>
+#include <litmus/srp.h>
+
+static void fp_dequeue(pfp_domain_t* pfp, struct task_struct* t)
+{
+ BUG_ON(pfp->scheduled == t && is_queued(t));
+ if (is_queued(t))
+ fp_prio_remove(&pfp->ready_queue, t, priority_index(t));
+}
+
+static void fp_set_prio_inh(pfp_domain_t* pfp, struct task_struct* t,
+ struct task_struct* prio_inh)
+{
+ int requeue;
+
+ if (!t || t->rt_param.inh_task == prio_inh) {
+ /* no update required */
+ if (t)
+ TRACE_TASK(t, "no prio-inh update required\n");
+ return;
+ }
+
+ requeue = is_queued(t);
+ TRACE_TASK(t, "prio-inh: is_queued:%d\n", requeue);
+
+ if (requeue)
+ /* first remove */
+ fp_dequeue(pfp, t);
+
+ t->rt_param.inh_task = prio_inh;
+
+ if (requeue)
+ /* add again to the right queue */
+ fp_prio_add(&pfp->ready_queue, t, priority_index(t));
+}
+
+static int effective_agent_priority(int prio)
+{
+ /* make sure agents have higher priority */
+ return prio - LITMUS_MAX_PRIORITY;
+}
+
+static lt_t prio_point(int eprio)
+{
+ /* make sure we have non-negative prio points */
+ return eprio + LITMUS_MAX_PRIORITY;
+}
+
+static int prio_from_point(lt_t prio_point)
+{
+ return ((int) prio_point) - LITMUS_MAX_PRIORITY;
+}
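+
+/* Note on the priority-point encoding used above: effective_agent_priority()
+ * yields a negative value so that ceiling agents always beat regular tasks,
+ * while prio_point()/prio_from_point() shift by LITMUS_MAX_PRIORITY so that
+ * wait-queue priority points remain non-negative; the two functions are
+ * exact inverses of each other.
+ */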
+
+static void boost_priority(struct task_struct* t, lt_t priority_point)
+{
+ unsigned long flags;
+ pfp_domain_t* pfp = task_pfp(t);
+
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+
+ TRACE_TASK(t, "priority boosted at %llu\n", litmus_clock());
+
+ tsk_rt(t)->priority_boosted = 1;
+ /* tie-break by protocol-specific priority point */
+ tsk_rt(t)->boost_start_time = priority_point;
+
+ /* Priority boosting currently only takes effect for already-scheduled
+ * tasks. This is sufficient since priority boosting only kicks in as
+ * part of lock acquisitions. */
+ BUG_ON(pfp->scheduled != t);
+
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+static void unboost_priority(struct task_struct* t)
+{
+ unsigned long flags;
+ pfp_domain_t* pfp = task_pfp(t);
+ lt_t now;
+
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+ now = litmus_clock();
+
+ /* assumption: this only happens when the job is scheduled */
+ BUG_ON(pfp->scheduled != t);
+
+ TRACE_TASK(t, "priority restored at %llu\n", now);
+
+ /* priority boosted jobs must be scheduled */
+ BUG_ON(pfp->scheduled != t);
+
+ tsk_rt(t)->priority_boosted = 0;
+ tsk_rt(t)->boost_start_time = 0;
+
+ /* check if this changes anything */
+ if (fp_preemption_needed(&pfp->ready_queue, pfp->scheduled))
+ preempt(pfp);
+
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+/* ******************** SRP support ************************ */
+
+static unsigned int pfp_get_srp_prio(struct task_struct* t)
+{
+ return get_priority(t);
+}
+
+/* ******************** FMLP support ********************** */
+
+struct fmlp_semaphore {
+ struct litmus_lock litmus_lock;
+
+ /* current resource holder */
+ struct task_struct *owner;
+
+ /* FIFO queue of waiting tasks */
+ wait_queue_head_t wait;
+};
+
+static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
+{
+ return container_of(lock, struct fmlp_semaphore, litmus_lock);
+}
+int pfp_fmlp_lock(struct litmus_lock* l)
+{
+ struct task_struct* t = current;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ wait_queue_t wait;
+ unsigned long flags;
+ lt_t time_of_request;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ /* tie-break by this point in time */
+ time_of_request = litmus_clock();
+
+ /* Priority-boost ourself *before* we suspend so that
+ * our priority is boosted when we resume. */
+ boost_priority(t, time_of_request);
+
+ if (sem->owner) {
+ /* resource is not free => must suspend and wait */
+
+ init_waitqueue_entry(&wait, t);
+
+ /* FIXME: interruptible would be nice some day */
+ set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+ __add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+ TS_LOCK_SUSPEND;
+
+ /* release lock before sleeping */
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ /* We depend on the FIFO order. Thus, we don't need to recheck
+ * when we wake up; we are guaranteed to have the lock since
+ * there is only one wake up per release.
+ */
+
+ schedule();
+
+ TS_LOCK_RESUME;
+
+ /* Since we hold the lock, no other task will change
+ * ->owner. We can thus check it without acquiring the spin
+ * lock. */
+ BUG_ON(sem->owner != t);
+ } else {
+ /* it's ours now */
+ sem->owner = t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ }
+
+ return 0;
+}
+
+int pfp_fmlp_unlock(struct litmus_lock* l)
+{
+ struct task_struct *t = current, *next;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ unsigned long flags;
+ int err = 0;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (sem->owner != t) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* we lose the benefit of priority boosting */
+
+ unboost_priority(t);
+
+ /* check if there are jobs waiting for this resource */
+ next = __waitqueue_remove_first(&sem->wait);
+ if (next) {
+ /* next becomes the resource holder */
+ sem->owner = next;
+
+ /* Wake up next. The waiting job is already priority-boosted. */
+ wake_up_process(next);
+ } else
+ /* resource becomes available */
+ sem->owner = NULL;
+
+out:
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ return err;
+}
+
+int pfp_fmlp_close(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ unsigned long flags;
+
+ int owner;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ owner = sem->owner == t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ if (owner)
+ pfp_fmlp_unlock(l);
+
+ return 0;
+}
+
+void pfp_fmlp_free(struct litmus_lock* lock)
+{
+ kfree(fmlp_from_lock(lock));
+}
+
+static struct litmus_lock_ops pfp_fmlp_lock_ops = {
+ .close = pfp_fmlp_close,
+ .lock = pfp_fmlp_lock,
+ .unlock = pfp_fmlp_unlock,
+ .deallocate = pfp_fmlp_free,
+};
+
+static struct litmus_lock* pfp_new_fmlp(void)
+{
+ struct fmlp_semaphore* sem;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ sem->owner = NULL;
+ init_waitqueue_head(&sem->wait);
+ sem->litmus_lock.ops = &pfp_fmlp_lock_ops;
+
+ return &sem->litmus_lock;
+}
+
+/* ******************** MPCP support ********************** */
+
+struct mpcp_semaphore {
+ struct litmus_lock litmus_lock;
+
+ /* current resource holder */
+ struct task_struct *owner;
+
+ /* priority queue of waiting tasks */
+ wait_queue_head_t wait;
+
+ /* priority ceiling per cpu */
+ unsigned int prio_ceiling[NR_CPUS];
+
+ /* should jobs spin "virtually" for this resource? */
+ int vspin;
+};
+
+#define OMEGA_CEILING UINT_MAX
+
+/* Since jobs spin "virtually" while waiting to acquire a lock,
+ * they first must acquire a local per-cpu resource.
+ */
+static DEFINE_PER_CPU(wait_queue_head_t, mpcpvs_vspin_wait);
+static DEFINE_PER_CPU(struct task_struct*, mpcpvs_vspin);
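+
+/* "Virtual spinning" means a job suspends (instead of busy-waiting) until it
+ * is the only lock-requesting job on its processor: mpcpvs_vspin tracks that
+ * job, and all later requesters queue up in mpcpvs_vspin_wait in priority
+ * order.
+ */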
+
+/* called with preemptions off <=> no local modifications */
+static void mpcp_vspin_enter(void)
+{
+ struct task_struct* t = current;
+
+ while (1) {
+ if (__get_cpu_var(mpcpvs_vspin) == NULL) {
+ /* good, we get to issue our request */
+ __get_cpu_var(mpcpvs_vspin) = t;
+ break;
+ } else {
+ /* some job is spinning => enqueue in request queue */
+ prio_wait_queue_t wait;
+ wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
+ unsigned long flags;
+
+ /* ordered by regular priority */
+ init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
+
+ spin_lock_irqsave(&vspin->lock, flags);
+
+ set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+ __add_wait_queue_prio_exclusive(vspin, &wait);
+
+ spin_unlock_irqrestore(&vspin->lock, flags);
+
+ TS_LOCK_SUSPEND;
+
+ preempt_enable_no_resched();
+
+ schedule();
+
+ preempt_disable();
+
+ TS_LOCK_RESUME;
+ /* Recheck if we got it --- some higher-priority process might
+ * have swooped in. */
+ }
+ }
+ /* ok, now it is ours */
+}
+
+/* called with preemptions off */
+static void mpcp_vspin_exit(void)
+{
+ struct task_struct* t = current, *next;
+ unsigned long flags;
+ wait_queue_head_t* vspin = &__get_cpu_var(mpcpvs_vspin_wait);
+
+ BUG_ON(__get_cpu_var(mpcpvs_vspin) != t);
+
+ /* no spinning job */
+ __get_cpu_var(mpcpvs_vspin) = NULL;
+
+ /* see if anyone is waiting for us to stop "spinning" */
+ spin_lock_irqsave(&vspin->lock, flags);
+ next = __waitqueue_remove_first(vspin);
+
+ if (next)
+ wake_up_process(next);
+
+ spin_unlock_irqrestore(&vspin->lock, flags);
+}
+
+static inline struct mpcp_semaphore* mpcp_from_lock(struct litmus_lock* lock)
+{
+ return container_of(lock, struct mpcp_semaphore, litmus_lock);
+}
+
+int pfp_mpcp_lock(struct litmus_lock* l)
+{
+ struct task_struct* t = current;
+ struct mpcp_semaphore *sem = mpcp_from_lock(l);
+ prio_wait_queue_t wait;
+ unsigned long flags;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ preempt_disable();
+
+ if (sem->vspin)
+ mpcp_vspin_enter();
+
+ /* Priority-boost ourself *before* we suspend so that
+ * our priority is boosted when we resume. Use the priority
+ * ceiling for the local partition. */
+ boost_priority(t, sem->prio_ceiling[get_partition(t)]);
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ preempt_enable_no_resched();
+
+ if (sem->owner) {
+ /* resource is not free => must suspend and wait */
+
+ /* ordered by regular priority */
+ init_prio_waitqueue_entry(&wait, t, prio_point(get_priority(t)));
+
+ /* FIXME: interruptible would be nice some day */
+ set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+ __add_wait_queue_prio_exclusive(&sem->wait, &wait);
+
+ TS_LOCK_SUSPEND;
+
+ /* release lock before sleeping */
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ /* The wait queue is priority-ordered and there is exactly one
+ * wake-up per release, so we are guaranteed to hold the lock when
+ * we wake up; no need to re-check ->owner here.
+ */
+
+ schedule();
+
+ TS_LOCK_RESUME;
+
+ /* Since we hold the lock, no other task will change
+ * ->owner. We can thus check it without acquiring the spin
+ * lock. */
+ BUG_ON(sem->owner != t);
+ } else {
+ /* it's ours now */
+ sem->owner = t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ }
+
+ return 0;
+}
+
+int pfp_mpcp_unlock(struct litmus_lock* l)
+{
+ struct task_struct *t = current, *next;
+ struct mpcp_semaphore *sem = mpcp_from_lock(l);
+ unsigned long flags;
+ int err = 0;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (sem->owner != t) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* we lose the benefit of priority boosting */
+
+ unboost_priority(t);
+
+ /* check if there are jobs waiting for this resource */
+ next = __waitqueue_remove_first(&sem->wait);
+ if (next) {
+ /* next becomes the resource holder */
+ sem->owner = next;
+
+ /* Wake up next. The waiting job is already priority-boosted. */
+ wake_up_process(next);
+ } else
+ /* resource becomes available */
+ sem->owner = NULL;
+
+out:
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ if (sem->vspin && err == 0) {
+ preempt_disable();
+ mpcp_vspin_exit();
+ preempt_enable();
+ }
+
+ return err;
+}
+
+int pfp_mpcp_open(struct litmus_lock* l, void* config)
+{
+ struct task_struct *t = current;
+ struct mpcp_semaphore *sem = mpcp_from_lock(l);
+ int cpu, local_cpu;
+ unsigned long flags;
+
+ if (!is_realtime(t))
+ /* we need to know the real-time priority */
+ return -EPERM;
+
+ local_cpu = get_partition(t);
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ if (cpu != local_cpu)
+ {
+ sem->prio_ceiling[cpu] = min(sem->prio_ceiling[cpu],
+ get_priority(t));
+ TRACE_CUR("priority ceiling for sem %p is now %d on cpu %d\n",
+ sem, sem->prio_ceiling[cpu], cpu);
+ }
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ return 0;
+}
+
+int pfp_mpcp_close(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct mpcp_semaphore *sem = mpcp_from_lock(l);
+ unsigned long flags;
+
+ int owner;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ owner = sem->owner == t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ if (owner)
+ pfp_mpcp_unlock(l);
+
+ return 0;
+}
+
+void pfp_mpcp_free(struct litmus_lock* lock)
+{
+ kfree(mpcp_from_lock(lock));
+}
+
+static struct litmus_lock_ops pfp_mpcp_lock_ops = {
+ .close = pfp_mpcp_close,
+ .lock = pfp_mpcp_lock,
+ .open = pfp_mpcp_open,
+ .unlock = pfp_mpcp_unlock,
+ .deallocate = pfp_mpcp_free,
+};
+
+static struct litmus_lock* pfp_new_mpcp(int vspin)
+{
+ struct mpcp_semaphore* sem;
+ int cpu;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ sem->owner = NULL;
+ init_waitqueue_head(&sem->wait);
+ sem->litmus_lock.ops = &pfp_mpcp_lock_ops;
+
+ for (cpu = 0; cpu < NR_CPUS; cpu++)
+ sem->prio_ceiling[cpu] = OMEGA_CEILING;
+
+ /* mark as virtual spinning */
+ sem->vspin = vspin;
+
+ return &sem->litmus_lock;
+}
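+
+/* Each per-CPU ceiling starts out at OMEGA_CEILING ("no constraint") and is
+ * lowered in pfp_mpcp_open() to the highest priority of any task on a
+ * *remote* partition that may request this semaphore.
+ */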
+
+
+/* ******************** PCP support ********************** */
+
+
+struct pcp_semaphore {
+ struct litmus_lock litmus_lock;
+
+ struct list_head ceiling;
+
+ /* current resource holder */
+ struct task_struct *owner;
+
+ /* priority ceiling --- can be negative due to DPCP support */
+ int prio_ceiling;
+
+ /* on which processor is this PCP semaphore allocated? */
+ int on_cpu;
+};
+
+static inline struct pcp_semaphore* pcp_from_lock(struct litmus_lock* lock)
+{
+ return container_of(lock, struct pcp_semaphore, litmus_lock);
+}
+
+
+struct pcp_state {
+ struct list_head system_ceiling;
+
+ /* highest-priority waiting task */
+ struct task_struct* hp_waiter;
+
+ /* list of jobs waiting to get past the system ceiling */
+ wait_queue_head_t ceiling_blocked;
+};
+
+static void pcp_init_state(struct pcp_state* s)
+{
+ INIT_LIST_HEAD(&s->system_ceiling);
+ s->hp_waiter = NULL;
+ init_waitqueue_head(&s->ceiling_blocked);
+}
+
+static DEFINE_PER_CPU(struct pcp_state, pcp_state);
+
+/* assumes preemptions are off */
+static struct pcp_semaphore* pcp_get_ceiling(void)
+{
+ struct list_head* in_use = &__get_cpu_var(pcp_state).system_ceiling;
+
+ /* the ->next pointer of an initialized list head is never NULL, so
+ * test for emptiness rather than for NULL */
+ if (!list_empty(in_use))
+ return list_entry(in_use->next, struct pcp_semaphore, ceiling);
+ else
+ return NULL;
+}
+
+/* assumes preempt off */
+static void pcp_add_ceiling(struct pcp_semaphore* sem)
+{
+ struct list_head *pos;
+ struct list_head *in_use = &__get_cpu_var(pcp_state).system_ceiling;
+ struct pcp_semaphore* held;
+
+ BUG_ON(sem->on_cpu != smp_processor_id());
+ BUG_ON(in_list(&sem->ceiling));
+
+ list_for_each(pos, in_use) {
+ held = list_entry(pos, struct pcp_semaphore, ceiling);
+ if (held->prio_ceiling >= sem->prio_ceiling) {
+ __list_add(&sem->ceiling, pos->prev, pos);
+ return;
+ }
+ }
+
+ /* we hit the end of the list */
+
+ list_add_tail(&sem->ceiling, in_use);
+}
+
+/* assumes preempt off */
+static int pcp_exceeds_ceiling(struct pcp_semaphore* ceiling,
+ struct task_struct* task,
+ int effective_prio)
+{
+ return ceiling == NULL ||
+ ceiling->prio_ceiling > effective_prio ||
+ ceiling->owner == task;
+}
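+
+/* This encodes the classic PCP rule: a request may proceed only if the
+ * requester's effective priority is higher (numerically smaller) than the
+ * current system ceiling, or if the requester itself holds the resource that
+ * defines the ceiling, which rules out self-blocking on nested requests.
+ */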
+
+/* assumes preempt off */
+static void pcp_priority_inheritance(void)
+{
+ unsigned long flags;
+ pfp_domain_t* pfp = local_pfp;
+
+ struct pcp_semaphore* ceiling = pcp_get_ceiling();
+ struct task_struct *blocker, *blocked;
+
+ blocker = ceiling ? ceiling->owner : NULL;
+ blocked = __get_cpu_var(pcp_state).hp_waiter;
+
+ raw_spin_lock_irqsave(&pfp->slock, flags);
+
+ /* Current is no longer inheriting anything by default. This should be
+ * the currently scheduled job, and hence not currently queued. */
+ BUG_ON(current != pfp->scheduled);
+
+ fp_set_prio_inh(pfp, current, NULL);
+ fp_set_prio_inh(pfp, blocked, NULL);
+ fp_set_prio_inh(pfp, blocker, NULL);
+
+
+ /* Let blocking job inherit priority of blocked job, if required. */
+ if (blocker && blocked &&
+ fp_higher_prio(blocked, blocker)) {
+ TRACE_TASK(blocker, "PCP inherits from %s/%d (prio %u -> %u) \n",
+ blocked->comm, blocked->pid,
+ get_priority(blocker), get_priority(blocked));
+ fp_set_prio_inh(pfp, blocker, blocked);
+ }
+
+ /* check if anything changed */
+ if (fp_higher_prio(fp_prio_peek(&pfp->ready_queue), pfp->scheduled))
+ preempt(pfp);
+
+ raw_spin_unlock_irqrestore(&pfp->slock, flags);
+}
+
+/* called with preemptions off */
+static void pcp_raise_ceiling(struct pcp_semaphore* sem,
+ int effective_prio)
+{
+ struct task_struct* t = current;
+ struct pcp_semaphore* ceiling;
+ prio_wait_queue_t wait;
+ unsigned int waiting_higher_prio;
+
+ do {
+ ceiling = pcp_get_ceiling();
+ if (pcp_exceeds_ceiling(ceiling, t, effective_prio))
+ break;
+
+ TRACE_CUR("PCP ceiling-blocked, wanted sem %p, but %s/%d has the ceiling \n",
+ sem, ceiling->owner->comm, ceiling->owner->pid);
+
+ /* we need to wait until the ceiling is lowered */
+
+ /* enqueue in priority order */
+ init_prio_waitqueue_entry(&wait, t, prio_point(effective_prio));
+ set_task_state(t, TASK_UNINTERRUPTIBLE);
+ waiting_higher_prio = add_wait_queue_prio_exclusive(
+ &__get_cpu_var(pcp_state).ceiling_blocked, &wait);
+
+ if (waiting_higher_prio == 0) {
+ TRACE_CUR("PCP new highest-prio waiter => prio inheritance\n");
+
+ /* we are the new highest-priority waiting job
+ * => update inheritance */
+ __get_cpu_var(pcp_state).hp_waiter = t;
+ pcp_priority_inheritance();
+ }
+
+ TS_LOCK_SUSPEND;
+
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+
+ /* pcp_resume_unblocked() removed us from wait queue */
+
+ TS_LOCK_RESUME;
+ } while(1);
+
+ TRACE_CUR("PCP got the ceiling and sem %p\n", sem);
+
+ /* We are good to go. The semaphore should be available. */
+ BUG_ON(sem->owner != NULL);
+
+ sem->owner = t;
+
+ pcp_add_ceiling(sem);
+}
+
+static void pcp_resume_unblocked(void)
+{
+ wait_queue_head_t *blocked = &__get_cpu_var(pcp_state).ceiling_blocked;
+ unsigned long flags;
+ prio_wait_queue_t* q;
+ struct task_struct* t = NULL;
+
+ struct pcp_semaphore* ceiling = pcp_get_ceiling();
+
+ spin_lock_irqsave(&blocked->lock, flags);
+
+ while (waitqueue_active(blocked)) {
+ /* check first == highest-priority waiting job */
+ q = list_entry(blocked->task_list.next,
+ prio_wait_queue_t, wq.task_list);
+ t = (struct task_struct*) q->wq.private;
+
+ /* can it proceed now? => let it go */
+ if (pcp_exceeds_ceiling(ceiling, t,
+ prio_from_point(q->priority))) {
+ __remove_wait_queue(blocked, &q->wq);
+ wake_up_process(t);
+ } else {
+ /* We are done. Update highest-priority waiter. */
+ __get_cpu_var(pcp_state).hp_waiter = t;
+ goto out;
+ }
+ }
+ /* If we get here, then there are no more waiting
+ * jobs. */
+ __get_cpu_var(pcp_state).hp_waiter = NULL;
+out:
+ spin_unlock_irqrestore(&blocked->lock, flags);
+}
+
+/* assumes preempt off */
+static void pcp_lower_ceiling(struct pcp_semaphore* sem)
+{
+ BUG_ON(!in_list(&sem->ceiling));
+ BUG_ON(sem->owner != current);
+ BUG_ON(sem->on_cpu != smp_processor_id());
+
+ /* remove from ceiling list */
+ list_del(&sem->ceiling);
+
+ /* release */
+ sem->owner = NULL;
+
+ TRACE_CUR("PCP released sem %p\n", sem);
+
+ /* Wake up all ceiling-blocked jobs that now pass the ceiling. */
+ pcp_resume_unblocked();
+
+ pcp_priority_inheritance();
+}
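+
+/* Releasing a PCP resource thus takes three steps: remove the semaphore from
+ * the per-CPU ceiling list, wake every ceiling-blocked job that passes the
+ * (now possibly lower) ceiling, and recompute priority inheritance for the
+ * local partition.
+ */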
+
+static void pcp_update_prio_ceiling(struct pcp_semaphore* sem,
+ int effective_prio)
+{
+ /* This needs to be synchronized on something.
+ * Might as well use waitqueue lock for the processor.
+ * We assume this happens only before the task set starts execution
+ * (i.e., during initialization), but it may happen on multiple processors
+ * at the same time.
+ */
+ unsigned long flags;
+
+ struct pcp_state* s = &per_cpu(pcp_state, sem->on_cpu);
+
+ spin_lock_irqsave(&s->ceiling_blocked.lock, flags);
+
+ sem->prio_ceiling = min(sem->prio_ceiling, effective_prio);
+
+ spin_unlock_irqrestore(&s->ceiling_blocked.lock, flags);
+}
+
+static void pcp_init_semaphore(struct pcp_semaphore* sem, int cpu)
+{
+ sem->owner = NULL;
+ INIT_LIST_HEAD(&sem->ceiling);
+ sem->prio_ceiling = INT_MAX;
+ sem->on_cpu = cpu;
+}
+
+int pfp_pcp_lock(struct litmus_lock* l)
+{
+ struct task_struct* t = current;
+ struct pcp_semaphore *sem = pcp_from_lock(l);
+
+ int eprio = effective_agent_priority(get_priority(t));
+ int from = get_partition(t);
+ int to = sem->on_cpu;
+
+ if (!is_realtime(t) || from != to)
+ return -EPERM;
+
+ preempt_disable();
+
+ pcp_raise_ceiling(sem, eprio);
+
+ preempt_enable();
+
+ return 0;
+}
+
+int pfp_pcp_unlock(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct pcp_semaphore *sem = pcp_from_lock(l);
+
+ int err = 0;
+
+ preempt_disable();
+
+ if (sem->on_cpu != smp_processor_id() || sem->owner != t) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* give it back */
+ pcp_lower_ceiling(sem);
+
+out:
+ preempt_enable();
+
+ return err;
+}
+
+int pfp_pcp_open(struct litmus_lock* l, void* __user config)
+{
+ struct task_struct *t = current;
+ struct pcp_semaphore *sem = pcp_from_lock(l);
+
+ int cpu, eprio;
+
+ if (!is_realtime(t))
+ /* we need to know the real-time priority */
+ return -EPERM;
+
+ if (get_user(cpu, (int*) config))
+ return -EFAULT;
+
+ /* make sure the resource location matches */
+ if (cpu != sem->on_cpu)
+ return -EINVAL;
+
+ eprio = effective_agent_priority(get_priority(t));
+
+ pcp_update_prio_ceiling(sem, eprio);
+
+ return 0;
+}
+
+int pfp_pcp_close(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct pcp_semaphore *sem = pcp_from_lock(l);
+
+ int owner = 0;
+
+ preempt_disable();
+
+ if (sem->on_cpu == smp_processor_id())
+ owner = sem->owner == t;
+
+ preempt_enable();
+
+ if (owner)
+ pfp_pcp_unlock(l);
+
+ return 0;
+}
+
+void pfp_pcp_free(struct litmus_lock* lock)
+{
+ kfree(pcp_from_lock(lock));
+}
+
+
+static struct litmus_lock_ops pfp_pcp_lock_ops = {
+ .close = pfp_pcp_close,
+ .lock = pfp_pcp_lock,
+ .open = pfp_pcp_open,
+ .unlock = pfp_pcp_unlock,
+ .deallocate = pfp_pcp_free,
+};
+
+
+static struct litmus_lock* pfp_new_pcp(int on_cpu)
+{
+ struct pcp_semaphore* sem;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ sem->litmus_lock.ops = &pfp_pcp_lock_ops;
+ pcp_init_semaphore(sem, on_cpu);
+
+ return &sem->litmus_lock;
+}
+
+/* ******************** DPCP support ********************** */
+
+struct dpcp_semaphore {
+ struct litmus_lock litmus_lock;
+ struct pcp_semaphore pcp;
+ int owner_cpu;
+};
+
+static inline struct dpcp_semaphore* dpcp_from_lock(struct litmus_lock* lock)
+{
+ return container_of(lock, struct dpcp_semaphore, litmus_lock);
+}
+
+/* called with preemptions disabled */
+static void pfp_migrate_to(int target_cpu)
+{
+ struct task_struct* t = current;
+ pfp_domain_t *from;
+
+ if (get_partition(t) == target_cpu)
+ return;
+
+ /* make sure target_cpu makes sense */
+ BUG_ON(!cpu_online(target_cpu));
+
+ local_irq_disable();
+
+ /* scheduled task should not be in any ready or release queue */
+ BUG_ON(is_queued(t));
+
+ /* only the source domain needs to be locked here; pfp_finish_switch()
+ * will queue the task on the target partition after the context switch */
+ from = task_pfp(t);
+
+ raw_spin_lock(&from->slock);
+
+ /* switch partitions */
+ tsk_rt(t)->task_params.cpu = target_cpu;
+
+ raw_spin_unlock(&from->slock);
+
+ /* Don't trace scheduler costs as part of
+ * locking overhead. Scheduling costs are accounted for
+ * explicitly. */
+ TS_LOCK_SUSPEND;
+
+ local_irq_enable();
+ preempt_enable_no_resched();
+
+ /* deschedule to be migrated */
+ schedule();
+
+ /* we are now on the target processor */
+ preempt_disable();
+
+ /* start recording costs again */
+ TS_LOCK_RESUME;
+
+ BUG_ON(smp_processor_id() != target_cpu);
+}
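+
+/* Under the DPCP, a lock request therefore migrates the requesting job to the
+ * processor hosting the semaphore, executes the critical section there at
+ * boosted priority, and migrates back to its home partition on unlock (see
+ * pfp_dpcp_lock() and pfp_dpcp_unlock() below).
+ */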
+
+int pfp_dpcp_lock(struct litmus_lock* l)
+{
+ struct task_struct* t = current;
+ struct dpcp_semaphore *sem = dpcp_from_lock(l);
+ int eprio = effective_agent_priority(get_priority(t));
+ int from = get_partition(t);
+ int to = sem->pcp.on_cpu;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ preempt_disable();
+
+ /* Priority-boost ourself *before* we suspend so that
+ * our priority is boosted when we resume. */
+
+ boost_priority(t, get_priority(t));
+
+ pfp_migrate_to(to);
+
+ pcp_raise_ceiling(&sem->pcp, eprio);
+
+ /* yep, we got it => execute request */
+ sem->owner_cpu = from;
+
+ preempt_enable();
+
+ return 0;
+}
+
+int pfp_dpcp_unlock(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct dpcp_semaphore *sem = dpcp_from_lock(l);
+ int err = 0;
+ int home;
+
+ preempt_disable();
+
+ if (sem->pcp.on_cpu != smp_processor_id() || sem->pcp.owner != t) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ home = sem->owner_cpu;
+
+ /* give it back */
+ pcp_lower_ceiling(&sem->pcp);
+
+ /* we lose the benefit of priority boosting */
+ unboost_priority(t);
+
+ pfp_migrate_to(home);
+
+out:
+ preempt_enable();
+
+ return err;
+}
+
+int pfp_dpcp_open(struct litmus_lock* l, void* __user config)
+{
+ struct task_struct *t = current;
+ struct dpcp_semaphore *sem = dpcp_from_lock(l);
+ int cpu, eprio;
+
+ if (!is_realtime(t))
+ /* we need to know the real-time priority */
+ return -EPERM;
+
+ if (get_user(cpu, (int*) config))
+ return -EFAULT;
+
+ /* make sure the resource location matches */
+ if (cpu != sem->pcp.on_cpu)
+ return -EINVAL;
+
+ eprio = effective_agent_priority(get_priority(t));
+
+ pcp_update_prio_ceiling(&sem->pcp, eprio);
+
+ return 0;
+}
+
+int pfp_dpcp_close(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct dpcp_semaphore *sem = dpcp_from_lock(l);
+ int owner = 0;
+
+ preempt_disable();
+
+ if (sem->pcp.on_cpu == smp_processor_id())
+ owner = sem->pcp.owner == t;
+
+ preempt_enable();
+
+ if (owner)
+ pfp_dpcp_unlock(l);
+
+ return 0;
+}
+
+void pfp_dpcp_free(struct litmus_lock* lock)
+{
+ kfree(dpcp_from_lock(lock));
+}
+
+static struct litmus_lock_ops pfp_dpcp_lock_ops = {
+ .close = pfp_dpcp_close,
+ .lock = pfp_dpcp_lock,
+ .open = pfp_dpcp_open,
+ .unlock = pfp_dpcp_unlock,
+ .deallocate = pfp_dpcp_free,
+};
+
+static struct litmus_lock* pfp_new_dpcp(int on_cpu)
+{
+ struct dpcp_semaphore* sem;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ sem->litmus_lock.ops = &pfp_dpcp_lock_ops;
+ sem->owner_cpu = NO_CPU;
+ pcp_init_semaphore(&sem->pcp, on_cpu);
+
+ return &sem->litmus_lock;
+}
+
+
+/* **** lock constructor **** */
+
+
+static long pfp_allocate_lock(struct litmus_lock **lock, int type,
+ void* __user config)
+{
+ int err = -ENXIO, cpu;
+ struct srp_semaphore* srp;
+
+ /* P-FP currently supports the SRP and PCP for local resources and the
+ * FMLP, MPCP, and DPCP for global resources. */
+ switch (type) {
+ case FMLP_SEM:
+ /* Flexible Multiprocessor Locking Protocol */
+ *lock = pfp_new_fmlp();
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+
+ case MPCP_SEM:
+ /* Multiprocessor Priority Ceiling Protocol */
+ *lock = pfp_new_mpcp(0);
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+
+ case MPCP_VS_SEM:
+ /* Multiprocessor Priority Ceiling Protocol with virtual spinning */
+ *lock = pfp_new_mpcp(1);
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+
+ case DPCP_SEM:
+ /* Distributed Priority Ceiling Protocol */
+ if (get_user(cpu, (int*) config))
+ return -EFAULT;
+
+ if (!cpu_online(cpu))
+ return -EINVAL;
+
+ *lock = pfp_new_dpcp(cpu);
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+
+ case SRP_SEM:
+ /* Baker's Stack Resource Policy */
+ srp = allocate_srp_semaphore();
+ if (srp) {
+ *lock = &srp->litmus_lock;
+ err = 0;
+ } else
+ err = -ENOMEM;
+ break;
+
+ case PCP_SEM:
+ /* Priority Ceiling Protocol */
+ if (get_user(cpu, (int*) config))
+ return -EFAULT;
+
+ if (!cpu_online(cpu))
+ return -EINVAL;
+
+ *lock = pfp_new_pcp(cpu);
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+ };
+
+ return err;
+}
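+
+/* Note: the 'config' pointer is interpreted per protocol; for PCP_SEM and
+ * DPCP_SEM it must point to an int naming the CPU on which the semaphore
+ * resides, while the other protocols ignore it. Userspace typically reaches
+ * this constructor through the FDSO layer (e.g., via liblitmus' od_open() and
+ * litmus_lock() wrappers, which are not part of this patch).
+ */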
+
+#endif
+
+static long pfp_admit_task(struct task_struct* tsk)
+{
+ if (task_cpu(tsk) == tsk->rt_param.task_params.cpu &&
+#ifdef CONFIG_RELEASE_MASTER
+ /* don't allow tasks on release master CPU */
+ task_cpu(tsk) != remote_dom(task_cpu(tsk))->release_master &&
+#endif
+ litmus_is_valid_fixed_prio(get_priority(tsk)))
+ return 0;
+ else
+ return -EINVAL;
+}
+
+static long pfp_activate_plugin(void)
+{
+#if defined(CONFIG_RELEASE_MASTER) || defined(CONFIG_LITMUS_LOCKING)
+ int cpu;
+#endif
+
+#ifdef CONFIG_RELEASE_MASTER
+ for_each_online_cpu(cpu) {
+ remote_dom(cpu)->release_master = atomic_read(&release_master_cpu);
+ }
+#endif
+
+#ifdef CONFIG_LITMUS_LOCKING
+ get_srp_prio = pfp_get_srp_prio;
+
+ for_each_online_cpu(cpu) {
+ init_waitqueue_head(&per_cpu(mpcpvs_vspin_wait, cpu));
+ per_cpu(mpcpvs_vspin, cpu) = NULL;
+
+ pcp_init_state(&per_cpu(pcp_state, cpu));
+ pfp_doms[cpu] = remote_pfp(cpu);
+ }
+
+#endif
+
+ return 0;
+}
+
+
+/* Plugin object */
+static struct sched_plugin pfp_plugin __cacheline_aligned_in_smp = {
+ .plugin_name = "P-FP",
+ .tick = pfp_tick,
+ .task_new = pfp_task_new,
+ .complete_job = complete_job,
+ .task_exit = pfp_task_exit,
+ .schedule = pfp_schedule,
+ .task_wake_up = pfp_task_wake_up,
+ .task_block = pfp_task_block,
+ .admit_task = pfp_admit_task,
+ .activate_plugin = pfp_activate_plugin,
+#ifdef CONFIG_LITMUS_LOCKING
+ .allocate_lock = pfp_allocate_lock,
+ .finish_switch = pfp_finish_switch,
+#endif
+};
+
+
+static int __init init_pfp(void)
+{
+ int i;
+
+ /* We do not really want to support CPU hotplug, do we? ;)
+ * However, if we were crazy enough to do so,
+ * we could not rely on num_online_cpus() as we do below.
+ */
+ for (i = 0; i < num_online_cpus(); i++) {
+ pfp_domain_init(remote_pfp(i), i);
+ }
+ return register_sched_plugin(&pfp_plugin);
+}
+
+module_init(init_pfp);
+
diff --git a/litmus/sched_plugin.c b/litmus/sched_plugin.c
new file mode 100644
index 0000000..00a1900
--- /dev/null
+++ b/litmus/sched_plugin.c
@@ -0,0 +1,227 @@
+/* sched_plugin.c -- core infrastructure for the scheduler plugin system
+ *
+ * This file includes the initialization of the plugin system, the no-op Linux
+ * scheduler plugin, some dummy functions, and some helper functions.
+ */
+
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/sched.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/preempt.h>
+#include <litmus/jobs.h>
+
+/*
+ * Generic function to trigger preemption on either local or remote cpu
+ * from scheduler plugins. The key feature is that this function is
+ * non-preemptive section aware and does not invoke the scheduler / send
+ * IPIs if the to-be-preempted task is actually non-preemptive.
+ */
+void preempt_if_preemptable(struct task_struct* t, int cpu)
+{
+ /* t is the real-time task executing on CPU cpu. If t is NULL, then
+ * cpu is currently scheduling background work.
+ */
+
+ int reschedule = 0;
+
+ if (!t)
+ /* move non-real-time task out of the way */
+ reschedule = 1;
+ else {
+ if (smp_processor_id() == cpu) {
+ /* local CPU case */
+ /* check if we need to poke userspace */
+ if (is_user_np(t))
+ /* Yes, poke it. This doesn't have to be atomic since
+ * the task is definitely not executing. */
+ request_exit_np(t);
+ else if (!is_kernel_np(t))
+ /* only if we are allowed to preempt the
+ * currently-executing task */
+ reschedule = 1;
+ } else {
+ /* Remote CPU case. Only notify if it's not a kernel
+ * NP section and if we didn't set the userspace
+ * flag. */
+ reschedule = !(is_kernel_np(t) || request_exit_np_atomic(t));
+ }
+ }
+ if (likely(reschedule))
+ litmus_reschedule(cpu);
+}
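+
+/* In short: a NULL (non-real-time) task is always preempted; on the local CPU
+ * we reschedule only if the task is fully preemptive (after flagging a
+ * user-space NP section); on a remote CPU we send an IPI only if neither a
+ * kernel NP section nor a successfully flagged user-space NP section defers
+ * the preemption.
+ */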
+
+
+/*************************************************************
+ * Dummy plugin functions *
+ *************************************************************/
+
+static void litmus_dummy_finish_switch(struct task_struct * prev)
+{
+}
+
+static struct task_struct* litmus_dummy_schedule(struct task_struct * prev)
+{
+ sched_state_task_picked();
+ return NULL;
+}
+
+static void litmus_dummy_tick(struct task_struct* tsk)
+{
+}
+
+static long litmus_dummy_admit_task(struct task_struct* tsk)
+{
+ printk(KERN_CRIT "LITMUS^RT: Linux plugin rejects %s/%d.\n",
+ tsk->comm, tsk->pid);
+ return -EINVAL;
+}
+
+static void litmus_dummy_task_new(struct task_struct *t, int on_rq, int running)
+{
+}
+
+static void litmus_dummy_task_wake_up(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_block(struct task_struct *task)
+{
+}
+
+static void litmus_dummy_task_exit(struct task_struct *task)
+{
+}
+
+static long litmus_dummy_complete_job(void)
+{
+ return -ENOSYS;
+}
+
+static long litmus_dummy_activate_plugin(void)
+{
+ return 0;
+}
+
+static long litmus_dummy_deactivate_plugin(void)
+{
+ return 0;
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+static long litmus_dummy_allocate_lock(struct litmus_lock **lock, int type,
+ void* __user config)
+{
+ return -ENXIO;
+}
+
+#endif
+
+
+/* The default scheduler plugin. It doesn't do anything and lets Linux do its
+ * job.
+ */
+struct sched_plugin linux_sched_plugin = {
+ .plugin_name = "Linux",
+ .tick = litmus_dummy_tick,
+ .task_new = litmus_dummy_task_new,
+ .task_exit = litmus_dummy_task_exit,
+ .task_wake_up = litmus_dummy_task_wake_up,
+ .task_block = litmus_dummy_task_block,
+ .complete_job = litmus_dummy_complete_job,
+ .schedule = litmus_dummy_schedule,
+ .finish_switch = litmus_dummy_finish_switch,
+ .activate_plugin = litmus_dummy_activate_plugin,
+ .deactivate_plugin = litmus_dummy_deactivate_plugin,
+#ifdef CONFIG_LITMUS_LOCKING
+ .allocate_lock = litmus_dummy_allocate_lock,
+#endif
+ .admit_task = litmus_dummy_admit_task
+};
+
+/*
+ * The reference to the current plugin that is used to schedule tasks within
+ * the system. It stores references to the actual function implementations.
+ * At runtime it is switched to one of the registered plugins, which are
+ * looked up by name via find_sched_plugin().
+ */
+struct sched_plugin *litmus = &linux_sched_plugin;
+
+/* the list of registered scheduling plugins */
+static LIST_HEAD(sched_plugins);
+static DEFINE_RAW_SPINLOCK(sched_plugins_lock);
+
+#define CHECK(func) {\
+ if (!plugin->func) \
+ plugin->func = litmus_dummy_ ## func;}
+
+/* FIXME: get reference to module */
+int register_sched_plugin(struct sched_plugin* plugin)
+{
+ printk(KERN_INFO "Registering LITMUS^RT plugin %s.\n",
+ plugin->plugin_name);
+
+ /* make sure we don't trip over null pointers later */
+ CHECK(finish_switch);
+ CHECK(schedule);
+ CHECK(tick);
+ CHECK(task_wake_up);
+ CHECK(task_exit);
+ CHECK(task_block);
+ CHECK(task_new);
+ CHECK(complete_job);
+ CHECK(activate_plugin);
+ CHECK(deactivate_plugin);
+#ifdef CONFIG_LITMUS_LOCKING
+ CHECK(allocate_lock);
+#endif
+ CHECK(admit_task);
+
+ if (!plugin->release_at)
+ plugin->release_at = release_at;
+
+ raw_spin_lock(&sched_plugins_lock);
+ list_add(&plugin->list, &sched_plugins);
+ raw_spin_unlock(&sched_plugins_lock);
+
+ return 0;
+}
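+
+/* Plugins call register_sched_plugin() from their module_init() hooks (see,
+ * e.g., init_psn_edf() below); the active plugin is later selected by name
+ * through the LITMUS^RT /proc interface (litmus_proc.c), which relies on
+ * find_sched_plugin() to locate it.
+ */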
+
+
+/* FIXME: reference counting, etc. */
+struct sched_plugin* find_sched_plugin(const char* name)
+{
+ struct list_head *pos;
+ struct sched_plugin *plugin;
+
+ raw_spin_lock(&sched_plugins_lock);
+ list_for_each(pos, &sched_plugins) {
+ plugin = list_entry(pos, struct sched_plugin, list);
+ if (!strcmp(plugin->plugin_name, name))
+ goto out_unlock;
+ }
+ plugin = NULL;
+
+out_unlock:
+ raw_spin_unlock(&sched_plugins_lock);
+ return plugin;
+}
+
+int print_sched_plugins(char* buf, int max)
+{
+ int count = 0;
+ struct list_head *pos;
+ struct sched_plugin *plugin;
+
+ raw_spin_lock(&sched_plugins_lock);
+ list_for_each(pos, &sched_plugins) {
+ plugin = list_entry(pos, struct sched_plugin, list);
+ count += snprintf(buf + count, max - count, "%s\n", plugin->plugin_name);
+ if (max - count <= 0)
+ break;
+ }
+ raw_spin_unlock(&sched_plugins_lock);
+ return count;
+}
diff --git a/litmus/sched_psn_edf.c b/litmus/sched_psn_edf.c
new file mode 100644
index 0000000..0e1675d
--- /dev/null
+++ b/litmus/sched_psn_edf.c
@@ -0,0 +1,653 @@
+/*
+ * kernel/sched_psn_edf.c
+ *
+ * Implementation of the PSN-EDF scheduler plugin.
+ * Based on kern/sched_part_edf.c and kern/sched_gsn_edf.c.
+ *
+ * Suspensions and non-preemptable sections are supported.
+ * Priority inheritance is not supported.
+ */
+
+#include <linux/percpu.h>
+#include <linux/sched.h>
+#include <linux/list.h>
+#include <linux/spinlock.h>
+#include <linux/module.h>
+
+#include <litmus/litmus.h>
+#include <litmus/jobs.h>
+#include <litmus/preempt.h>
+#include <litmus/budget.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/edf_common.h>
+#include <litmus/sched_trace.h>
+#include <litmus/trace.h>
+
+typedef struct {
+ rt_domain_t domain;
+ int cpu;
+ struct task_struct* scheduled; /* only RT tasks */
+/*
+ * scheduling lock slock
+ * protects the domain and serializes scheduling decisions
+ */
+#define slock domain.ready_lock
+
+} psnedf_domain_t;
+
+DEFINE_PER_CPU(psnedf_domain_t, psnedf_domains);
+
+#define local_edf (&__get_cpu_var(psnedf_domains).domain)
+#define local_pedf (&__get_cpu_var(psnedf_domains))
+#define remote_edf(cpu) (&per_cpu(psnedf_domains, cpu).domain)
+#define remote_pedf(cpu) (&per_cpu(psnedf_domains, cpu))
+#define task_edf(task) remote_edf(get_partition(task))
+#define task_pedf(task) remote_pedf(get_partition(task))
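+
+/* Convenience accessors: the local_* macros refer to the current CPU's
+ * partition, remote_*(cpu) to an arbitrary partition, and task_*(task) to the
+ * partition a task is assigned to via its task_params.cpu field.
+ */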
+
+
+static void psnedf_domain_init(psnedf_domain_t* pedf,
+ check_resched_needed_t check,
+ release_jobs_t release,
+ int cpu)
+{
+ edf_domain_init(&pedf->domain, check, release);
+ pedf->cpu = cpu;
+ pedf->scheduled = NULL;
+}
+
+static void requeue(struct task_struct* t, rt_domain_t *edf)
+{
+ if (t->state != TASK_RUNNING)
+ TRACE_TASK(t, "requeue: !TASK_RUNNING\n");
+
+ tsk_rt(t)->completed = 0;
+ if (is_released(t, litmus_clock()))
+ __add_ready(edf, t);
+ else
+ add_release(edf, t); /* it has got to wait */
+}
+
+/* we assume the lock is being held */
+static void preempt(psnedf_domain_t *pedf)
+{
+ preempt_if_preemptable(pedf->scheduled, pedf->cpu);
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+static void boost_priority(struct task_struct* t)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(t);
+ lt_t now;
+
+ raw_spin_lock_irqsave(&pedf->slock, flags);
+ now = litmus_clock();
+
+ TRACE_TASK(t, "priority boosted at %llu\n", now);
+
+ tsk_rt(t)->priority_boosted = 1;
+ tsk_rt(t)->boost_start_time = now;
+
+ if (pedf->scheduled != t) {
+ /* holder may be queued: first stop queue changes */
+ raw_spin_lock(&pedf->domain.release_lock);
+ if (is_queued(t) &&
+ /* If it is queued, then we need to re-order. */
+ bheap_decrease(edf_ready_order, tsk_rt(t)->heap_node) &&
+ /* If we bubbled to the top, then we need to check for preemptions. */
+ edf_preemption_needed(&pedf->domain, pedf->scheduled))
+ preempt(pedf);
+ raw_spin_unlock(&pedf->domain.release_lock);
+ } /* else: nothing to do since the job is not queued while scheduled */
+
+ raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+static void unboost_priority(struct task_struct* t)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(t);
+ lt_t now;
+
+ raw_spin_lock_irqsave(&pedf->slock, flags);
+ now = litmus_clock();
+
+ /* assumption: this only happens when the job is scheduled */
+ BUG_ON(pedf->scheduled != t);
+
+ TRACE_TASK(t, "priority restored at %llu\n", now);
+
+ /* priority boosted jobs must be scheduled */
+ BUG_ON(pedf->scheduled != t);
+
+ tsk_rt(t)->priority_boosted = 0;
+ tsk_rt(t)->boost_start_time = 0;
+
+ /* check if this changes anything */
+ if (edf_preemption_needed(&pedf->domain, pedf->scheduled))
+ preempt(pedf);
+
+ raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+#endif
+
+static int psnedf_preempt_check(psnedf_domain_t *pedf)
+{
+ if (edf_preemption_needed(&pedf->domain, pedf->scheduled)) {
+ preempt(pedf);
+ return 1;
+ } else
+ return 0;
+}
+
+/* This check is trivial in partitioned systems as we only have to consider
+ * the CPU of the partition.
+ */
+static int psnedf_check_resched(rt_domain_t *edf)
+{
+ psnedf_domain_t *pedf = container_of(edf, psnedf_domain_t, domain);
+
+ /* because this is a callback from rt_domain_t we already hold
+ * the necessary lock for the ready queue
+ */
+ return psnedf_preempt_check(pedf);
+}
+
+static void job_completion(struct task_struct* t, int forced)
+{
+ sched_trace_task_completion(t,forced);
+ TRACE_TASK(t, "job_completion().\n");
+
+ tsk_rt(t)->completed = 1;
+ prepare_for_next_period(t);
+}
+
+static void psnedf_tick(struct task_struct *t)
+{
+ psnedf_domain_t *pedf = local_pedf;
+
+ /* Check for inconsistency. We don't need the lock for this since
+ * ->scheduled is only changed in schedule, which obviously is not
+ * executing in parallel on this CPU
+ */
+ BUG_ON(is_realtime(t) && t != pedf->scheduled);
+
+ if (is_realtime(t) && budget_enforced(t) && budget_exhausted(t)) {
+ if (!is_np(t)) {
+ litmus_reschedule_local();
+ TRACE("psnedf_scheduler_tick: "
+ "%d is preemptable "
+ " => FORCE_RESCHED\n", t->pid);
+ } else if (is_user_np(t)) {
+ TRACE("psnedf_scheduler_tick: "
+ "%d is non-preemptable, "
+ "preemption delayed.\n", t->pid);
+ request_exit_np(t);
+ }
+ }
+}
+
+static struct task_struct* psnedf_schedule(struct task_struct * prev)
+{
+ psnedf_domain_t* pedf = local_pedf;
+ rt_domain_t* edf = &pedf->domain;
+ struct task_struct* next;
+
+ int out_of_time, sleep, preempt,
+ np, exists, blocks, resched;
+
+ raw_spin_lock(&pedf->slock);
+
+ /* sanity checking
+ * Unlike under G-EDF, when a task exits (is dead),
+ * pedf->scheduled may be NULL while prev _is_ a real-time task.
+ */
+ BUG_ON(pedf->scheduled && pedf->scheduled != prev);
+ BUG_ON(pedf->scheduled && !is_realtime(prev));
+
+ /* (0) Determine state */
+ exists = pedf->scheduled != NULL;
+ blocks = exists && !is_running(pedf->scheduled);
+ out_of_time = exists &&
+ budget_enforced(pedf->scheduled) &&
+ budget_exhausted(pedf->scheduled);
+ np = exists && is_np(pedf->scheduled);
+ sleep = exists && is_completed(pedf->scheduled);
+ preempt = edf_preemption_needed(edf, prev);
+
+ /* If we need to preempt do so.
+ * The following checks set resched to 1 in case of special
+ * circumstances.
+ */
+ resched = preempt;
+
+ /* If a task blocks we have no choice but to reschedule.
+ */
+ if (blocks)
+ resched = 1;
+
+ /* Request a sys_exit_np() call if we would like to preempt but cannot.
+ * Multiple calls to request_exit_np() don't hurt.
+ */
+ if (np && (out_of_time || preempt || sleep))
+ request_exit_np(pedf->scheduled);
+
+ /* Any task that is preemptable and either exhausts its execution
+ * budget or wants to sleep completes. We may have to reschedule after
+ * this.
+ */
+ if (!np && (out_of_time || sleep) && !blocks) {
+ job_completion(pedf->scheduled, !sleep);
+ resched = 1;
+ }
+
+ /* The final scheduling decision. Do we need to switch for some reason?
+ * Switch if we are in RT mode and have no task or if we need to
+ * resched.
+ */
+ next = NULL;
+ if ((!np || blocks) && (resched || !exists)) {
+ /* When preempting a task that does not block, then
+ * re-insert it into either the ready queue or the
+ * release queue (if it completed). requeue() picks
+ * the appropriate queue.
+ */
+ if (pedf->scheduled && !blocks)
+ requeue(pedf->scheduled, edf);
+ next = __take_ready(edf);
+ } else
+ /* Only override Linux scheduler if we have a real-time task
+ * scheduled that needs to continue.
+ */
+ if (exists)
+ next = prev;
+
+ if (next) {
+ TRACE_TASK(next, "scheduled at %llu\n", litmus_clock());
+ tsk_rt(next)->completed = 0;
+ } else {
+ TRACE("becoming idle at %llu\n", litmus_clock());
+ }
+
+ pedf->scheduled = next;
+ sched_state_task_picked();
+ raw_spin_unlock(&pedf->slock);
+
+ return next;
+}
+
+
+/* Prepare a task for running in RT mode
+ */
+static void psnedf_task_new(struct task_struct * t, int on_rq, int running)
+{
+ rt_domain_t* edf = task_edf(t);
+ psnedf_domain_t* pedf = task_pedf(t);
+ unsigned long flags;
+
+ TRACE_TASK(t, "psn edf: task new, cpu = %d\n",
+ t->rt_param.task_params.cpu);
+
+ /* setup job parameters */
+ release_at(t, litmus_clock());
+
+ /* The task must either be running or queued; otherwise the signal
+ * code will try to wake it up with fatal consequences.
+ */
+ raw_spin_lock_irqsave(&pedf->slock, flags);
+ if (running) {
+ /* there shouldn't be anything else running at the time */
+ BUG_ON(pedf->scheduled);
+ pedf->scheduled = t;
+ } else {
+ requeue(t, edf);
+ /* maybe we have to reschedule */
+ psnedf_preempt_check(pedf);
+ }
+ raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+static void psnedf_task_wake_up(struct task_struct *task)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(task);
+ rt_domain_t* edf = task_edf(task);
+ lt_t now;
+
+ TRACE_TASK(task, "wake_up at %llu\n", litmus_clock());
+ raw_spin_lock_irqsave(&pedf->slock, flags);
+ BUG_ON(is_queued(task));
+ now = litmus_clock();
+ if (is_tardy(task, now)
+#ifdef CONFIG_LITMUS_LOCKING
+ /* We need to take suspensions because of semaphores into
+ * account! If a job resumes after being suspended due to acquiring
+ * a semaphore, it should never be treated as a new job release.
+ */
+ && !is_priority_boosted(task)
+#endif
+ ) {
+ /* new sporadic release */
+ release_at(task, now);
+ sched_trace_task_release(task);
+ }
+
+ /* Only add to ready queue if it is not the currently-scheduled
+ * task. This could be the case if a task was woken up concurrently
+ * on a remote CPU before the executing CPU got around to actually
+ * de-scheduling the task, i.e., wake_up() raced with schedule()
+ * and won.
+ */
+ if (pedf->scheduled != task) {
+ requeue(task, edf);
+ psnedf_preempt_check(pedf);
+ }
+
+ raw_spin_unlock_irqrestore(&pedf->slock, flags);
+ TRACE_TASK(task, "wake up done\n");
+}
+
+static void psnedf_task_block(struct task_struct *t)
+{
+ /* only running tasks can block, thus t is in no queue */
+ TRACE_TASK(t, "block at %llu, state=%d\n", litmus_clock(), t->state);
+
+ BUG_ON(!is_realtime(t));
+ BUG_ON(is_queued(t));
+}
+
+static void psnedf_task_exit(struct task_struct * t)
+{
+ unsigned long flags;
+ psnedf_domain_t* pedf = task_pedf(t);
+ rt_domain_t* edf;
+
+ raw_spin_lock_irqsave(&pedf->slock, flags);
+ if (is_queued(t)) {
+ /* dequeue */
+ edf = task_edf(t);
+ remove(edf, t);
+ }
+ if (pedf->scheduled == t)
+ pedf->scheduled = NULL;
+
+ TRACE_TASK(t, "RIP, now reschedule\n");
+
+ preempt(pedf);
+ raw_spin_unlock_irqrestore(&pedf->slock, flags);
+}
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/fdso.h>
+#include <litmus/srp.h>
+
+/* ******************** SRP support ************************ */
+
+static unsigned int psnedf_get_srp_prio(struct task_struct* t)
+{
+ /* assumes implicit deadlines */
+ return get_rt_period(t);
+}
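+
+/* Under EDF with implicit deadlines, a shorter period implies a higher
+ * preemption level, so the period itself serves as the SRP priority
+ * (smaller value = higher level), mirroring the fixed-priority convention
+ * used by pfp_get_srp_prio() in sched_pfp.c.
+ */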
+
+/* ******************** FMLP support ********************** */
+
+/* struct for semaphore with priority inheritance */
+struct fmlp_semaphore {
+ struct litmus_lock litmus_lock;
+
+ /* current resource holder */
+ struct task_struct *owner;
+
+ /* FIFO queue of waiting tasks */
+ wait_queue_head_t wait;
+};
+
+static inline struct fmlp_semaphore* fmlp_from_lock(struct litmus_lock* lock)
+{
+ return container_of(lock, struct fmlp_semaphore, litmus_lock);
+}
+int psnedf_fmlp_lock(struct litmus_lock* l)
+{
+ struct task_struct* t = current;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ wait_queue_t wait;
+ unsigned long flags;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (sem->owner) {
+ /* resource is not free => must suspend and wait */
+
+ init_waitqueue_entry(&wait, t);
+
+ /* FIXME: interruptible would be nice some day */
+ set_task_state(t, TASK_UNINTERRUPTIBLE);
+
+ __add_wait_queue_tail_exclusive(&sem->wait, &wait);
+
+ TS_LOCK_SUSPEND;
+
+ /* release lock before sleeping */
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ /* We depend on the FIFO order. Thus, we don't need to recheck
+ * when we wake up; we are guaranteed to have the lock since
+ * there is only one wake up per release.
+ */
+
+ schedule();
+
+ TS_LOCK_RESUME;
+
+ /* Since we hold the lock, no other task will change
+ * ->owner. We can thus check it without acquiring the spin
+ * lock. */
+ BUG_ON(sem->owner != t);
+ } else {
+ /* it's ours now */
+ sem->owner = t;
+
+ /* mark the task as priority-boosted. */
+ boost_priority(t);
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ }
+
+ return 0;
+}
+
+int psnedf_fmlp_unlock(struct litmus_lock* l)
+{
+ struct task_struct *t = current, *next;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ unsigned long flags;
+ int err = 0;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ if (sem->owner != t) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ /* we lose the benefit of priority boosting */
+
+ unboost_priority(t);
+
+ /* check if there are jobs waiting for this resource */
+ next = __waitqueue_remove_first(&sem->wait);
+ if (next) {
+ /* boost next job */
+ boost_priority(next);
+
+ /* next becomes the resource holder */
+ sem->owner = next;
+
+ /* wake up next */
+ wake_up_process(next);
+ } else
+ /* resource becomes available */
+ sem->owner = NULL;
+
+out:
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+ return err;
+}
+
+int psnedf_fmlp_close(struct litmus_lock* l)
+{
+ struct task_struct *t = current;
+ struct fmlp_semaphore *sem = fmlp_from_lock(l);
+ unsigned long flags;
+
+ int owner;
+
+ spin_lock_irqsave(&sem->wait.lock, flags);
+
+ owner = sem->owner == t;
+
+ spin_unlock_irqrestore(&sem->wait.lock, flags);
+
+ if (owner)
+ psnedf_fmlp_unlock(l);
+
+ return 0;
+}
+
+void psnedf_fmlp_free(struct litmus_lock* lock)
+{
+ kfree(fmlp_from_lock(lock));
+}
+
+static struct litmus_lock_ops psnedf_fmlp_lock_ops = {
+ .close = psnedf_fmlp_close,
+ .lock = psnedf_fmlp_lock,
+ .unlock = psnedf_fmlp_unlock,
+ .deallocate = psnedf_fmlp_free,
+};
+
+static struct litmus_lock* psnedf_new_fmlp(void)
+{
+ struct fmlp_semaphore* sem;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ sem->owner = NULL;
+ init_waitqueue_head(&sem->wait);
+ sem->litmus_lock.ops = &psnedf_fmlp_lock_ops;
+
+ return &sem->litmus_lock;
+}
+
+/* **** lock constructor **** */
+
+
+static long psnedf_allocate_lock(struct litmus_lock **lock, int type,
+ void* __user unused)
+{
+ int err = -ENXIO;
+ struct srp_semaphore* srp;
+
+ /* PSN-EDF currently supports the SRP for local resources and the FMLP
+ * for global resources. */
+ switch (type) {
+ case FMLP_SEM:
+ /* Flexible Multiprocessor Locking Protocol */
+ *lock = psnedf_new_fmlp();
+ if (*lock)
+ err = 0;
+ else
+ err = -ENOMEM;
+ break;
+
+ case SRP_SEM:
+ /* Baker's Stack Resource Policy */
+ srp = allocate_srp_semaphore();
+ if (srp) {
+ *lock = &srp->litmus_lock;
+ err = 0;
+ } else
+ err = -ENOMEM;
+ break;
+ };
+
+ return err;
+}
+
+#endif
+
+
+static long psnedf_activate_plugin(void)
+{
+#ifdef CONFIG_RELEASE_MASTER
+ int cpu;
+
+ for_each_online_cpu(cpu) {
+ remote_edf(cpu)->release_master = atomic_read(&release_master_cpu);
+ }
+#endif
+
+#ifdef CONFIG_LITMUS_LOCKING
+ get_srp_prio = psnedf_get_srp_prio;
+#endif
+
+ return 0;
+}
+
+static long psnedf_admit_task(struct task_struct* tsk)
+{
+ if (task_cpu(tsk) == tsk->rt_param.task_params.cpu
+#ifdef CONFIG_RELEASE_MASTER
+ /* don't allow tasks on release master CPU */
+ && task_cpu(tsk) != remote_edf(task_cpu(tsk))->release_master
+#endif
+ )
+ return 0;
+ else
+ return -EINVAL;
+}
+
+/* Plugin object */
+static struct sched_plugin psn_edf_plugin __cacheline_aligned_in_smp = {
+ .plugin_name = "PSN-EDF",
+ .tick = psnedf_tick,
+ .task_new = psnedf_task_new,
+ .complete_job = complete_job,
+ .task_exit = psnedf_task_exit,
+ .schedule = psnedf_schedule,
+ .task_wake_up = psnedf_task_wake_up,
+ .task_block = psnedf_task_block,
+ .admit_task = psnedf_admit_task,
+ .activate_plugin = psnedf_activate_plugin,
+#ifdef CONFIG_LITMUS_LOCKING
+ .allocate_lock = psnedf_allocate_lock,
+#endif
+};
+
+
+static int __init init_psn_edf(void)
+{
+ int i;
+
+ /* We do not really want to support CPU hotplug, do we? ;)
+ * However, if we were crazy enough to do so,
+ * we could not rely on num_online_cpus() as we do below.
+ */
+ for (i = 0; i < num_online_cpus(); i++) {
+ psnedf_domain_init(remote_pedf(i),
+ psnedf_check_resched,
+ NULL, i);
+ }
+ return register_sched_plugin(&psn_edf_plugin);
+}
+
+module_init(init_psn_edf);
+
diff --git a/litmus/sched_task_trace.c b/litmus/sched_task_trace.c
new file mode 100644
index 0000000..5ef8d09
--- /dev/null
+++ b/litmus/sched_task_trace.c
@@ -0,0 +1,241 @@
+/*
+ * sched_task_trace.c -- record scheduling events to a byte stream
+ */
+
+#define NO_TASK_TRACE_DECLS
+
+#include <linux/module.h>
+#include <linux/sched.h>
+#include <linux/percpu.h>
+
+#include <litmus/ftdev.h>
+#include <litmus/litmus.h>
+
+#include <litmus/sched_trace.h>
+#include <litmus/feather_trace.h>
+#include <litmus/ftdev.h>
+
+
+#define NO_EVENTS (1 << CONFIG_SCHED_TASK_TRACE_SHIFT)
+
+#define now() litmus_clock()
+
+struct local_buffer {
+ struct st_event_record record[NO_EVENTS];
+ char flag[NO_EVENTS];
+ struct ft_buffer ftbuf;
+};
+
+DEFINE_PER_CPU(struct local_buffer, st_event_buffer);
+
+static struct ftdev st_dev;
+
+static int st_dev_can_open(struct ftdev *dev, unsigned int cpu)
+{
+ return cpu_online(cpu) ? 0 : -ENODEV;
+}
+
+static int __init init_sched_task_trace(void)
+{
+ struct local_buffer* buf;
+ int i, ok = 0, err;
+ printk("Allocated %u sched_trace_xxx() events per CPU "
+ "(buffer size: %d bytes)\n",
+ NO_EVENTS, (int) sizeof(struct local_buffer));
+
+ err = ftdev_init(&st_dev, THIS_MODULE,
+ num_online_cpus(), "sched_trace");
+ if (err)
+ goto err_out;
+
+ for (i = 0; i < st_dev.minor_cnt; i++) {
+ buf = &per_cpu(st_event_buffer, i);
+ ok += init_ft_buffer(&buf->ftbuf, NO_EVENTS,
+ sizeof(struct st_event_record),
+ buf->flag,
+ buf->record);
+ st_dev.minor[i].buf = &buf->ftbuf;
+ }
+ if (ok == st_dev.minor_cnt) {
+ st_dev.can_open = st_dev_can_open;
+ err = register_ftdev(&st_dev);
+ if (err)
+ goto err_dealloc;
+ } else {
+ err = -EINVAL;
+ goto err_dealloc;
+ }
+
+ return 0;
+
+err_dealloc:
+ ftdev_exit(&st_dev);
+err_out:
+ printk(KERN_WARNING "Could not register sched_trace module\n");
+ return err;
+}
+
+static void __exit exit_sched_task_trace(void)
+{
+ ftdev_exit(&st_dev);
+}
+
+module_init(init_sched_task_trace);
+module_exit(exit_sched_task_trace);
+
+
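+/* get_record() disables preemption via get_cpu_var(). On success, preemption
+ * stays disabled until the caller invokes put_record(); on failure, it is
+ * re-enabled before NULL is returned.
+ */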
+static inline struct st_event_record* get_record(u8 type, struct task_struct* t)
+{
+ struct st_event_record* rec = NULL;
+ struct local_buffer* buf;
+
+ buf = &get_cpu_var(st_event_buffer);
+ if (ft_buffer_start_write(&buf->ftbuf, (void**) &rec)) {
+ rec->hdr.type = type;
+ rec->hdr.cpu = smp_processor_id();
+ rec->hdr.pid = t ? t->pid : 0;
+ rec->hdr.job = t ? t->rt_param.job_params.job_no : 0;
+ } else {
+ put_cpu_var(st_event_buffer);
+ }
+ /* rec will be NULL if it failed */
+ return rec;
+}
+
+static inline void put_record(struct st_event_record* rec)
+{
+ struct local_buffer* buf;
+ buf = &__get_cpu_var(st_event_buffer);
+ ft_buffer_finish_write(&buf->ftbuf, rec);
+ put_cpu_var(st_event_buffer);
+}
+
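+/* The do_sched_trace_*() callbacks below are invoked through Feather-Trace
+ * triggers; the task pointer is passed as an unsigned long and cast back to
+ * a struct task_struct pointer.
+ */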
+feather_callback void do_sched_trace_task_name(unsigned long id, unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_NAME, t);
+ int i;
+ if (rec) {
+ for (i = 0; i < min(TASK_COMM_LEN, ST_NAME_LEN); i++)
+ rec->data.name.cmd[i] = t->comm[i];
+ put_record(rec);
+ }
+}
+
+feather_callback void do_sched_trace_task_param(unsigned long id, unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_PARAM, t);
+ if (rec) {
+ rec->data.param.wcet = get_exec_cost(t);
+ rec->data.param.period = get_rt_period(t);
+ rec->data.param.phase = get_rt_phase(t);
+ rec->data.param.partition = get_partition(t);
+ rec->data.param.class = get_class(t);
+ put_record(rec);
+ }
+}
+
+feather_callback void do_sched_trace_task_release(unsigned long id, unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_RELEASE, t);
+ if (rec) {
+ rec->data.release.release = get_release(t);
+ rec->data.release.deadline = get_deadline(t);
+ put_record(rec);
+ }
+}
+
+/* skipped: st_assigned_data, we don't use it atm */
+
+feather_callback void do_sched_trace_task_switch_to(unsigned long id,
+ unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec;
+ if (is_realtime(t)) {
+ rec = get_record(ST_SWITCH_TO, t);
+ if (rec) {
+ rec->data.switch_to.when = now();
+ rec->data.switch_to.exec_time = get_exec_time(t);
+ put_record(rec);
+ }
+ }
+}
+
+feather_callback void do_sched_trace_task_switch_away(unsigned long id,
+ unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec;
+ if (is_realtime(t)) {
+ rec = get_record(ST_SWITCH_AWAY, t);
+ if (rec) {
+ rec->data.switch_away.when = now();
+ rec->data.switch_away.exec_time = get_exec_time(t);
+ put_record(rec);
+ }
+ }
+}
+
+feather_callback void do_sched_trace_task_completion(unsigned long id,
+ unsigned long _task,
+ unsigned long forced)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_COMPLETION, t);
+ if (rec) {
+ rec->data.completion.when = now();
+ rec->data.completion.forced = forced;
+ put_record(rec);
+ }
+}
+
+feather_callback void do_sched_trace_task_block(unsigned long id,
+ unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_BLOCK, t);
+ if (rec) {
+ rec->data.block.when = now();
+ put_record(rec);
+ }
+}
+
+feather_callback void do_sched_trace_task_resume(unsigned long id,
+ unsigned long _task)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_RESUME, t);
+ if (rec) {
+ rec->data.resume.when = now();
+ put_record(rec);
+ }
+}
+
+feather_callback void do_sched_trace_sys_release(unsigned long id,
+ unsigned long _start)
+{
+ lt_t *start = (lt_t*) _start;
+ struct st_event_record* rec = get_record(ST_SYS_RELEASE, NULL);
+ if (rec) {
+ rec->data.sys_release.when = now();
+ rec->data.sys_release.release = *start;
+ put_record(rec);
+ }
+}
+
+feather_callback void do_sched_trace_action(unsigned long id,
+ unsigned long _task,
+ unsigned long action)
+{
+ struct task_struct *t = (struct task_struct*) _task;
+ struct st_event_record* rec = get_record(ST_ACTION, t);
+
+ if (rec) {
+ rec->data.action.when = now();
+ rec->data.action.action = action;
+ put_record(rec);
+ }
+}
diff --git a/litmus/sched_trace.c b/litmus/sched_trace.c
new file mode 100644
index 0000000..f4171fd
--- /dev/null
+++ b/litmus/sched_trace.c
@@ -0,0 +1,252 @@
+/*
+ * sched_trace.c -- record scheduling events to a byte stream.
+ */
+#include <linux/spinlock.h>
+#include <linux/mutex.h>
+
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include <linux/miscdevice.h>
+#include <asm/uaccess.h>
+#include <linux/module.h>
+#include <linux/sysrq.h>
+
+#include <linux/kfifo.h>
+
+#include <litmus/sched_trace.h>
+#include <litmus/litmus.h>
+
+#define SCHED_TRACE_NAME "litmus/log"
+
+/* Compute size of TRACE() buffer */
+#define LITMUS_TRACE_BUF_SIZE (1 << CONFIG_SCHED_DEBUG_TRACE_SHIFT)
+
+/* Max length of one read from the buffer */
+#define MAX_READ_LEN (64 * 1024)
+
+/* Max length for one write --- by TRACE() --- to the buffer. This is used to
+ * allocate a per-cpu buffer for printf() formatting. */
+#define MSG_SIZE 255
+
+
+static DEFINE_MUTEX(reader_mutex);
+static atomic_t reader_cnt = ATOMIC_INIT(0);
+static DEFINE_KFIFO(debug_buffer, char, LITMUS_TRACE_BUF_SIZE);
+
+
+static DEFINE_RAW_SPINLOCK(log_buffer_lock);
+static DEFINE_PER_CPU(char[MSG_SIZE], fmt_buffer);
+
+/*
+ * sched_trace_log_message - Write to the TRACE() buffer (the debug_buffer
+ * kfifo).
+ *
+ * This is the only function that writes to the buffer from inside the kernel.
+ * Concurrent writers are serialized via log_buffer_lock. A formatted message
+ * is truncated to at most MSG_SIZE (255) bytes.
+ */
+void sched_trace_log_message(const char* fmt, ...)
+{
+ unsigned long flags;
+ va_list args;
+ size_t len;
+ char* buf;
+
+ if (!atomic_read(&reader_cnt))
+ /* early exit if nobody is listening */
+ return;
+
+ va_start(args, fmt);
+ local_irq_save(flags);
+
+ /* format message */
+ buf = __get_cpu_var(fmt_buffer);
+ len = vscnprintf(buf, MSG_SIZE, fmt, args);
+
+ raw_spin_lock(&log_buffer_lock);
+	/* Don't copy the trailing null byte; we don't want null bytes in a
+	 * text file.
+	 */
+ kfifo_in(&debug_buffer, buf, len);
+ raw_spin_unlock(&log_buffer_lock);
+
+ local_irq_restore(flags);
+ va_end(args);
+}
+
+
+/*
+ * log_read - Read the trace buffer
+ *
+ * This function is called as a file operation from userspace.
+ * Readers can sleep. Access is serialized through reader_mutex
+ */
+static ssize_t log_read(struct file *filp,
+ char __user *to, size_t len,
+ loff_t *f_pos)
+{
+ /* we ignore f_pos, this is strictly sequential */
+
+ ssize_t error = -EINVAL;
+ char* mem;
+
+ if (mutex_lock_interruptible(&reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ if (len > MAX_READ_LEN)
+ len = MAX_READ_LEN;
+
+ mem = kmalloc(len, GFP_KERNEL);
+ if (!mem) {
+ error = -ENOMEM;
+ goto out_unlock;
+ }
+
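+	/* Poll the kfifo: while it is empty, sleep for up to 110 jiffies per
+	 * iteration and retry until data arrives or a signal is pending. */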
+ error = kfifo_out(&debug_buffer, mem, len);
+ while (!error) {
+ set_current_state(TASK_INTERRUPTIBLE);
+ schedule_timeout(110);
+ if (signal_pending(current))
+ error = -ERESTARTSYS;
+ else
+ error = kfifo_out(&debug_buffer, mem, len);
+ }
+
+ if (error > 0 && copy_to_user(to, mem, error))
+ error = -EFAULT;
+
+ kfree(mem);
+ out_unlock:
+ mutex_unlock(&reader_mutex);
+ out:
+ return error;
+}
+
+/*
+ * Enable redirection of printk() messages to the trace buffer.
+ * Defined in kernel/printk.c
+ */
+extern int trace_override;
+extern int trace_recurse;
+
+/*
+ * log_open - open the global log message ring buffer.
+ */
+static int log_open(struct inode *in, struct file *filp)
+{
+ int error = -EINVAL;
+
+ if (mutex_lock_interruptible(&reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ atomic_inc(&reader_cnt);
+ error = 0;
+
+ printk(KERN_DEBUG
+ "sched_trace kfifo with buffer starting at: 0x%p\n",
+ debug_buffer.buf);
+
+ /* override printk() */
+ trace_override++;
+
+ mutex_unlock(&reader_mutex);
+ out:
+ return error;
+}
+
+static int log_release(struct inode *in, struct file *filp)
+{
+ int error = -EINVAL;
+
+ if (mutex_lock_interruptible(&reader_mutex)) {
+ error = -ERESTARTSYS;
+ goto out;
+ }
+
+ atomic_dec(&reader_cnt);
+
+ /* release printk() overriding */
+ trace_override--;
+
+ printk(KERN_DEBUG "sched_trace kfifo released\n");
+
+ mutex_unlock(&reader_mutex);
+ out:
+ return error;
+}
+
+/*
+ * log_fops - The file operations for accessing the global LITMUS log message
+ * buffer.
+ *
+ * Except for opening the device file it uses the same operations as trace_fops.
+ */
+static struct file_operations log_fops = {
+ .owner = THIS_MODULE,
+ .open = log_open,
+ .release = log_release,
+ .read = log_read,
+};
+
+static struct miscdevice litmus_log_dev = {
+ .name = SCHED_TRACE_NAME,
+ .minor = MISC_DYNAMIC_MINOR,
+ .fops = &log_fops,
+};
+
+#ifdef CONFIG_MAGIC_SYSRQ
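+/* Drain the TRACE() kfifo to the console: print up to 'max' chunks of at
+ * most 79 characters each (max == 0 means no limit).
+ */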
+void dump_trace_buffer(int max)
+{
+ char line[80];
+ int len;
+ int count = 0;
+
+ /* potential, but very unlikely, race... */
+ trace_recurse = 1;
+ while ((max == 0 || count++ < max) &&
+	       (len = kfifo_out(&debug_buffer, line, sizeof(line) - 1)) > 0) {
+ line[len] = '\0';
+ printk("%s", line);
+ }
+ trace_recurse = 0;
+}
+
+static void sysrq_dump_trace_buffer(int key)
+{
+ dump_trace_buffer(100);
+}
+
+static struct sysrq_key_op sysrq_dump_trace_buffer_op = {
+ .handler = sysrq_dump_trace_buffer,
+ .help_msg = "dump-trace-buffer(Y)",
+ .action_msg = "writing content of TRACE() buffer",
+};
+#endif
+
+static int __init init_sched_trace(void)
+{
+ printk("Initializing TRACE() device\n");
+
+#ifdef CONFIG_MAGIC_SYSRQ
+ /* offer some debugging help */
+ if (!register_sysrq_key('y', &sysrq_dump_trace_buffer_op))
+ printk("Registered dump-trace-buffer(Y) magic sysrq.\n");
+ else
+ printk("Could not register dump-trace-buffer(Y) magic sysrq.\n");
+#endif
+
+ return misc_register(&litmus_log_dev);
+}
+
+static void __exit exit_sched_trace(void)
+{
+ misc_deregister(&litmus_log_dev);
+}
+
+module_init(init_sched_trace);
+module_exit(exit_sched_trace);
diff --git a/litmus/srp.c b/litmus/srp.c
new file mode 100644
index 0000000..2ed4ec1
--- /dev/null
+++ b/litmus/srp.c
@@ -0,0 +1,295 @@
+/* ************************************************************************** */
+/* STACK RESOURCE POLICY */
+/* ************************************************************************** */
+
+#include <asm/atomic.h>
+#include <linux/sched.h>
+#include <linux/wait.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/fdso.h>
+#include <litmus/trace.h>
+
+
+#ifdef CONFIG_LITMUS_LOCKING
+
+#include <litmus/srp.h>
+
+srp_prioritization_t get_srp_prio;
+
+struct srp {
+ struct list_head ceiling;
+ wait_queue_head_t ceiling_blocked;
+};
+#define system_ceiling(srp) list2prio(srp->ceiling.next)
+#define ceiling2sem(c) container_of(c, struct srp_semaphore, ceiling)
+
+#define UNDEF_SEM -2
+
+atomic_t srp_objects_in_use = ATOMIC_INIT(0);
+
+DEFINE_PER_CPU(struct srp, srp);
+
+/* Initialize SRP semaphores at boot time. */
+static int __init srp_init(void)
+{
+ int i;
+
+ printk("Initializing SRP per-CPU ceilings...");
+ for (i = 0; i < NR_CPUS; i++) {
+ init_waitqueue_head(&per_cpu(srp, i).ceiling_blocked);
+ INIT_LIST_HEAD(&per_cpu(srp, i).ceiling);
+ }
+ printk(" done!\n");
+
+ return 0;
+}
+module_init(srp_init);
+
+/* SRP task priority comparison function. Smaller numeric values have higher
+ * priority, tie-break is PID. Special case: priority == 0 <=> no priority
+ */
+static int srp_higher_prio(struct srp_priority* first,
+ struct srp_priority* second)
+{
+ if (!first->priority)
+ return 0;
+ else
+ return !second->priority ||
+ first->priority < second->priority || (
+ first->priority == second->priority &&
+ first->pid < second->pid);
+}
+
+
+static int srp_exceeds_ceiling(struct task_struct* first,
+ struct srp* srp)
+{
+ struct srp_priority prio;
+
+ if (list_empty(&srp->ceiling))
+ return 1;
+ else {
+ prio.pid = first->pid;
+ prio.priority = get_srp_prio(first);
+ return srp_higher_prio(&prio, system_ceiling(srp)) ||
+ ceiling2sem(system_ceiling(srp))->owner == first;
+ }
+}
+
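+/* Insert prio into the per-CPU ceiling list, keeping the list ordered so that
+ * the highest-priority entry (the current system ceiling) is at the head.
+ */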
+static void srp_add_prio(struct srp* srp, struct srp_priority* prio)
+{
+ struct list_head *pos;
+ if (in_list(&prio->list)) {
+ printk(KERN_CRIT "WARNING: SRP violation detected, prio is already in "
+ "ceiling list! cpu=%d, srp=%p\n", smp_processor_id(), ceiling2sem(prio));
+ return;
+ }
+ list_for_each(pos, &srp->ceiling)
+ if (unlikely(srp_higher_prio(prio, list2prio(pos)))) {
+ __list_add(&prio->list, pos->prev, pos);
+ return;
+ }
+
+ list_add_tail(&prio->list, &srp->ceiling);
+}
+
+
+static int lock_srp_semaphore(struct litmus_lock* l)
+{
+ struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+
+ if (!is_realtime(current))
+ return -EPERM;
+
+ preempt_disable();
+
+ /* Update ceiling. */
+ srp_add_prio(&__get_cpu_var(srp), &sem->ceiling);
+
+ /* SRP invariant: all resources available */
+ BUG_ON(sem->owner != NULL);
+
+ sem->owner = current;
+ TRACE_CUR("acquired srp 0x%p\n", sem);
+
+ preempt_enable();
+
+ return 0;
+}
+
+static int unlock_srp_semaphore(struct litmus_lock* l)
+{
+ struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+ int err = 0;
+
+ preempt_disable();
+
+ if (sem->owner != current) {
+ err = -EINVAL;
+ } else {
+ /* Determine new system priority ceiling for this CPU. */
+ BUG_ON(!in_list(&sem->ceiling.list));
+
+ list_del(&sem->ceiling.list);
+ sem->owner = NULL;
+
+ /* Wake tasks on this CPU, if they exceed current ceiling. */
+ TRACE_CUR("released srp 0x%p\n", sem);
+ wake_up_all(&__get_cpu_var(srp).ceiling_blocked);
+ }
+
+ preempt_enable();
+ return err;
+}
+
+static int open_srp_semaphore(struct litmus_lock* l, void* __user arg)
+{
+ struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+ int err = 0;
+ struct task_struct* t = current;
+ struct srp_priority t_prio;
+
+ if (!is_realtime(t))
+ return -EPERM;
+
+ TRACE_CUR("opening SRP semaphore %p, cpu=%d\n", sem, sem->cpu);
+
+ preempt_disable();
+
+ if (sem->owner != NULL)
+ err = -EBUSY;
+
+ if (err == 0) {
+ if (sem->cpu == UNDEF_SEM)
+ sem->cpu = get_partition(t);
+ else if (sem->cpu != get_partition(t))
+ err = -EPERM;
+ }
+
+ if (err == 0) {
+ t_prio.priority = get_srp_prio(t);
+ t_prio.pid = t->pid;
+ if (srp_higher_prio(&t_prio, &sem->ceiling)) {
+ sem->ceiling.priority = t_prio.priority;
+ sem->ceiling.pid = t_prio.pid;
+ }
+ }
+
+ preempt_enable();
+
+ return err;
+}
+
+static int close_srp_semaphore(struct litmus_lock* l)
+{
+ struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+ int err = 0;
+
+ preempt_disable();
+
+ if (sem->owner == current)
+ unlock_srp_semaphore(l);
+
+ preempt_enable();
+
+ return err;
+}
+
+static void deallocate_srp_semaphore(struct litmus_lock* l)
+{
+ struct srp_semaphore* sem = container_of(l, struct srp_semaphore, litmus_lock);
+ atomic_dec(&srp_objects_in_use);
+ kfree(sem);
+}
+
+static struct litmus_lock_ops srp_lock_ops = {
+ .open = open_srp_semaphore,
+ .close = close_srp_semaphore,
+ .lock = lock_srp_semaphore,
+ .unlock = unlock_srp_semaphore,
+ .deallocate = deallocate_srp_semaphore,
+};
+
+struct srp_semaphore* allocate_srp_semaphore(void)
+{
+ struct srp_semaphore* sem;
+
+ sem = kmalloc(sizeof(*sem), GFP_KERNEL);
+ if (!sem)
+ return NULL;
+
+ INIT_LIST_HEAD(&sem->ceiling.list);
+ sem->ceiling.priority = 0;
+ sem->cpu = UNDEF_SEM;
+ sem->owner = NULL;
+
+ sem->litmus_lock.ops = &srp_lock_ops;
+
+ atomic_inc(&srp_objects_in_use);
+ return sem;
+}
+
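+/* Custom wake function for the per-CPU ceiling_blocked wait queue: wake the
+ * task only if it now exceeds the system ceiling on its partition; otherwise
+ * leave it queued.
+ */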
+static int srp_wake_up(wait_queue_t *wait, unsigned mode, int sync,
+ void *key)
+{
+ int cpu = smp_processor_id();
+ struct task_struct *tsk = wait->private;
+ if (cpu != get_partition(tsk))
+		TRACE_TASK(tsk, "srp_wake_up on wrong cpu, partition is %d\n",
+ get_partition(tsk));
+ else if (srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+ return default_wake_function(wait, mode, sync, key);
+ return 0;
+}
+
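+/* Block the current task on this CPU's ceiling_blocked wait queue. The
+ * srp_non_recurse flag keeps srp_ceiling_block() from re-entering while the
+ * task is suspended inside schedule().
+ */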
+static void do_ceiling_block(struct task_struct *tsk)
+{
+ wait_queue_t wait = {
+ .private = tsk,
+ .func = srp_wake_up,
+ .task_list = {NULL, NULL}
+ };
+
+ tsk->state = TASK_UNINTERRUPTIBLE;
+ add_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+ tsk->rt_param.srp_non_recurse = 1;
+ preempt_enable_no_resched();
+ schedule();
+ preempt_disable();
+ tsk->rt_param.srp_non_recurse = 0;
+ remove_wait_queue(&__get_cpu_var(srp).ceiling_blocked, &wait);
+}
+
+/* Wait for current task priority to exceed system-wide priority ceiling.
+ * FIXME: the hotpath should be inline.
+ */
+void srp_ceiling_block(void)
+{
+ struct task_struct *tsk = current;
+
+ /* Only applies to real-time tasks, but optimize for RT tasks. */
+ if (unlikely(!is_realtime(tsk)))
+ return;
+
+ /* Avoid recursive ceiling blocking. */
+ if (unlikely(tsk->rt_param.srp_non_recurse))
+ return;
+
+ /* Bail out early if there aren't any SRP resources around. */
+ if (likely(!atomic_read(&srp_objects_in_use)))
+ return;
+
+ preempt_disable();
+ if (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp))) {
+ TRACE_CUR("is priority ceiling blocked.\n");
+ while (!srp_exceeds_ceiling(tsk, &__get_cpu_var(srp)))
+ do_ceiling_block(tsk);
+ TRACE_CUR("finally exceeds system ceiling.\n");
+ } else
+ TRACE_CUR("is not priority ceiling blocked\n");
+ preempt_enable();
+}
+
+#endif
diff --git a/litmus/sync.c b/litmus/sync.c
new file mode 100644
index 0000000..873b3ff
--- /dev/null
+++ b/litmus/sync.c
@@ -0,0 +1,152 @@
+/* litmus/sync.c - Support for synchronous and asynchronous task system
+ * releases.
+ */
+
+#include <asm/atomic.h>
+#include <asm/uaccess.h>
+#include <linux/spinlock.h>
+#include <linux/list.h>
+#include <linux/sched.h>
+#include <linux/completion.h>
+
+#include <litmus/litmus.h>
+#include <litmus/sched_plugin.h>
+#include <litmus/jobs.h>
+
+#include <litmus/sched_trace.h>
+
+struct ts_release_wait {
+ struct list_head list;
+ struct completion completion;
+ lt_t ts_release_time;
+};
+
+#define DECLARE_TS_RELEASE_WAIT(symb) \
+ struct ts_release_wait symb = \
+ { \
+ LIST_HEAD_INIT(symb.list), \
+ COMPLETION_INITIALIZER_ONSTACK(symb.completion), \
+ 0 \
+ }
+
+static LIST_HEAD(task_release_list);
+static DEFINE_MUTEX(task_release_lock);
+
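+/* Enqueue the calling task on task_release_list and sleep on a completion
+ * until do_release_ts() hands out the common release time.
+ */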
+static long do_wait_for_ts_release(void)
+{
+ DECLARE_TS_RELEASE_WAIT(wait);
+
+ long ret = -ERESTARTSYS;
+
+ if (mutex_lock_interruptible(&task_release_lock))
+ goto out;
+
+ list_add(&wait.list, &task_release_list);
+
+ mutex_unlock(&task_release_lock);
+
+ /* We are enqueued, now we wait for someone to wake us up. */
+ ret = wait_for_completion_interruptible(&wait.completion);
+
+ if (!ret) {
+ /* Completion succeeded, setup release. */
+ litmus->release_at(current, wait.ts_release_time
+ + current->rt_param.task_params.phase
+ - current->rt_param.task_params.period);
+ /* trigger advance to next job release at the programmed time */
+ ret = complete_job();
+ } else {
+ /* We were interrupted, must cleanup list. */
+ mutex_lock(&task_release_lock);
+ if (!wait.completion.done)
+ list_del(&wait.list);
+ mutex_unlock(&task_release_lock);
+ }
+
+out:
+ return ret;
+}
+
+int count_tasks_waiting_for_release(void)
+{
+ int task_count = 0;
+ struct list_head *pos;
+
+ mutex_lock(&task_release_lock);
+
+ list_for_each(pos, &task_release_list) {
+ task_count++;
+ }
+
+ mutex_unlock(&task_release_lock);
+
+
+ return task_count;
+}
+
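+/* Complete every waiter registered on task_release_list, handing each one the
+ * common release time 'start'. Returns the number of tasks released.
+ */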
+static long do_release_ts(lt_t start)
+{
+ long task_count = 0;
+
+ struct list_head *pos;
+ struct ts_release_wait *wait;
+
+ if (mutex_lock_interruptible(&task_release_lock)) {
+ task_count = -ERESTARTSYS;
+ goto out;
+ }
+
+ TRACE("<<<<<< synchronous task system release >>>>>>\n");
+ sched_trace_sys_release(&start);
+
+ task_count = 0;
+ list_for_each(pos, &task_release_list) {
+ wait = (struct ts_release_wait*)
+ list_entry(pos, struct ts_release_wait, list);
+
+ task_count++;
+ wait->ts_release_time = start;
+ complete(&wait->completion);
+ }
+
+ /* clear stale list */
+ INIT_LIST_HEAD(&task_release_list);
+
+ mutex_unlock(&task_release_lock);
+
+out:
+ return task_count;
+}
+
+
+asmlinkage long sys_wait_for_ts_release(void)
+{
+ long ret = -EPERM;
+ struct task_struct *t = current;
+
+ if (is_realtime(t))
+ ret = do_wait_for_ts_release();
+
+ return ret;
+}
+
+#define ONE_MS 1000000
+
+asmlinkage long sys_release_ts(lt_t __user *__delay)
+{
+ long ret;
+ lt_t delay;
+ lt_t start_time;
+
+ /* FIXME: check capabilities... */
+
+ ret = copy_from_user(&delay, __delay, sizeof(delay));
+ if (ret == 0) {
+ /* round up to next larger integral millisecond */
+ start_time = ((litmus_clock() / ONE_MS) + 1) * ONE_MS;
+ ret = do_release_ts(start_time + delay);
+ }
+
+ return ret;
+}
diff --git a/litmus/trace.c b/litmus/trace.c
new file mode 100644
index 0000000..7dbb98e
--- /dev/null
+++ b/litmus/trace.c
@@ -0,0 +1,300 @@
+#include <linux/sched.h>
+#include <linux/module.h>
+#include <linux/uaccess.h>
+
+#include <litmus/ftdev.h>
+#include <litmus/litmus.h>
+#include <litmus/trace.h>
+
+/******************************************************************************/
+/* Allocation */
+/******************************************************************************/
+
+static struct ftdev overhead_dev;
+
+#define trace_ts_buf overhead_dev.minor[0].buf
+
+static unsigned int ts_seq_no = 0;
+
+DEFINE_PER_CPU(atomic_t, irq_fired_count);
+
+void ft_irq_fired(void)
+{
+ /* Only called with preemptions disabled. */
+ atomic_inc(&__get_cpu_var(irq_fired_count));
+
+ if (has_control_page(current))
+ get_control_page(current)->irq_count++;
+}
+
+static inline void clear_irq_fired(void)
+{
+ atomic_set(&__raw_get_cpu_var(irq_fired_count), 0);
+}
+
+static inline unsigned int get_and_clear_irq_fired(void)
+{
+ /* This is potentially not atomic since we might migrate if
+ * preemptions are not disabled. As a tradeoff between
+ * accuracy and tracing overheads, this seems acceptable.
+ * If it proves to be a problem, then one could add a callback
+ * from the migration code to invalidate irq_fired_count.
+ */
+ return atomic_xchg(&__raw_get_cpu_var(irq_fired_count), 0);
+}
+
+static inline void save_irq_flags(struct timestamp *ts, unsigned int irq_count)
+{
+ /* Store how many interrupts occurred. */
+ ts->irq_count = irq_count;
+ /* Extra flag because ts->irq_count overflows quickly. */
+ ts->irq_flag = irq_count > 0;
+}
+
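+/* Central timestamp writer. If record_irq is set, the IRQ count is read from
+ * the per-CPU counter (with hide_irq subtracting one expected interrupt); if
+ * record_timestamp is set, the timestamp is taken via ft_timestamp() instead
+ * of the caller-supplied value.
+ */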
+static inline void write_timestamp(uint8_t event,
+ uint8_t type,
+ uint8_t cpu,
+ uint16_t pid_fragment,
+ unsigned int irq_count,
+ int record_irq,
+ int hide_irq,
+ uint64_t timestamp,
+ int record_timestamp)
+{
+ unsigned long flags;
+ unsigned int seq_no;
+ struct timestamp *ts;
+
+ /* Avoid preemptions while recording the timestamp. This reduces the
+ * number of "out of order" timestamps in the stream and makes
+ * post-processing easier. */
+
+ local_irq_save(flags);
+
+ seq_no = fetch_and_inc((int *) &ts_seq_no);
+ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
+ ts->event = event;
+ ts->seq_no = seq_no;
+
+ ts->task_type = type;
+ ts->pid = pid_fragment;
+
+ ts->cpu = cpu;
+
+ if (record_irq)
+ irq_count = get_and_clear_irq_fired();
+
+ save_irq_flags(ts, irq_count - hide_irq);
+
+ if (record_timestamp)
+ timestamp = ft_timestamp();
+
+ ts->timestamp = timestamp;
+ ft_buffer_finish_write(trace_ts_buf, ts);
+ }
+
+ local_irq_restore(flags);
+}
+
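+/* Insert a timestamp that was pre-recorded in userspace, overriding its
+ * sequence number, CPU, and IRQ flags with current kernel-side values.
+ */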
+static void __add_timestamp_user(struct timestamp *pre_recorded)
+{
+ unsigned long flags;
+ unsigned int seq_no;
+ struct timestamp *ts;
+
+
+ local_irq_save(flags);
+
+ seq_no = fetch_and_inc((int *) &ts_seq_no);
+ if (ft_buffer_start_write(trace_ts_buf, (void**) &ts)) {
+ *ts = *pre_recorded;
+ ts->seq_no = seq_no;
+ ts->cpu = raw_smp_processor_id();
+ save_irq_flags(ts, get_and_clear_irq_fired());
+ ft_buffer_finish_write(trace_ts_buf, ts);
+ }
+
+ local_irq_restore(flags);
+}
+
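+/* Default Feather-Trace hook: record the current CPU's IRQ count and take a
+ * fresh timestamp (record_irq = 1, record_timestamp = 1).
+ */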
+feather_callback void save_timestamp(unsigned long event)
+{
+ write_timestamp(event, TSK_UNKNOWN,
+ raw_smp_processor_id(),
+ current->pid,
+ 0, 1, 0,
+ 0, 1);
+}
+
+feather_callback void save_timestamp_def(unsigned long event,
+ unsigned long type)
+{
+ write_timestamp(event, type,
+ raw_smp_processor_id(),
+ current->pid,
+ 0, 1, 0,
+ 0, 1);
+}
+
+feather_callback void save_timestamp_task(unsigned long event,
+ unsigned long t_ptr)
+{
+ struct task_struct *t = (struct task_struct *) t_ptr;
+ int rt = is_realtime(t);
+
+ write_timestamp(event, rt ? TSK_RT : TSK_BE,
+ raw_smp_processor_id(),
+ t->pid,
+ 0, 1, 0,
+ 0, 1);
+}
+
+feather_callback void save_timestamp_cpu(unsigned long event,
+ unsigned long cpu)
+{
+ write_timestamp(event, TSK_UNKNOWN, cpu, current->pid,
+ 0, 1, 0,
+ 0, 1);
+}
+
+feather_callback void save_task_latency(unsigned long event,
+ unsigned long when_ptr)
+{
+ lt_t now = litmus_clock();
+ lt_t *when = (lt_t*) when_ptr;
+
+ write_timestamp(event, TSK_RT, raw_smp_processor_id(), 0,
+ 0, 1, 0,
+ now - *when, 0);
+}
+
+/* fake timestamp to user-reported time */
+feather_callback void save_timestamp_time(unsigned long event,
+ unsigned long ptr)
+{
+ uint64_t* time = (uint64_t*) ptr;
+
+ write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
+ raw_smp_processor_id(), current->pid,
+ 0, 1, 0,
+ *time, 0);
+}
+
+/* Record user-reported IRQ count */
+feather_callback void save_timestamp_irq(unsigned long event,
+ unsigned long irq_counter_ptr)
+{
+ uint64_t* irqs = (uint64_t*) irq_counter_ptr;
+
+ write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
+ raw_smp_processor_id(), current->pid,
+ *irqs, 0, 0,
+ 0, 1);
+}
+
+/* Suppress one IRQ from the irq count. Used by TS_SEND_RESCHED_END, which is
+ * called from within an interrupt that is expected. */
+feather_callback void save_timestamp_hide_irq(unsigned long event)
+{
+ write_timestamp(event, is_realtime(current) ? TSK_RT : TSK_BE,
+ raw_smp_processor_id(), current->pid,
+ 0, 1, 1,
+ 0, 1);
+}
+
+/******************************************************************************/
+/* DEVICE FILE DRIVER */
+/******************************************************************************/
+
+/*
+ * Should be 8M; this is the most we can ask of the buddy system allocator
+ * (MAX_ORDER), and we might not get even that much, so the allocation loop
+ * below halves the request until it succeeds.
+ */
+#define NO_TIMESTAMPS (2 << 16)
+
+static int alloc_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
+{
+ unsigned int count = NO_TIMESTAMPS;
+
+ /* An overhead-tracing timestamp should be exactly 16 bytes long. */
+ BUILD_BUG_ON(sizeof(struct timestamp) != 16);
+
+ while (count && !trace_ts_buf) {
+ printk("time stamp buffer: trying to allocate %u time stamps.\n", count);
+ ftdev->minor[idx].buf = alloc_ft_buffer(count, sizeof(struct timestamp));
+ count /= 2;
+ }
+ return ftdev->minor[idx].buf ? 0 : -ENOMEM;
+}
+
+static void free_timestamp_buffer(struct ftdev* ftdev, unsigned int idx)
+{
+ free_ft_buffer(ftdev->minor[idx].buf);
+ ftdev->minor[idx].buf = NULL;
+}
+
+static ssize_t write_timestamp_from_user(struct ft_buffer* buf, size_t len,
+ const char __user *from)
+{
+ ssize_t consumed = 0;
+ struct timestamp ts;
+
+ /* don't give us partial timestamps */
+ if (len % sizeof(ts))
+ return -EINVAL;
+
+ while (len >= sizeof(ts)) {
+ if (copy_from_user(&ts, from, sizeof(ts))) {
+ consumed = -EFAULT;
+ goto out;
+ }
+ len -= sizeof(ts);
+ from += sizeof(ts);
+ consumed += sizeof(ts);
+
+ __add_timestamp_user(&ts);
+ }
+
+out:
+ return consumed;
+}
+
+static int __init init_ft_overhead_trace(void)
+{
+ int err, cpu;
+
+ printk("Initializing Feather-Trace overhead tracing device.\n");
+ err = ftdev_init(&overhead_dev, THIS_MODULE, 1, "ft_trace");
+ if (err)
+ goto err_out;
+
+ overhead_dev.alloc = alloc_timestamp_buffer;
+ overhead_dev.free = free_timestamp_buffer;
+ overhead_dev.write = write_timestamp_from_user;
+
+ err = register_ftdev(&overhead_dev);
+ if (err)
+ goto err_dealloc;
+
+	/* initialize the per-CPU IRQ counters */
+	for (cpu = 0; cpu < NR_CPUS; cpu++)
+		atomic_set(&per_cpu(irq_fired_count, cpu), 0);
+
+ return 0;
+
+err_dealloc:
+ ftdev_exit(&overhead_dev);
+err_out:
+ printk(KERN_WARNING "Could not register ft_trace module.\n");
+ return err;
+}
+
+static void __exit exit_ft_overhead_trace(void)
+{
+ ftdev_exit(&overhead_dev);
+}
+
+module_init(init_ft_overhead_trace);
+module_exit(exit_ft_overhead_trace);