lottery
/* | |
* linux/kernel/fork.c | |
* | |
* Copyright (C) 1991, 1992 Linus Torvalds | |
*/ | |
/* | |
* 'fork.c' contains the help-routines for the 'fork' system call | |
* (see also entry.S and others). | |
* Fork is rather simple, once you get the hang of it, but the memory | |
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | |
*/ | |
#include <linux/config.h> | |
#include <linux/slab.h> | |
#include <linux/init.h> | |
#include <linux/unistd.h> | |
#include <linux/smp_lock.h> | |
#include <linux/module.h> | |
#include <linux/vmalloc.h> | |
#include <linux/completion.h> | |
#include <linux/namespace.h> | |
#include <linux/personality.h> | |
#include <linux/compiler.h> | |
#include <asm/pgtable.h> | |
#include <asm/pgalloc.h> | |
#include <asm/uaccess.h> | |
#include <asm/mmu_context.h> | |
/* The idle threads do not count.. */ | |
int nr_threads; | |
int nr_running; | |
int max_threads; | |
unsigned long total_forks; /* Handle normal Linux uptimes. */ | |
int last_pid; | |
struct task_struct *pidhash[PIDHASH_SZ]; | |
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) | |
{ | |
unsigned long flags; | |
wait->flags &= ~WQ_FLAG_EXCLUSIVE; | |
wq_write_lock_irqsave(&q->lock, flags); | |
__add_wait_queue(q, wait); | |
wq_write_unlock_irqrestore(&q->lock, flags); | |
} | |
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) | |
{ | |
unsigned long flags; | |
wait->flags |= WQ_FLAG_EXCLUSIVE; | |
wq_write_lock_irqsave(&q->lock, flags); | |
__add_wait_queue_tail(q, wait); | |
wq_write_unlock_irqrestore(&q->lock, flags); | |
} | |
void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) | |
{ | |
unsigned long flags; | |
wq_write_lock_irqsave(&q->lock, flags); | |
__remove_wait_queue(q, wait); | |
wq_write_unlock_irqrestore(&q->lock, flags); | |
} | |
void __init fork_init(unsigned long mempages) | |
{ | |
/* | |
* The default maximum number of threads is set to a safe | |
* value: the thread structures can take up at most half | |
* of memory. | |
*/ | |
max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; | |
init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | |
init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | |
} | |
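/*
* A worked example of the limit above, assuming x86-style values of
* PAGE_SIZE = 4KB and THREAD_SIZE = 8KB (so THREAD_SIZE/PAGE_SIZE == 2):
* with 128MB of RAM, mempages == 32768, giving max_threads = 32768/2/8 =
* 2048 and an RLIMIT_NPROC of 1024 for the init task.
*/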
/* Protects next_safe and last_pid. */ | |
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED; | |
static int get_pid(unsigned long flags) | |
{ | |
static int next_safe = PID_MAX; | |
struct task_struct *p; | |
int pid, beginpid; | |
if (flags & CLONE_PID) | |
return current->pid; | |
spin_lock(&lastpid_lock); | |
beginpid = last_pid; | |
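/* The mask below catches any PID of 32768 (0x8000) or above: PIDs are kept
* to 15 bits, and on wrap-around allocation restarts at 300 so the low PIDs
* typically held by long-lived daemons are skipped. */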
if((++last_pid) & 0xffff8000) { | |
last_pid = 300; /* Skip daemons etc. */ | |
goto inside; | |
} | |
if(last_pid >= next_safe) { | |
inside: | |
next_safe = PID_MAX; | |
read_lock(&tasklist_lock); | |
repeat: | |
for_each_task(p) { | |
if(p->pid == last_pid || | |
p->pgrp == last_pid || | |
p->tgid == last_pid || | |
p->session == last_pid) { | |
if(++last_pid >= next_safe) { | |
if(last_pid & 0xffff8000) | |
last_pid = 300; | |
next_safe = PID_MAX; | |
} | |
if(unlikely(last_pid == beginpid)) | |
goto nomorepids; | |
goto repeat; | |
} | |
if(p->pid > last_pid && next_safe > p->pid) | |
next_safe = p->pid; | |
if(p->pgrp > last_pid && next_safe > p->pgrp) | |
next_safe = p->pgrp; | |
if(p->tgid > last_pid && next_safe > p->tgid) | |
next_safe = p->tgid; | |
if(p->session > last_pid && next_safe > p->session) | |
next_safe = p->session; | |
} | |
read_unlock(&tasklist_lock); | |
} | |
pid = last_pid; | |
spin_unlock(&lastpid_lock); | |
return pid; | |
nomorepids: | |
read_unlock(&tasklist_lock); | |
spin_unlock(&lastpid_lock); | |
return 0; | |
} | |
static inline int dup_mmap(struct mm_struct * mm) | |
{ | |
struct vm_area_struct * mpnt, *tmp, **pprev; | |
int retval; | |
flush_cache_mm(current->mm); | |
mm->locked_vm = 0; | |
mm->mmap = NULL; | |
mm->mmap_cache = NULL; | |
mm->map_count = 0; | |
mm->rss = 0; | |
mm->cpu_vm_mask = 0; | |
mm->swap_address = 0; | |
pprev = &mm->mmap; | |
/* | |
* Add it to the mmlist after the parent. | |
* Doing it this way means that we can order the list, | |
* and fork() won't mess up the ordering significantly. | |
* Add it first so that swapoff can see any swap entries. | |
*/ | |
spin_lock(&mmlist_lock); | |
list_add(&mm->mmlist, ¤t->mm->mmlist); | |
mmlist_nr++; | |
spin_unlock(&mmlist_lock); | |
for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | |
struct file *file; | |
retval = -ENOMEM; | |
if(mpnt->vm_flags & VM_DONTCOPY) | |
continue; | |
tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | |
if (!tmp) | |
goto fail_nomem; | |
*tmp = *mpnt; | |
tmp->vm_flags &= ~VM_LOCKED; | |
tmp->vm_mm = mm; | |
tmp->vm_next = NULL; | |
file = tmp->vm_file; | |
if (file) { | |
struct inode *inode = file->f_dentry->d_inode; | |
get_file(file); | |
if (tmp->vm_flags & VM_DENYWRITE) | |
atomic_dec(&inode->i_writecount); | |
/* insert tmp into the share list, just after mpnt */ | |
spin_lock(&inode->i_mapping->i_shared_lock); | |
if((tmp->vm_next_share = mpnt->vm_next_share) != NULL) | |
mpnt->vm_next_share->vm_pprev_share = | |
&tmp->vm_next_share; | |
mpnt->vm_next_share = tmp; | |
tmp->vm_pprev_share = &mpnt->vm_next_share; | |
spin_unlock(&inode->i_mapping->i_shared_lock); | |
} | |
/* | |
* Link in the new vma and copy the page table entries: | |
* link in first so that swapoff can see swap entries. | |
*/ | |
spin_lock(&mm->page_table_lock); | |
*pprev = tmp; | |
pprev = &tmp->vm_next; | |
mm->map_count++; | |
retval = copy_page_range(mm, current->mm, tmp); | |
spin_unlock(&mm->page_table_lock); | |
if (tmp->vm_ops && tmp->vm_ops->open) | |
tmp->vm_ops->open(tmp); | |
if (retval) | |
goto fail_nomem; | |
} | |
retval = 0; | |
build_mmap_rb(mm); | |
fail_nomem: | |
flush_tlb_mm(current->mm); | |
return retval; | |
} | |
spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; | |
int mmlist_nr; | |
#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) | |
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | |
static struct mm_struct * mm_init(struct mm_struct * mm) | |
{ | |
atomic_set(&mm->mm_users, 1); | |
atomic_set(&mm->mm_count, 1); | |
init_rwsem(&mm->mmap_sem); | |
mm->page_table_lock = SPIN_LOCK_UNLOCKED; | |
mm->pgd = pgd_alloc(mm); | |
mm->def_flags = 0; | |
if (mm->pgd) | |
return mm; | |
free_mm(mm); | |
return NULL; | |
} | |
/* | |
* Allocate and initialize an mm_struct. | |
*/ | |
struct mm_struct * mm_alloc(void) | |
{ | |
struct mm_struct * mm; | |
mm = allocate_mm(); | |
if (mm) { | |
memset(mm, 0, sizeof(*mm)); | |
return mm_init(mm); | |
} | |
return NULL; | |
} | |
/* | |
* Called when the last reference to the mm | |
* is dropped: either by a lazy thread or by | |
* mmput. Free the page directory and the mm. | |
*/ | |
inline void __mmdrop(struct mm_struct *mm) | |
{ | |
BUG_ON(mm == &init_mm); | |
pgd_free(mm->pgd); | |
destroy_context(mm); | |
free_mm(mm); | |
} | |
/* | |
* Decrement the use count and release all resources for an mm. | |
*/ | |
void mmput(struct mm_struct *mm) | |
{ | |
if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { | |
extern struct mm_struct *swap_mm; | |
if (swap_mm == mm) | |
swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); | |
list_del(&mm->mmlist); | |
mmlist_nr--; | |
spin_unlock(&mmlist_lock); | |
exit_mmap(mm); | |
mmdrop(mm); | |
} | |
} | |
/* Please note the differences between mmput and mm_release. | |
* mmput is called whenever we stop holding onto a mm_struct, | |
* error, success, whatever.
* | |
* mm_release is called after a mm_struct has been removed | |
* from the current process. | |
* | |
* This difference is important for error handling, when we | |
* only half set up a mm_struct for a new process and need to restore | |
* the old one. Because we mmput the new mm_struct before | |
* restoring the old one. . . | |
* Eric Biederman 10 January 1998 | |
*/ | |
void mm_release(void) | |
{ | |
struct task_struct *tsk = current; | |
struct completion *vfork_done = tsk->vfork_done; | |
/* notify parent sleeping on vfork() */ | |
if (vfork_done) { | |
tsk->vfork_done = NULL; | |
complete(vfork_done); | |
} | |
} | |
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
struct mm_struct * mm, *oldmm; | |
int retval; | |
tsk->min_flt = tsk->maj_flt = 0; | |
tsk->cmin_flt = tsk->cmaj_flt = 0; | |
tsk->nswap = tsk->cnswap = 0; | |
tsk->mm = NULL; | |
tsk->active_mm = NULL; | |
/* | |
* Are we cloning a kernel thread? | |
* | |
* We need to steal an active VM for that.
*/ | |
oldmm = current->mm; | |
if (!oldmm) | |
return 0; | |
if (clone_flags & CLONE_VM) { | |
atomic_inc(&oldmm->mm_users); | |
mm = oldmm; | |
goto good_mm; | |
} | |
retval = -ENOMEM; | |
mm = allocate_mm(); | |
if (!mm) | |
goto fail_nomem; | |
/* Copy the current MM stuff.. */ | |
memcpy(mm, oldmm, sizeof(*mm)); | |
if (!mm_init(mm)) | |
goto fail_nomem; | |
if (init_new_context(tsk,mm)) | |
goto free_pt; | |
down_write(&oldmm->mmap_sem); | |
retval = dup_mmap(mm); | |
up_write(&oldmm->mmap_sem); | |
if (retval) | |
goto free_pt; | |
/* | |
* child gets a private LDT (if there was an LDT in the parent) | |
*/ | |
copy_segments(tsk, mm); | |
good_mm: | |
tsk->mm = mm; | |
tsk->active_mm = mm; | |
return 0; | |
free_pt: | |
mmput(mm); | |
fail_nomem: | |
return retval; | |
} | |
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) | |
{ | |
struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); | |
/* We don't need to lock fs - think why ;-) */ | |
if (fs) { | |
atomic_set(&fs->count, 1); | |
fs->lock = RW_LOCK_UNLOCKED; | |
fs->umask = old->umask; | |
read_lock(&old->lock); | |
fs->rootmnt = mntget(old->rootmnt); | |
fs->root = dget(old->root); | |
fs->pwdmnt = mntget(old->pwdmnt); | |
fs->pwd = dget(old->pwd); | |
if (old->altroot) { | |
fs->altrootmnt = mntget(old->altrootmnt); | |
fs->altroot = dget(old->altroot); | |
} else { | |
fs->altrootmnt = NULL; | |
fs->altroot = NULL; | |
} | |
read_unlock(&old->lock); | |
} | |
return fs; | |
} | |
struct fs_struct *copy_fs_struct(struct fs_struct *old) | |
{ | |
return __copy_fs_struct(old); | |
} | |
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
if (clone_flags & CLONE_FS) { | |
atomic_inc(¤t->fs->count); | |
return 0; | |
} | |
tsk->fs = __copy_fs_struct(current->fs); | |
if (!tsk->fs) | |
return -1; | |
return 0; | |
} | |
static int count_open_files(struct files_struct *files, int size) | |
{ | |
int i; | |
/* Find the last open fd */ | |
for (i = size/(8*sizeof(long)); i > 0; ) { | |
if (files->open_fds->fds_bits[--i]) | |
break; | |
} | |
i = (i+1) * 8 * sizeof(long); | |
return i; | |
} | |
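/*
* A small worked example, assuming 32-bit longs: with size == 256 the loop
* scans 256/32 == 8 bitmap words from the top; if the highest word with an
* open fd is fds_bits[2], the result is (2+1) * 32 == 96, i.e. the count is
* rounded up to a whole bitmap word.
*/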
static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
struct files_struct *oldf, *newf; | |
struct file **old_fds, **new_fds; | |
int open_files, nfds, size, i, error = 0; | |
/* | |
* A background process may not have any files ... | |
*/ | |
oldf = current->files; | |
if (!oldf) | |
goto out; | |
if (clone_flags & CLONE_FILES) { | |
atomic_inc(&oldf->count); | |
goto out; | |
} | |
tsk->files = NULL; | |
error = -ENOMEM; | |
newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | |
if (!newf) | |
goto out; | |
atomic_set(&newf->count, 1); | |
newf->file_lock = RW_LOCK_UNLOCKED; | |
newf->next_fd = 0; | |
newf->max_fds = NR_OPEN_DEFAULT; | |
newf->max_fdset = __FD_SETSIZE; | |
newf->close_on_exec = &newf->close_on_exec_init; | |
newf->open_fds = &newf->open_fds_init; | |
newf->fd = &newf->fd_array[0]; | |
/* We don't yet have the oldf readlock, but even if the old | |
fdset gets grown now, we'll only copy up to "size" fds */ | |
size = oldf->max_fdset; | |
if (size > __FD_SETSIZE) { | |
newf->max_fdset = 0; | |
write_lock(&newf->file_lock); | |
error = expand_fdset(newf, size-1); | |
write_unlock(&newf->file_lock); | |
if (error) | |
goto out_release; | |
} | |
read_lock(&oldf->file_lock); | |
open_files = count_open_files(oldf, size); | |
/* | |
* Check whether we need to allocate a larger fd array. | |
* Note: we're not a clone task, so the open count won't | |
* change. | |
*/ | |
nfds = NR_OPEN_DEFAULT; | |
if (open_files > nfds) { | |
read_unlock(&oldf->file_lock); | |
newf->max_fds = 0; | |
write_lock(&newf->file_lock); | |
error = expand_fd_array(newf, open_files-1); | |
write_unlock(&newf->file_lock); | |
if (error) | |
goto out_release; | |
nfds = newf->max_fds; | |
read_lock(&oldf->file_lock); | |
} | |
old_fds = oldf->fd; | |
new_fds = newf->fd; | |
memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); | |
memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); | |
for (i = open_files; i != 0; i--) { | |
struct file *f = *old_fds++; | |
if (f) | |
get_file(f); | |
*new_fds++ = f; | |
} | |
read_unlock(&oldf->file_lock); | |
/* compute the remainder to be cleared */ | |
size = (newf->max_fds - open_files) * sizeof(struct file *); | |
/* This is long word aligned thus could use an optimized version */
memset(new_fds, 0, size); | |
if (newf->max_fdset > open_files) { | |
int left = (newf->max_fdset-open_files)/8; | |
int start = open_files / (8 * sizeof(unsigned long)); | |
memset(&newf->open_fds->fds_bits[start], 0, left); | |
memset(&newf->close_on_exec->fds_bits[start], 0, left); | |
} | |
tsk->files = newf; | |
error = 0; | |
out: | |
return error; | |
out_release: | |
free_fdset (newf->close_on_exec, newf->max_fdset); | |
free_fdset (newf->open_fds, newf->max_fdset); | |
kmem_cache_free(files_cachep, newf); | |
goto out; | |
} | |
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
struct signal_struct *sig; | |
if (clone_flags & CLONE_SIGHAND) { | |
atomic_inc(¤t->sig->count); | |
return 0; | |
} | |
sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); | |
tsk->sig = sig; | |
if (!sig) | |
return -1; | |
spin_lock_init(&sig->siglock); | |
atomic_set(&sig->count, 1); | |
memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action)); | |
return 0; | |
} | |
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | |
{ | |
unsigned long new_flags = p->flags; | |
new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU); | |
new_flags |= PF_FORKNOEXEC; | |
if (!(clone_flags & CLONE_PTRACE)) | |
p->ptrace = 0; | |
p->flags = new_flags; | |
} | |
/* | |
* Ok, this is the main fork-routine. It copies the system process | |
* information (task[nr]) and sets up the necessary registers. It also | |
* copies the data segment in its entirety. The "stack_start" and | |
* "stack_top" arguments are simply passed along to the platform | |
* specific copy_thread() routine. Most platforms ignore stack_top. | |
* For an example that's using stack_top, see | |
* arch/ia64/kernel/process.c. | |
*/ | |
int do_fork(unsigned long clone_flags, unsigned long stack_start, | |
struct pt_regs *regs, unsigned long stack_size) | |
{ | |
int retval; | |
struct task_struct *p; | |
struct completion vfork; | |
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | |
return -EINVAL; | |
retval = -EPERM; | |
/* | |
* CLONE_PID is only allowed for the initial SMP swapper | |
* calls | |
*/ | |
if (clone_flags & CLONE_PID) { | |
if (current->pid) | |
goto fork_out; | |
} | |
retval = -ENOMEM; | |
p = alloc_task_struct(); | |
if (!p) | |
goto fork_out; | |
*p = *current; | |
retval = -EAGAIN; | |
/* | |
* Check if we are over our maximum process limit, but be sure to | |
* exclude root. This is needed to make it possible for login and | |
* friends to set the per-user process limit to something lower | |
* than the amount of processes root is running. -- Rik | |
*/ | |
if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur | |
&& !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE)) | |
goto bad_fork_free; | |
atomic_inc(&p->user->__count); | |
atomic_inc(&p->user->processes); | |
/* | |
* Counter increases are protected by | |
* the kernel lock so nr_threads can't | |
* increase under us (but it may decrease). | |
*/ | |
if (nr_threads >= max_threads) | |
goto bad_fork_cleanup_count; | |
get_exec_domain(p->exec_domain); | |
if (p->binfmt && p->binfmt->module) | |
__MOD_INC_USE_COUNT(p->binfmt->module); | |
p->did_exec = 0; | |
p->swappable = 0; | |
p->state = TASK_UNINTERRUPTIBLE; | |
copy_flags(clone_flags, p); | |
p->pid = get_pid(clone_flags); | |
if (p->pid == 0 && current->pid != 0) | |
goto bad_fork_cleanup; | |
p->run_list.next = NULL; | |
p->run_list.prev = NULL; | |
p->p_cptr = NULL; | |
init_waitqueue_head(&p->wait_chldexit); | |
p->vfork_done = NULL; | |
if (clone_flags & CLONE_VFORK) { | |
p->vfork_done = &vfork; | |
init_completion(&vfork); | |
} | |
spin_lock_init(&p->alloc_lock); | |
p->sigpending = 0; | |
init_sigpending(&p->pending); | |
p->it_real_value = p->it_virt_value = p->it_prof_value = 0; | |
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; | |
init_timer(&p->real_timer); | |
p->real_timer.data = (unsigned long) p; | |
p->leader = 0; /* session leadership doesn't inherit */ | |
p->tty_old_pgrp = 0; | |
p->times.tms_utime = p->times.tms_stime = 0; | |
p->times.tms_cutime = p->times.tms_cstime = 0; | |
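/* Lottery-scheduler fields used by sched.c (opsyspolisi == 2): every new
* task starts with 5 tickets, and cpustart holds a jiffies*10 timestamp
* that the scheduler later uses when adjusting the ticket count. */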
p->tn=5; | |
p->cpustart=jiffies*10; | |
#ifdef CONFIG_SMP | |
{ | |
int i; | |
p->cpus_runnable = ~0UL; | |
p->processor = current->processor; | |
/* ?? should we just memset this ?? */ | |
for(i = 0; i < smp_num_cpus; i++) | |
p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; | |
spin_lock_init(&p->sigmask_lock); | |
} | |
#endif | |
p->lock_depth = -1; /* -1 = no lock */ | |
p->start_time = jiffies; | |
INIT_LIST_HEAD(&p->local_pages); | |
retval = -ENOMEM; | |
/* copy all the process information */ | |
if (copy_files(clone_flags, p)) | |
goto bad_fork_cleanup; | |
if (copy_fs(clone_flags, p)) | |
goto bad_fork_cleanup_files; | |
if (copy_sighand(clone_flags, p)) | |
goto bad_fork_cleanup_fs; | |
if (copy_mm(clone_flags, p)) | |
goto bad_fork_cleanup_sighand; | |
if (copy_namespace(clone_flags, p)) | |
goto bad_fork_cleanup_mm; | |
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); | |
if (retval) | |
goto bad_fork_cleanup_namespace; | |
p->semundo = NULL; | |
/* Our parent execution domain becomes current domain | |
These must match for thread signalling to apply */ | |
p->parent_exec_id = p->self_exec_id; | |
/* ok, now we should be set up.. */ | |
p->swappable = 1; | |
p->exit_signal = clone_flags & CSIGNAL; | |
p->pdeath_signal = 0; | |
/* | |
* "share" dynamic priority between parent and child, thus the | |
* total amount of dynamic priorities in the system doesn't change, | |
* more scheduling fairness. This is only important in the first | |
* timeslice, on the long run the scheduling behaviour is unchanged. | |
*/ | |
p->counter = (current->counter + 1) >> 1; | |
current->counter >>= 1; | |
if (!current->counter) | |
current->need_resched = 1; | |
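/*
* A worked example of the split above: if the parent had counter == 7, the
* child gets (7 + 1) >> 1 == 4 ticks and the parent keeps 7 >> 1 == 3, so
* one timeslice's worth of ticks is divided between the two.
*/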
/* | |
* Ok, add it to the run-queues and make it | |
* visible to the rest of the system. | |
* | |
* Let it rip! | |
*/ | |
retval = p->pid; | |
p->tgid = retval; | |
INIT_LIST_HEAD(&p->thread_group); | |
/* Need tasklist lock for parent etc handling! */ | |
write_lock_irq(&tasklist_lock); | |
/* CLONE_PARENT re-uses the old parent */ | |
p->p_opptr = current->p_opptr; | |
p->p_pptr = current->p_pptr; | |
if (!(clone_flags & CLONE_PARENT)) { | |
p->p_opptr = current; | |
if (!(p->ptrace & PT_PTRACED)) | |
p->p_pptr = current; | |
} | |
if (clone_flags & CLONE_THREAD) { | |
p->tgid = current->tgid; | |
list_add(&p->thread_group, ¤t->thread_group); | |
} | |
SET_LINKS(p); | |
hash_pid(p); | |
nr_threads++; | |
write_unlock_irq(&tasklist_lock); | |
if (p->ptrace & PT_PTRACED) | |
send_sig(SIGSTOP, p, 1); | |
wake_up_process(p); /* do this last */ | |
++total_forks; | |
if (clone_flags & CLONE_VFORK) | |
wait_for_completion(&vfork); | |
fork_out: | |
return retval; | |
bad_fork_cleanup_namespace: | |
exit_namespace(p); | |
bad_fork_cleanup_mm: | |
exit_mm(p); | |
bad_fork_cleanup_sighand: | |
exit_sighand(p); | |
bad_fork_cleanup_fs: | |
exit_fs(p); /* blocking */ | |
bad_fork_cleanup_files: | |
exit_files(p); /* blocking */ | |
bad_fork_cleanup: | |
put_exec_domain(p->exec_domain); | |
if (p->binfmt && p->binfmt->module) | |
__MOD_DEC_USE_COUNT(p->binfmt->module); | |
bad_fork_cleanup_count: | |
atomic_dec(&p->user->processes); | |
free_uid(p->user); | |
bad_fork_free: | |
free_task_struct(p); | |
goto fork_out; | |
} | |
/* SLAB cache for signal_struct structures (tsk->sig) */ | |
kmem_cache_t *sigact_cachep; | |
/* SLAB cache for files_struct structures (tsk->files) */ | |
kmem_cache_t *files_cachep; | |
/* SLAB cache for fs_struct structures (tsk->fs) */ | |
kmem_cache_t *fs_cachep; | |
/* SLAB cache for vm_area_struct structures */ | |
kmem_cache_t *vm_area_cachep; | |
/* SLAB cache for mm_struct structures (tsk->mm) */ | |
kmem_cache_t *mm_cachep; | |
void __init proc_caches_init(void) | |
{ | |
sigact_cachep = kmem_cache_create("signal_act", | |
sizeof(struct signal_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if (!sigact_cachep) | |
panic("Cannot create signal action SLAB cache"); | |
files_cachep = kmem_cache_create("files_cache", | |
sizeof(struct files_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if (!files_cachep) | |
panic("Cannot create files SLAB cache"); | |
fs_cachep = kmem_cache_create("fs_cache", | |
sizeof(struct fs_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if (!fs_cachep) | |
panic("Cannot create fs_struct SLAB cache"); | |
vm_area_cachep = kmem_cache_create("vm_area_struct", | |
sizeof(struct vm_area_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if(!vm_area_cachep) | |
panic("vma_init: Cannot alloc vm_area_struct SLAB cache"); | |
mm_cachep = kmem_cache_create("mm_struct", | |
sizeof(struct mm_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if(!mm_cachep) | |
panic("vma_init: Cannot alloc mm_struct SLAB cache"); | |
} |
/* | |
* linux/kernel/sched.c | |
* | |
* Kernel scheduler and related syscalls | |
* | |
* Copyright (C) 1991, 1992 Linus Torvalds | |
* | |
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | |
* make semaphores SMP safe | |
* 1998-11-19 Implemented schedule_timeout() and related stuff | |
* by Andrea Arcangeli | |
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar | |
*/ | |
/* | |
* 'sched.c' is the main kernel file. It contains scheduling primitives | |
* (sleep_on, wakeup, schedule etc) as well as a number of simple system | |
* call functions (type getpid()), which just extract a field from | |
* current-task | |
*/ | |
#include <linux/config.h> | |
#include <linux/mm.h> | |
#include <linux/init.h> | |
#include <linux/smp_lock.h> | |
#include <linux/nmi.h> | |
#include <linux/interrupt.h> | |
#include <linux/kernel_stat.h> | |
#include <linux/completion.h> | |
#include <linux/prefetch.h> | |
#include <linux/compiler.h>
#include <linux/random.h> /* for get_random_bytes(), used by the lottery path below */
#include <asm/uaccess.h> | |
#include <asm/mmu_context.h> | |
extern void timer_bh(void); | |
extern void tqueue_bh(void); | |
extern void immediate_bh(void); | |
extern int opsyspolisi; | |
/* | |
* scheduler variables | |
*/ | |
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | |
extern void mem_use(void); | |
/* | |
* Scheduling quanta. | |
* | |
* NOTE! The unix "nice" value influences how long a process | |
* gets. The nice value ranges from -20 to +19, where a -20 | |
* is a "high-priority" task, and a "+10" is a low-priority | |
* task. | |
* | |
* We want the time-slice to be around 50ms or so, so this | |
* calculation depends on the value of HZ. | |
*/ | |
#if HZ < 200 | |
#define TICK_SCALE(x) ((x) >> 2) | |
#elif HZ < 400 | |
#define TICK_SCALE(x) ((x) >> 1) | |
#elif HZ < 800 | |
#define TICK_SCALE(x) (x) | |
#elif HZ < 1600 | |
#define TICK_SCALE(x) ((x) << 1) | |
#else | |
#define TICK_SCALE(x) ((x) << 2) | |
#endif | |
#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) | |
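/*
* A worked example, assuming the common 2.4-era HZ == 100 (so TICK_SCALE(x)
* is (x) >> 2): NICE_TO_TICKS(0) = (20 >> 2) + 1 = 6 ticks = 60ms,
* NICE_TO_TICKS(-20) = (40 >> 2) + 1 = 11 ticks, and NICE_TO_TICKS(19) =
* (1 >> 2) + 1 = 1 tick, so the default timeslice sits close to the ~50ms
* target mentioned above.
*/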
/* | |
* Init task must be ok at boot for the ix86 as we will check its signals | |
* via the SMP irq return path. | |
*/ | |
struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; | |
/* | |
* The tasklist_lock protects the linked list of processes. | |
* | |
* The runqueue_lock locks the parts that actually access | |
* and change the run-queues, and have to be interrupt-safe. | |
* | |
* If both locks are to be concurrently held, the runqueue_lock | |
* nests inside the tasklist_lock. | |
* | |
* task->alloc_lock nests inside tasklist_lock. | |
*/ | |
spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ | |
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ | |
static LIST_HEAD(runqueue_head); | |
/* | |
* We align per-CPU scheduling data on cacheline boundaries, | |
* to prevent cacheline ping-pong. | |
*/ | |
static union { | |
struct schedule_data { | |
struct task_struct * curr; | |
cycles_t last_schedule; | |
} schedule_data; | |
char __pad [SMP_CACHE_BYTES]; | |
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; | |
#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr | |
#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule | |
struct kernel_stat kstat; | |
extern struct task_struct *child_reaper; | |
#ifdef CONFIG_SMP | |
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) | |
#define can_schedule(p,cpu) \ | |
((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) | |
#else | |
#define idle_task(cpu) (&init_task) | |
#define can_schedule(p,cpu) (1) | |
#endif | |
void scheduling_functions_start_here(void) { } | |
/* | |
* This is the function that decides how desirable a process is.. | |
* You can weigh different processes against each other depending | |
* on what CPU they've run on lately etc to try to handle cache | |
* and TLB miss penalties. | |
* | |
* Return values: | |
* -1000: never select this | |
* 0: out of time, recalculate counters (but it might still be | |
* selected) | |
* +ve: "goodness" value (the larger, the better) | |
* +1000: realtime process, select this. | |
*/ | |
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) | |
{ | |
int weight; | |
/* | |
* select the current process after every other | |
* runnable process, but before the idle thread. | |
* Also, don't trigger a counter recalculation.
*/ | |
weight = -1; | |
if (p->policy & SCHED_YIELD) | |
goto out; | |
/* | |
* Non-RT process - normal case first. | |
*/ | |
if (p->policy == SCHED_OTHER) { | |
/* | |
* Give the process a first-approximation goodness value | |
* according to the number of clock-ticks it has left. | |
* | |
* Don't do any other calculations if the time slice is | |
* over.. | |
*/ | |
weight = p->counter; | |
if (!weight) | |
goto out; | |
#ifdef CONFIG_SMP | |
/* Give a largish advantage to the same processor... */ | |
/* (this is equivalent to penalizing other processors) */ | |
if (p->processor == this_cpu) | |
weight += PROC_CHANGE_PENALTY; | |
#endif | |
/* .. and a slight advantage to the current MM */ | |
if (p->mm == this_mm || !p->mm) | |
weight += 1; | |
weight += 20 - p->nice; | |
goto out; | |
} | |
/* | |
* Realtime process, select the first one on the | |
* runqueue (taking priorities within processes | |
* into account). | |
*/ | |
weight = 1000 + p->rt_priority; | |
out: | |
return weight; | |
} | |
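/*
* A worked example: a SCHED_OTHER task with counter == 6, nice == 0 and the
* same mm as the previous task scores 6 + 1 + (20 - 0) = 27 on UP (SMP adds
* PROC_CHANGE_PENALTY if it last ran on this CPU); an exhausted task with
* counter == 0 scores 0, and a realtime task with rt_priority == 50 scores
* 1000 + 50 = 1050.
*/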
/* | |
* the 'goodness value' of replacing a process on a given CPU. | |
* positive value means 'replace', zero or negative means 'dont'. | |
*/ | |
static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) | |
{ | |
return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); | |
} | |
/* | |
* This is ugly, but reschedule_idle() is very timing-critical. | |
* We are called with the runqueue spinlock held and we must | |
* not claim the tasklist_lock. | |
*/ | |
static FASTCALL(void reschedule_idle(struct task_struct * p)); | |
static void reschedule_idle(struct task_struct * p) | |
{ | |
#ifdef CONFIG_SMP | |
int this_cpu = smp_processor_id(); | |
struct task_struct *tsk, *target_tsk; | |
int cpu, best_cpu, i, max_prio; | |
cycles_t oldest_idle; | |
/* | |
* shortcut if the woken up task's last CPU is | |
* idle now. | |
*/ | |
best_cpu = p->processor; | |
if (can_schedule(p, best_cpu)) { | |
tsk = idle_task(best_cpu); | |
if (cpu_curr(best_cpu) == tsk) { | |
int need_resched; | |
send_now_idle: | |
/* | |
* If need_resched == -1 then we can skip sending | |
* the IPI altogether, tsk->need_resched is | |
* actively watched by the idle thread. | |
*/ | |
need_resched = tsk->need_resched; | |
tsk->need_resched = 1; | |
if ((best_cpu != this_cpu) && !need_resched) | |
smp_send_reschedule(best_cpu); | |
return; | |
} | |
} | |
/* | |
* We know that the preferred CPU has a cache-affine current | |
* process, lets try to find a new idle CPU for the woken-up | |
* process. Select the least recently active idle CPU. (that | |
* one will have the least active cache context.) Also find | |
* the executing process which has the least priority. | |
*/ | |
oldest_idle = (cycles_t) -1; | |
target_tsk = NULL; | |
max_prio = 0; | |
for (i = 0; i < smp_num_cpus; i++) { | |
cpu = cpu_logical_map(i); | |
if (!can_schedule(p, cpu)) | |
continue; | |
tsk = cpu_curr(cpu); | |
/* | |
* We use the first available idle CPU. This creates | |
* a priority list between idle CPUs, but this is not | |
* a problem. | |
*/ | |
if (tsk == idle_task(cpu)) { | |
#if defined(__i386__) && defined(CONFIG_SMP) | |
/* | |
* Check if two siblings are idle in the same | |
* physical package. Use them if found. | |
*/ | |
if (smp_num_siblings == 2) { | |
if (cpu_curr(cpu_sibling_map[cpu]) == | |
idle_task(cpu_sibling_map[cpu])) { | |
oldest_idle = last_schedule(cpu); | |
target_tsk = tsk; | |
break; | |
} | |
} | |
#endif | |
if (last_schedule(cpu) < oldest_idle) { | |
oldest_idle = last_schedule(cpu); | |
target_tsk = tsk; | |
} | |
} else { | |
if (oldest_idle == -1ULL) { | |
int prio = preemption_goodness(tsk, p, cpu); | |
if (prio > max_prio) { | |
max_prio = prio; | |
target_tsk = tsk; | |
} | |
} | |
} | |
} | |
tsk = target_tsk; | |
if (tsk) { | |
if (oldest_idle != -1ULL) { | |
best_cpu = tsk->processor; | |
goto send_now_idle; | |
} | |
tsk->need_resched = 1; | |
if (tsk->processor != this_cpu) | |
smp_send_reschedule(tsk->processor); | |
} | |
return; | |
#else /* UP */ | |
int this_cpu = smp_processor_id(); | |
struct task_struct *tsk; | |
tsk = cpu_curr(this_cpu); | |
if (preemption_goodness(tsk, p, this_cpu) > 0) | |
tsk->need_resched = 1; | |
#endif | |
} | |
/* | |
* Careful! | |
* | |
* This has to add the process to the _end_ of the | |
* run-queue, not the beginning. The goodness value will | |
* determine whether this process will run next. This is | |
* important to get SCHED_FIFO and SCHED_RR right, where | |
* a process that is either pre-empted or its time slice | |
* has expired, should be moved to the tail of the run | |
* queue for its priority - Bhavesh Davda | |
*/ | |
static inline void add_to_runqueue(struct task_struct * p) | |
{ | |
list_add_tail(&p->run_list, &runqueue_head); | |
nr_running++; | |
} | |
static inline void move_last_runqueue(struct task_struct * p) | |
{ | |
list_del(&p->run_list); | |
list_add_tail(&p->run_list, &runqueue_head); | |
} | |
/* | |
* Wake up a process. Put it on the run-queue if it's not | |
* already there. The "current" process is always on the | |
* run-queue (except when the actual re-schedule is in | |
* progress), and as such you're allowed to do the simpler | |
* "current->state = TASK_RUNNING" to mark yourself runnable | |
* without the overhead of this. | |
*/ | |
static inline int try_to_wake_up(struct task_struct * p, int synchronous) | |
{ | |
unsigned long flags; | |
int success = 0; | |
/* | |
* We want the common case fall through straight, thus the goto. | |
*/ | |
spin_lock_irqsave(&runqueue_lock, flags); | |
p->state = TASK_RUNNING; | |
if (task_on_runqueue(p)) | |
goto out; | |
add_to_runqueue(p); | |
if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) | |
reschedule_idle(p); | |
success = 1; | |
out: | |
spin_unlock_irqrestore(&runqueue_lock, flags); | |
return success; | |
} | |
inline int wake_up_process(struct task_struct * p) | |
{ | |
return try_to_wake_up(p, 0); | |
} | |
static void process_timeout(unsigned long __data) | |
{ | |
struct task_struct * p = (struct task_struct *) __data; | |
wake_up_process(p); | |
} | |
/** | |
* schedule_timeout - sleep until timeout | |
* @timeout: timeout value in jiffies | |
* | |
* Make the current task sleep until @timeout jiffies have | |
* elapsed. The routine will return immediately unless | |
* the current task state has been set (see set_current_state()). | |
* | |
* You can set the task state as follows - | |
* | |
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | |
* pass before the routine returns. The routine will return 0 | |
* | |
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is | |
* delivered to the current task. In this case the remaining time | |
* in jiffies will be returned, or 0 if the timer expired in time | |
* | |
* The current task state is guaranteed to be TASK_RUNNING when this | |
* routine returns. | |
* | |
* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | |
* the CPU away without a bound on the timeout. In this case the return | |
* value will be %MAX_SCHEDULE_TIMEOUT. | |
* | |
* In all cases the return value is guaranteed to be non-negative. | |
*/ | |
signed long schedule_timeout(signed long timeout) | |
{ | |
struct timer_list timer; | |
unsigned long expire; | |
switch (timeout) | |
{ | |
case MAX_SCHEDULE_TIMEOUT: | |
/* | |
* These two special cases are useful to be comfortable | |
* in the caller. Nothing more. We could take | |
* MAX_SCHEDULE_TIMEOUT from one of the negative values,
* but I'd like to return a valid offset (>=0) to allow
* the caller to do everything it wants with the retval.
*/ | |
schedule(); | |
goto out; | |
default: | |
/* | |
* Another bit of PARANOID. Note that the retval will be | |
* 0 since no piece of kernel is supposed to do a check | |
* for a negative retval of schedule_timeout() (since it | |
* should never happen anyway). You just have the printk()
* that will tell you if something has gone wrong and where.
*/ | |
if (timeout < 0) | |
{ | |
printk(KERN_ERR "schedule_timeout: wrong timeout " | |
"value %lx from %p\n", timeout, | |
__builtin_return_address(0)); | |
current->state = TASK_RUNNING; | |
goto out; | |
} | |
} | |
expire = timeout + jiffies; | |
init_timer(&timer); | |
timer.expires = expire; | |
timer.data = (unsigned long) current; | |
timer.function = process_timeout; | |
add_timer(&timer); | |
schedule(); | |
del_timer_sync(&timer); | |
timeout = expire - jiffies; | |
out: | |
return timeout < 0 ? 0 : timeout; | |
} | |
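/*
* Typical usage, per the kernel-doc above: sleep for up to one second but
* wake early if a signal arrives:
*
*	set_current_state(TASK_INTERRUPTIBLE);
*	remaining = schedule_timeout(HZ);
*
* where "remaining" is the unslept jiffies, or 0 if the full second passed.
*/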
/* | |
* schedule_tail() is getting called from the fork return path. This | |
* cleans up all remaining scheduler things, without impacting the | |
* common case. | |
*/ | |
static inline void __schedule_tail(struct task_struct *prev) | |
{ | |
#ifdef CONFIG_SMP | |
int policy; | |
/* | |
* prev->policy can be written from here only before `prev' | |
* can be scheduled (before setting prev->cpus_runnable to ~0UL). | |
* Of course it must also be read before allowing prev | |
* to be rescheduled, but since the write depends on the read | |
* to complete, wmb() is enough. (the spin_lock() acquired | |
* before setting cpus_runnable is not enough because the spin_lock() | |
* common code semantics allows code outside the critical section | |
* to enter inside the critical section) | |
*/ | |
policy = prev->policy; | |
prev->policy = policy & ~SCHED_YIELD; | |
wmb(); | |
/* | |
* fast path falls through. We have to clear cpus_runnable before | |
* checking prev->state to avoid a wakeup race. Protect against | |
* the task exiting early. | |
*/ | |
task_lock(prev); | |
task_release_cpu(prev); | |
mb(); | |
if (prev->state == TASK_RUNNING) | |
goto needs_resched; | |
out_unlock: | |
task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ | |
return; | |
/* | |
* Slow path - we 'push' the previous process and | |
* reschedule_idle() will attempt to find a new | |
* processor for it. (but it might preempt the | |
* current process as well.) We must take the runqueue | |
* lock and re-check prev->state to be correct. It might | |
* still happen that this process has a preemption | |
* 'in progress' already - but this is not a problem and | |
* might happen in other circumstances as well. | |
*/ | |
needs_resched: | |
{ | |
unsigned long flags; | |
/* | |
* Avoid taking the runqueue lock in cases where | |
* no preemption-check is necessary:
*/ | |
if ((prev == idle_task(smp_processor_id())) || | |
(policy & SCHED_YIELD)) | |
goto out_unlock; | |
spin_lock_irqsave(&runqueue_lock, flags); | |
if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) | |
reschedule_idle(prev); | |
spin_unlock_irqrestore(&runqueue_lock, flags); | |
goto out_unlock; | |
} | |
#else | |
prev->policy &= ~SCHED_YIELD; | |
#endif /* CONFIG_SMP */ | |
} | |
asmlinkage void schedule_tail(struct task_struct *prev) | |
{ | |
__schedule_tail(prev); | |
} | |
/* | |
* 'schedule()' is the scheduler function. It's a very simple and nice | |
* scheduler: it's not perfect, but certainly works for most things. | |
* | |
* The goto is "interesting". | |
* | |
* NOTE!! Task 0 is the 'idle' task, which gets called when no other | |
* tasks can run. It can not be killed, and it cannot sleep. The 'state' | |
* information in task[0] is never used. | |
*/ | |
asmlinkage void schedule(void) | |
{ | |
struct schedule_data * sched_data; | |
struct task_struct *prev, *next, *p; | |
struct list_head *tmp; | |
int this_cpu, c; | |
spin_lock_prefetch(&runqueue_lock); | |
BUG_ON(!current->active_mm); | |
need_resched_back: | |
prev = current; | |
this_cpu = prev->processor; | |
if (unlikely(in_interrupt())) { | |
printk("Scheduling in interrupt\n"); | |
BUG(); | |
} | |
release_kernel_lock(prev, this_cpu); | |
/* | |
* 'sched_data' is protected by the fact that we can run | |
* only one process per CPU. | |
*/ | |
sched_data = & aligned_data[this_cpu].schedule_data; | |
spin_lock_irq(&runqueue_lock); | |
/* move an exhausted RR process to be last.. */ | |
if (unlikely(prev->policy == SCHED_RR)) | |
if (!prev->counter) { | |
prev->counter = NICE_TO_TICKS(prev->nice); | |
move_last_runqueue(prev); | |
} | |
switch (prev->state) { | |
case TASK_INTERRUPTIBLE: | |
if (signal_pending(prev)) { | |
prev->state = TASK_RUNNING; | |
break; | |
} | |
default: | |
del_from_runqueue(prev); | |
case TASK_RUNNING:; | |
} | |
prev->need_resched = 0; | |
/* | |
* this is the scheduler proper: | |
*/ | |
repeat_schedule: | |
if(opsyspolisi==1) | |
{ | |
/* | |
* Default process to select.. | |
*/ | |
next = idle_task(this_cpu); | |
c = -1000; | |
list_for_each(tmp, &runqueue_head) | |
{ | |
p = list_entry(tmp, struct task_struct, run_list); | |
if (can_schedule(p, this_cpu)) | |
{ | |
int weight = goodness(p, this_cpu, prev->active_mm); | |
if (weight > c) | |
c = weight, next = p; | |
} | |
} | |
/* Do we need to re-calculate counters? */ | |
if (unlikely(!c)) { | |
struct task_struct *p; | |
spin_unlock_irq(&runqueue_lock); | |
read_lock(&tasklist_lock); | |
for_each_task(p) | |
p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); | |
read_unlock(&tasklist_lock); | |
spin_lock_irq(&runqueue_lock); | |
goto repeat_schedule; | |
} | |
} | |
else if(opsyspolisi==2)
{
/*
* Lottery scheduling: every schedulable task on the runqueue holds
* p->tn tickets. Count the tickets, draw a random winning ticket,
* then walk the runqueue until the cumulative ticket count passes
* the draw; that task runs next. If no task holds any tickets, the
* idle task remains selected.
*/
unsigned int searchbit=0;
unsigned int rndmbit=0;
unsigned int randomtn=0;
unsigned int ttn=0;
next = idle_task(this_cpu);
list_for_each(tmp, &runqueue_head)
{
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu))
{
ttn+=p->tn;
}
}
if(ttn)
{
get_random_bytes(&rndmbit,sizeof(unsigned int));
randomtn=rndmbit % ttn;
list_for_each(tmp, &runqueue_head)
{
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu))
{
searchbit+=p->tn;
if(searchbit>randomtn)
{
next=p;
break;
}
}
}
}
/*
* Recalculate tickets: if fewer than 20 units of (jiffies*10) have
* passed since prev->cpustart, drop one ticket (minimum 1); if more
* than 200 have passed, add one (maximum 9).
*/
if((jiffies*10)-prev->cpustart<20)
{
if(prev->tn>1)
{
prev->tn=prev->tn-1;
prev->cpustart=0;
}
}
else if((jiffies*10)-prev->cpustart>200)
{
if(prev->tn<9)
{
prev->tn=prev->tn+1;
prev->cpustart=0;
}
}
}
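/*
* A worked example of the draw above: with three runnable tasks holding
* 5, 3 and 2 tickets, ttn == 10 and randomtn lies in [0, 10). Walking the
* runqueue, the first task wins for draws 0-4, the second for 5-7 and the
* third for 8-9, so each task is selected with probability proportional
* to its ticket count. The adjustment step then moves prev->tn up or down
* within 1..9 depending on the elapsed (jiffies*10 - cpustart) interval.
*/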
/* | |
* from this point on nothing can prevent us from | |
* switching to the next task, save this fact in | |
* sched_data. | |
*/ | |
sched_data->curr = next; | |
task_set_cpu(next, this_cpu); | |
spin_unlock_irq(&runqueue_lock); | |
if (unlikely(prev == next)) { | |
/* We won't go through the normal tail, so do this by hand */ | |
prev->policy &= ~SCHED_YIELD; | |
goto same_process; | |
} | |
#ifdef CONFIG_SMP | |
/* | |
* maintain the per-process 'last schedule' value. | |
* (this has to be recalculated even if we reschedule to | |
* the same process) Currently this is only used on SMP, | |
* and it's approximate, so we do not have to maintain | |
* it while holding the runqueue spinlock. | |
*/ | |
sched_data->last_schedule = get_cycles(); | |
/* | |
* We drop the scheduler lock early (it's a global spinlock), | |
* thus we have to lock the previous process from getting | |
* rescheduled during switch_to(). | |
*/ | |
#endif /* CONFIG_SMP */ | |
kstat.context_swtch++; | |
/* | |
* there are 3 processes which are affected by a context switch: | |
* | |
* prev == .... ==> (last => next) | |
* | |
* It's the 'much more previous' 'prev' that is on next's stack, | |
* but prev is set to (the just run) 'last' process by switch_to(). | |
* This might sound slightly confusing but makes tons of sense. | |
*/ | |
prepare_to_switch(); | |
{ | |
struct mm_struct *mm = next->mm; | |
struct mm_struct *oldmm = prev->active_mm; | |
if (!mm) { | |
BUG_ON(next->active_mm); | |
next->active_mm = oldmm; | |
atomic_inc(&oldmm->mm_count); | |
enter_lazy_tlb(oldmm, next, this_cpu); | |
} else { | |
BUG_ON(next->active_mm != mm); | |
switch_mm(oldmm, mm, next, this_cpu); | |
} | |
if (!prev->mm) { | |
prev->active_mm = NULL; | |
mmdrop(oldmm); | |
} | |
} | |
/* | |
* This just switches the register state and the | |
* stack. | |
*/ | |
switch_to(prev, next, prev); | |
__schedule_tail(prev); | |
same_process: | |
reacquire_kernel_lock(current); | |
if (current->need_resched) | |
goto need_resched_back; | |
return; | |
} | |
/* | |
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything | |
* up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the | |
* non-exclusive tasks and one exclusive task. | |
* | |
* There are circumstances in which we can try to wake a task which has already | |
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero | |
* in this (rare) case, and we handle it by continuing to scan the queue.
*/ | |
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, | |
int nr_exclusive, const int sync) | |
{ | |
struct list_head *tmp; | |
struct task_struct *p; | |
CHECK_MAGIC_WQHEAD(q); | |
WQ_CHECK_LIST_HEAD(&q->task_list); | |
list_for_each(tmp,&q->task_list) { | |
unsigned int state; | |
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | |
CHECK_MAGIC(curr->__magic); | |
p = curr->task; | |
state = p->state; | |
if (state & mode) { | |
WQ_NOTE_WAKER(curr); | |
if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | |
break; | |
} | |
} | |
} | |
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr) | |
{ | |
if (q) { | |
unsigned long flags; | |
wq_read_lock_irqsave(&q->lock, flags); | |
__wake_up_common(q, mode, nr, 0); | |
wq_read_unlock_irqrestore(&q->lock, flags); | |
} | |
} | |
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr) | |
{ | |
if (q) { | |
unsigned long flags; | |
wq_read_lock_irqsave(&q->lock, flags); | |
__wake_up_common(q, mode, nr, 1); | |
wq_read_unlock_irqrestore(&q->lock, flags); | |
} | |
} | |
void complete(struct completion *x) | |
{ | |
unsigned long flags; | |
spin_lock_irqsave(&x->wait.lock, flags); | |
x->done++; | |
__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0); | |
spin_unlock_irqrestore(&x->wait.lock, flags); | |
} | |
void wait_for_completion(struct completion *x) | |
{ | |
spin_lock_irq(&x->wait.lock); | |
if (!x->done) { | |
DECLARE_WAITQUEUE(wait, current); | |
wait.flags |= WQ_FLAG_EXCLUSIVE; | |
__add_wait_queue_tail(&x->wait, &wait); | |
do { | |
__set_current_state(TASK_UNINTERRUPTIBLE); | |
spin_unlock_irq(&x->wait.lock); | |
schedule(); | |
spin_lock_irq(&x->wait.lock); | |
} while (!x->done); | |
__remove_wait_queue(&x->wait, &wait); | |
} | |
x->done--; | |
spin_unlock_irq(&x->wait.lock); | |
} | |
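/*
* Usage pattern from this tree: for CLONE_VFORK, do_fork() above sets
* p->vfork_done = &vfork, does init_completion(&vfork) and then blocks in
* wait_for_completion(&vfork); the child signals it via complete(vfork_done)
* in mm_release() once it is done with the shared mm.
*/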
#define SLEEP_ON_VAR \ | |
unsigned long flags; \ | |
wait_queue_t wait; \ | |
init_waitqueue_entry(&wait, current); | |
#define SLEEP_ON_HEAD \ | |
wq_write_lock_irqsave(&q->lock,flags); \ | |
__add_wait_queue(q, &wait); \ | |
wq_write_unlock(&q->lock); | |
#define SLEEP_ON_TAIL \ | |
wq_write_lock_irq(&q->lock); \ | |
__remove_wait_queue(q, &wait); \ | |
wq_write_unlock_irqrestore(&q->lock,flags); | |
void interruptible_sleep_on(wait_queue_head_t *q) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_INTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
schedule(); | |
SLEEP_ON_TAIL | |
} | |
long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_INTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
timeout = schedule_timeout(timeout); | |
SLEEP_ON_TAIL | |
return timeout; | |
} | |
void sleep_on(wait_queue_head_t *q) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_UNINTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
schedule(); | |
SLEEP_ON_TAIL | |
} | |
long sleep_on_timeout(wait_queue_head_t *q, long timeout) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_UNINTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
timeout = schedule_timeout(timeout); | |
SLEEP_ON_TAIL | |
return timeout; | |
} | |
void scheduling_functions_end_here(void) { } | |
#ifndef __alpha__ | |
/* | |
* This has been replaced by sys_setpriority. Maybe it should be | |
* moved into the arch dependent tree for those ports that require | |
* it for backward compatibility? | |
*/ | |
asmlinkage long sys_nice(int increment) | |
{ | |
long newprio; | |
/* | |
* Setpriority might change our priority at the same moment. | |
* We don't have to worry. Conceptually one call occurs first | |
* and we have a single winner. | |
*/ | |
if (increment < 0) { | |
if (!capable(CAP_SYS_NICE)) | |
return -EPERM; | |
if (increment < -40) | |
increment = -40; | |
} | |
if (increment > 40) | |
increment = 40; | |
newprio = current->nice + increment; | |
if (newprio < -20) | |
newprio = -20; | |
if (newprio > 19) | |
newprio = 19; | |
current->nice = newprio; | |
return 0; | |
} | |
#endif | |
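/*
* Example of the clamping in sys_nice() above: a task at nice 10 calling
* sys_nice(-50) needs CAP_SYS_NICE; the increment is first limited to -40,
* newprio becomes 10 - 40 = -30, and that is then clamped to the valid
* minimum of -20.
*/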
static inline struct task_struct *find_process_by_pid(pid_t pid) | |
{ | |
struct task_struct *tsk = current; | |
if (pid) | |
tsk = find_task_by_pid(pid); | |
return tsk; | |
} | |
static int setscheduler(pid_t pid, int policy, | |
struct sched_param *param) | |
{ | |
struct sched_param lp; | |
struct task_struct *p; | |
int retval; | |
retval = -EINVAL; | |
if (!param || pid < 0) | |
goto out_nounlock; | |
retval = -EFAULT; | |
if (copy_from_user(&lp, param, sizeof(struct sched_param))) | |
goto out_nounlock; | |
/* | |
* We play safe to avoid deadlocks. | |
*/ | |
read_lock_irq(&tasklist_lock); | |
spin_lock(&runqueue_lock); | |
p = find_process_by_pid(pid); | |
retval = -ESRCH; | |
if (!p) | |
goto out_unlock; | |
if (policy < 0) | |
policy = p->policy; | |
else { | |
retval = -EINVAL; | |
if (policy != SCHED_FIFO && policy != SCHED_RR && | |
policy != SCHED_OTHER) | |
goto out_unlock; | |
} | |
/* | |
* Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid | |
* priority for SCHED_OTHER is 0. | |
*/ | |
retval = -EINVAL; | |
if (lp.sched_priority < 0 || lp.sched_priority > 99) | |
goto out_unlock; | |
if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) | |
goto out_unlock; | |
retval = -EPERM; | |
if ((policy == SCHED_FIFO || policy == SCHED_RR) && | |
!capable(CAP_SYS_NICE)) | |
goto out_unlock; | |
if ((current->euid != p->euid) && (current->euid != p->uid) && | |
!capable(CAP_SYS_NICE)) | |
goto out_unlock; | |
retval = 0; | |
p->policy = policy; | |
p->rt_priority = lp.sched_priority; | |
current->need_resched = 1; | |
out_unlock: | |
spin_unlock(&runqueue_lock); | |
read_unlock_irq(&tasklist_lock); | |
out_nounlock: | |
return retval; | |
} | |
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |
struct sched_param *param) | |
{ | |
return setscheduler(pid, policy, param); | |
} | |
asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param) | |
{ | |
return setscheduler(pid, -1, param); | |
} | |
asmlinkage long sys_sched_getscheduler(pid_t pid) | |
{ | |
struct task_struct *p; | |
int retval; | |
retval = -EINVAL; | |
if (pid < 0) | |
goto out_nounlock; | |
retval = -ESRCH; | |
read_lock(&tasklist_lock); | |
p = find_process_by_pid(pid); | |
if (p) | |
retval = p->policy & ~SCHED_YIELD; | |
read_unlock(&tasklist_lock); | |
out_nounlock: | |
return retval; | |
} | |
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) | |
{ | |
struct task_struct *p; | |
struct sched_param lp; | |
int retval; | |
retval = -EINVAL; | |
if (!param || pid < 0) | |
goto out_nounlock; | |
read_lock(&tasklist_lock); | |
p = find_process_by_pid(pid); | |
retval = -ESRCH; | |
if (!p) | |
goto out_unlock; | |
lp.sched_priority = p->rt_priority; | |
read_unlock(&tasklist_lock); | |
/* | |
* This one might sleep, we cannot do it with a spinlock held ... | |
*/ | |
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | |
out_nounlock: | |
return retval; | |
out_unlock: | |
read_unlock(&tasklist_lock); | |
return retval; | |
} | |
asmlinkage long sys_sched_yield(void) | |
{ | |
/* | |
* Trick. sched_yield() first counts the number of truly | |
* 'pending' runnable processes, then returns if it's | |
* only the current process. (This test does not have
* to be atomic.) In threaded applications this optimization | |
* gets triggered quite often. | |
*/ | |
int nr_pending = nr_running; | |
#if CONFIG_SMP | |
int i; | |
// Subtract non-idle processes running on other CPUs. | |
for (i = 0; i < smp_num_cpus; i++) { | |
int cpu = cpu_logical_map(i); | |
if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) | |
nr_pending--; | |
} | |
#else | |
// on UP this process is on the runqueue as well | |
nr_pending--; | |
#endif | |
if (nr_pending) { | |
/* | |
* This process can only be rescheduled by us, | |
* so this is safe without any locking. | |
*/ | |
if (current->policy == SCHED_OTHER) | |
current->policy |= SCHED_YIELD; | |
current->need_resched = 1; | |
spin_lock_irq(&runqueue_lock); | |
move_last_runqueue(current); | |
spin_unlock_irq(&runqueue_lock); | |
} | |
return 0; | |
} | |
/** | |
* yield - yield the current processor to other threads. | |
* | |
* this is a shortcut for kernel-space yielding - it marks the | |
* thread runnable and calls sys_sched_yield(). | |
*/ | |
void yield(void) | |
{ | |
set_current_state(TASK_RUNNING); | |
sys_sched_yield(); | |
schedule(); | |
} | |
void __cond_resched(void) | |
{ | |
set_current_state(TASK_RUNNING); | |
schedule(); | |
} | |
asmlinkage long sys_sched_get_priority_max(int policy) | |
{ | |
int ret = -EINVAL; | |
switch (policy) { | |
case SCHED_FIFO: | |
case SCHED_RR: | |
ret = 99; | |
break; | |
case SCHED_OTHER: | |
ret = 0; | |
break; | |
} | |
return ret; | |
} | |
asmlinkage long sys_sched_get_priority_min(int policy) | |
{ | |
int ret = -EINVAL; | |
switch (policy) { | |
case SCHED_FIFO: | |
case SCHED_RR: | |
ret = 1; | |
break; | |
case SCHED_OTHER: | |
ret = 0; | |
} | |
return ret; | |
} | |
asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) | |
{ | |
struct timespec t; | |
struct task_struct *p; | |
int retval = -EINVAL; | |
if (pid < 0) | |
goto out_nounlock; | |
retval = -ESRCH; | |
read_lock(&tasklist_lock); | |
p = find_process_by_pid(pid); | |
if (p) | |
jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), | |
&t); | |
read_unlock(&tasklist_lock); | |
if (p) | |
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | |
out_nounlock: | |
return retval; | |
} | |
static void show_task(struct task_struct * p) | |
{ | |
unsigned long free = 0; | |
int state; | |
static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; | |
printk("%-13.13s ", p->comm); | |
state = p->state ? ffz(~p->state) + 1 : 0; | |
if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) | |
printk(stat_nam[state]); | |
else | |
printk(" "); | |
#if (BITS_PER_LONG == 32) | |
if (p == current) | |
printk(" current "); | |
else | |
printk(" %08lX ", thread_saved_pc(&p->thread)); | |
#else | |
if (p == current) | |
printk(" current task "); | |
else | |
printk(" %016lx ", thread_saved_pc(&p->thread)); | |
#endif | |
{ | |
unsigned long * n = (unsigned long *) (p+1); | |
while (!*n) | |
n++; | |
free = (unsigned long) n - (unsigned long)(p+1); | |
} | |
printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid); | |
if (p->p_cptr) | |
printk("%5d ", p->p_cptr->pid); | |
else | |
printk(" "); | |
if (p->p_ysptr) | |
printk("%7d", p->p_ysptr->pid); | |
else | |
printk(" "); | |
if (p->p_osptr) | |
printk(" %5d", p->p_osptr->pid); | |
else | |
printk(" "); | |
if (!p->mm) | |
printk(" (L-TLB)\n"); | |
else | |
printk(" (NOTLB)\n"); | |
{ | |
extern void show_trace_task(struct task_struct *tsk); | |
show_trace_task(p); | |
} | |
} | |
char * render_sigset_t(sigset_t *set, char *buffer) | |
{ | |
int i = _NSIG, x; | |
do { | |
i -= 4, x = 0; | |
if (sigismember(set, i+1)) x |= 1; | |
if (sigismember(set, i+2)) x |= 2; | |
if (sigismember(set, i+3)) x |= 4; | |
if (sigismember(set, i+4)) x |= 8; | |
*buffer++ = (x < 10 ? '0' : 'a' - 10) + x; | |
} while (i >= 4); | |
*buffer = 0; | |
return buffer; | |
} | |
void show_state(void) | |
{ | |
struct task_struct *p; | |
#if (BITS_PER_LONG == 32) | |
printk("\n" | |
" free sibling\n"); | |
printk(" task PC stack pid father child younger older\n"); | |
#else | |
printk("\n" | |
" free sibling\n"); | |
printk(" task PC stack pid father child younger older\n"); | |
#endif | |
read_lock(&tasklist_lock); | |
for_each_task(p) { | |
/* | |
* reset the NMI-timeout, listing all files on a slow | |
* console might take a lot of time:
*/ | |
touch_nmi_watchdog(); | |
show_task(p); | |
} | |
read_unlock(&tasklist_lock); | |
} | |
/** | |
* reparent_to_init() - Reparent the calling kernel thread to the init task. | |
* | |
* If a kernel thread is launched as a result of a system call, or if | |
* it ever exits, it should generally reparent itself to init so that | |
* it is correctly cleaned up on exit. | |
* | |
* The various task state such as scheduling policy and priority may have | |
* been inherited from a user process, so we reset them to sane values here.
* | |
* NOTE that reparent_to_init() gives the caller full capabilities. | |
*/ | |
void reparent_to_init(void) | |
{ | |
struct task_struct *this_task = current; | |
write_lock_irq(&tasklist_lock); | |
/* Reparent to init */ | |
REMOVE_LINKS(this_task); | |
this_task->p_pptr = child_reaper; | |
this_task->p_opptr = child_reaper; | |
SET_LINKS(this_task); | |
/* Set the exit signal to SIGCHLD so we signal init on exit */ | |
this_task->exit_signal = SIGCHLD; | |
/* We also take the runqueue_lock while altering task fields | |
* which affect scheduling decisions */ | |
spin_lock(&runqueue_lock); | |
this_task->ptrace = 0; | |
this_task->nice = DEF_NICE; | |
this_task->policy = SCHED_OTHER; | |
/* cpus_allowed? */ | |
/* rt_priority? */ | |
/* signals? */ | |
this_task->cap_effective = CAP_INIT_EFF_SET; | |
this_task->cap_inheritable = CAP_INIT_INH_SET; | |
this_task->cap_permitted = CAP_FULL_SET; | |
this_task->keep_capabilities = 0; | |
memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); | |
this_task->user = INIT_USER; | |
spin_unlock(&runqueue_lock); | |
write_unlock_irq(&tasklist_lock); | |
} | |
/* | |
* Put all the gunge required to become a kernel thread without | |
* attached user resources in one place where it belongs. | |
*/ | |
void daemonize(void) | |
{ | |
struct fs_struct *fs; | |
/* | |
* If we were started as result of loading a module, close all of the | |
* user space pages. We don't need them, and if we didn't close them | |
* they would be locked into memory. | |
*/ | |
exit_mm(current); | |
current->session = 1; | |
current->pgrp = 1; | |
current->tty = NULL; | |
/* Become as one with the init task */ | |
exit_fs(current); /* current->fs->count--; */ | |
fs = init_task.fs; | |
current->fs = fs; | |
atomic_inc(&fs->count); | |
exit_files(current); | |
current->files = init_task.files; | |
atomic_inc(¤t->files->count); | |
} | |
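/* | |
 * Illustrative sketch (added; not part of the original file). A 2.4-style | |
 * kernel thread usually calls daemonize() (and, per the comment above, | |
 * reparent_to_init()) as its first step so it drops its user-space mm and | |
 * files and is reaped by init. The names my_kthread, my_stop_flag and | |
 * my_waitq are hypothetical: | |
 * | |
 *     static int my_kthread(void *unused) | |
 *     { | |
 *         daemonize(); | |
 *         reparent_to_init(); | |
 *         strcpy(current->comm, "mythread"); | |
 *         while (!my_stop_flag) | |
 *             interruptible_sleep_on(&my_waitq); | |
 *         return 0; | |
 *     } | |
 */ | |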
extern unsigned long wait_init_idle; | |
void __init init_idle(void) | |
{ | |
struct schedule_data * sched_data; | |
sched_data = &aligned_data[smp_processor_id()].schedule_data; | |
if (current != &init_task && task_on_runqueue(current)) { | |
printk("UGH! (%d:%d) was on the runqueue, removing.\n", | |
smp_processor_id(), current->pid); | |
del_from_runqueue(current); | |
} | |
sched_data->curr = current; | |
sched_data->last_schedule = get_cycles(); | |
clear_bit(current->processor, &wait_init_idle); | |
} | |
extern void init_timervecs (void); | |
void __init sched_init(void) | |
{ | |
/* | |
* We have to do a little magic to get the first | |
* process right in SMP mode. | |
*/ | |
int cpu = smp_processor_id(); | |
int nr; | |
init_task.processor = cpu; | |
for(nr = 0; nr < PIDHASH_SZ; nr++) | |
pidhash[nr] = NULL; | |
init_timervecs(); | |
init_bh(TIMER_BH, timer_bh); | |
init_bh(TQUEUE_BH, tqueue_bh); | |
init_bh(IMMEDIATE_BH, immediate_bh); | |
/* | |
* The boot idle thread does lazy MMU switching as well: | |
*/ | |
atomic_inc(&init_mm.mm_count); | |
enter_lazy_tlb(&init_mm, current, cpu); | |
} |
/* ==== second file in this gist: include/linux/sched.h ==== */ | |
#ifndef _LINUX_SCHED_H | |
#define _LINUX_SCHED_H | |
#include <asm/param.h> /* for HZ */ | |
extern unsigned long event; | |
#include <linux/config.h> | |
#include <linux/binfmts.h> | |
#include <linux/threads.h> | |
#include <linux/kernel.h> | |
#include <linux/types.h> | |
#include <linux/times.h> | |
#include <linux/timex.h> | |
#include <linux/rbtree.h> | |
#include <asm/system.h> | |
#include <asm/semaphore.h> | |
#include <asm/page.h> | |
#include <asm/ptrace.h> | |
#include <asm/mmu.h> | |
#include <linux/smp.h> | |
#include <linux/tty.h> | |
#include <linux/sem.h> | |
#include <linux/signal.h> | |
#include <linux/securebits.h> | |
#include <linux/fs_struct.h> | |
struct exec_domain; | |
/* | |
* cloning flags: | |
*/ | |
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ | |
#define CLONE_VM 0x00000100 /* set if VM shared between processes */ | |
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ | |
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ | |
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ | |
#define CLONE_PID 0x00001000 /* set if pid shared */ | |
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ | |
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | |
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | |
#define CLONE_THREAD 0x00010000 /* Same thread group? */ | |
#define CLONE_NEWNS 0x00020000 /* New namespace group? */ | |
#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) | |
/* | |
* These are the constants used to fake the fixed-point load-average | |
* counting. Some notes: | |
* - 11 bit fractions expand to 22 bits by the multiplies: this gives | |
* a load-average precision of 10 bits integer + 11 bits fractional | |
* - if you want to count load-averages more often, you need more | |
* precision, or rounding will get you. With 2-second counting freq, | |
* the EXP_n values would be 1981, 2034 and 2043 if still using only | |
* 11 bit fractions. | |
*/ | |
extern unsigned long avenrun[]; /* Load averages */ | |
#define FSHIFT 11 /* nr of bits of precision */ | |
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ | |
#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ | |
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ | |
#define EXP_5 2014 /* 1/exp(5sec/5min) */ | |
#define EXP_15 2037 /* 1/exp(5sec/15min) */ | |
#define CALC_LOAD(load,exp,n) \ | |
load *= exp; \ | |
load += n*(FIXED_1-exp); \ | |
load >>= FSHIFT; | |
#define CT_TO_SECS(x) ((x) / HZ) | |
#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) | |
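/* | |
 * Worked example (added for clarity, not in the original header): with | |
 * FIXED_1 = 2048 and EXP_1 = 1884, a load average that is currently 0 and | |
 * sees n = 1*FIXED_1 (one runnable task, in fixed-point) updates as | |
 * | |
 *     load = (0*1884 + 2048*(2048 - 1884)) >> 11 = 164   (about 0.08) | |
 * | |
 * i.e. every LOAD_FREQ tick moves avenrun[] a fraction | |
 * (FIXED_1 - EXP_1)/FIXED_1, roughly 8% for the 1-minute average, of the | |
 * way toward the instantaneous number of runnable tasks. | |
 */ | |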
extern int nr_running, nr_threads; | |
extern int last_pid; | |
#include <linux/fs.h> | |
#include <linux/time.h> | |
#include <linux/param.h> | |
#include <linux/resource.h> | |
#ifdef __KERNEL__ | |
#include <linux/timer.h> | |
#endif | |
#include <asm/processor.h> | |
#define TASK_RUNNING 0 | |
#define TASK_INTERRUPTIBLE 1 | |
#define TASK_UNINTERRUPTIBLE 2 | |
#define TASK_ZOMBIE 4 | |
#define TASK_STOPPED 8 | |
#define __set_task_state(tsk, state_value) \ | |
do { (tsk)->state = (state_value); } while (0) | |
#ifdef CONFIG_SMP | |
#define set_task_state(tsk, state_value) \ | |
set_mb((tsk)->state, (state_value)) | |
#else | |
#define set_task_state(tsk, state_value) \ | |
__set_task_state((tsk), (state_value)) | |
#endif | |
#define __set_current_state(state_value) \ | |
do { current->state = (state_value); } while (0) | |
#ifdef CONFIG_SMP | |
#define set_current_state(state_value) \ | |
set_mb(current->state, (state_value)) | |
#else | |
#define set_current_state(state_value) \ | |
__set_current_state(state_value) | |
#endif | |
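/* | |
 * Usage sketch (added note, not in the original header): the state is set | |
 * *before* the wakeup condition is re-checked so a concurrent wake_up() on | |
 * another CPU cannot be missed; set_mb() supplies the required memory | |
 * barrier on SMP. The __wait_event() helpers later in this header follow | |
 * exactly this pattern: | |
 * | |
 *     set_current_state(TASK_INTERRUPTIBLE); | |
 *     if (!condition) | |
 *         schedule(); | |
 *     set_current_state(TASK_RUNNING); | |
 */ | |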
/* | |
* Scheduling policies | |
*/ | |
#define SCHED_OTHER 0 | |
#define SCHED_FIFO 1 | |
#define SCHED_RR 2 | |
/* | |
* This is an additional bit set when we want to | |
* yield the CPU for one re-schedule.. | |
*/ | |
#define SCHED_YIELD 0x10 | |
struct sched_param { | |
int sched_priority; | |
}; | |
struct completion; | |
#ifdef __KERNEL__ | |
#include <linux/spinlock.h> | |
/* | |
* This serializes "schedule()" and also protects | |
* the run-queue from deletions/modifications (but | |
* _adding_ to the beginning of the run-queue has | |
* a separate lock). | |
*/ | |
extern rwlock_t tasklist_lock; | |
extern spinlock_t runqueue_lock; | |
extern spinlock_t mmlist_lock; | |
extern void sched_init(void); | |
extern void init_idle(void); | |
extern void show_state(void); | |
extern void cpu_init (void); | |
extern void trap_init(void); | |
extern void update_process_times(int user); | |
extern void update_one_process(struct task_struct *p, unsigned long user, | |
unsigned long system, int cpu); | |
#define MAX_SCHEDULE_TIMEOUT LONG_MAX | |
extern signed long FASTCALL(schedule_timeout(signed long timeout)); | |
asmlinkage void schedule(void); | |
extern int schedule_task(struct tq_struct *task); | |
extern void flush_scheduled_tasks(void); | |
extern int start_context_thread(void); | |
extern int current_is_keventd(void); | |
/* | |
* The default fd array needs to be at least BITS_PER_LONG, | |
* as this is the granularity returned by copy_fdset(). | |
*/ | |
#define NR_OPEN_DEFAULT BITS_PER_LONG | |
struct namespace; | |
/* | |
* Open file table structure | |
*/ | |
struct files_struct { | |
atomic_t count; | |
rwlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ | |
int max_fds; | |
int max_fdset; | |
int next_fd; | |
struct file ** fd; /* current fd array */ | |
fd_set *close_on_exec; | |
fd_set *open_fds; | |
fd_set close_on_exec_init; | |
fd_set open_fds_init; | |
struct file * fd_array[NR_OPEN_DEFAULT]; | |
}; | |
#define INIT_FILES \ | |
{ \ | |
count: ATOMIC_INIT(1), \ | |
file_lock: RW_LOCK_UNLOCKED, \ | |
max_fds: NR_OPEN_DEFAULT, \ | |
max_fdset: __FD_SETSIZE, \ | |
next_fd: 0, \ | |
fd: &init_files.fd_array[0], \ | |
close_on_exec: &init_files.close_on_exec_init, \ | |
open_fds: &init_files.open_fds_init, \ | |
close_on_exec_init: { { 0, } }, \ | |
open_fds_init: { { 0, } }, \ | |
fd_array: { NULL, } \ | |
} | |
/* Maximum number of active map areas.. This is a random (large) number */ | |
#define DEFAULT_MAX_MAP_COUNT (65536) | |
extern int max_map_count; | |
struct mm_struct { | |
struct vm_area_struct * mmap; /* list of VMAs */ | |
rb_root_t mm_rb; | |
struct vm_area_struct * mmap_cache; /* last find_vma result */ | |
pgd_t * pgd; | |
atomic_t mm_users; /* How many users with user space? */ | |
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | |
int map_count; /* number of VMAs */ | |
struct rw_semaphore mmap_sem; | |
spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ | |
struct list_head mmlist; /* List of all active mm's. These are globally strung | |
* together off init_mm.mmlist, and are protected | |
* by mmlist_lock | |
*/ | |
unsigned long start_code, end_code, start_data, end_data; | |
unsigned long start_brk, brk, start_stack; | |
unsigned long arg_start, arg_end, env_start, env_end; | |
unsigned long rss, total_vm, locked_vm; | |
unsigned long def_flags; | |
unsigned long cpu_vm_mask; | |
unsigned long swap_address; | |
unsigned dumpable:1; | |
/* Architecture-specific MM context */ | |
mm_context_t context; | |
}; | |
extern int mmlist_nr; | |
#define INIT_MM(name) \ | |
{ \ | |
mm_rb: RB_ROOT, \ | |
pgd: swapper_pg_dir, \ | |
mm_users: ATOMIC_INIT(2), \ | |
mm_count: ATOMIC_INIT(1), \ | |
mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ | |
page_table_lock: SPIN_LOCK_UNLOCKED, \ | |
mmlist: LIST_HEAD_INIT(name.mmlist), \ | |
} | |
struct signal_struct { | |
atomic_t count; | |
struct k_sigaction action[_NSIG]; | |
spinlock_t siglock; | |
}; | |
#define INIT_SIGNALS { \ | |
count: ATOMIC_INIT(1), \ | |
action: { {{0,}}, }, \ | |
siglock: SPIN_LOCK_UNLOCKED \ | |
} | |
/* | |
* Some day this will be a full-fledged user tracking system.. | |
*/ | |
struct user_struct { | |
atomic_t __count; /* reference count */ | |
atomic_t processes; /* How many processes does this user have? */ | |
atomic_t files; /* How many open files does this user have? */ | |
/* Hash table maintenance information */ | |
struct user_struct *next, **pprev; | |
uid_t uid; | |
}; | |
#define get_current_user() ({ \ | |
struct user_struct *__user = current->user; \ | |
atomic_inc(&__user->__count); \ | |
__user; }) | |
extern struct user_struct root_user; | |
#define INIT_USER (&root_user) | |
struct task_struct { | |
/* | |
* offsets of these are hardcoded elsewhere - touch with care | |
*/ | |
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | |
unsigned long flags; /* per process flags, defined below */ | |
int sigpending; | |
mm_segment_t addr_limit; /* thread address space: | |
0-0xBFFFFFFF for user-thread | |
0-0xFFFFFFFF for kernel-thread | |
*/ | |
struct exec_domain *exec_domain; | |
volatile long need_resched; | |
unsigned long ptrace; | |
int lock_depth; /* Lock depth */ | |
/* | |
* offset 32 begins here on 32-bit platforms. We keep | |
* all fields in a single cacheline that are needed for | |
* the goodness() loop in schedule(). | |
*/ | |
long counter; | |
long nice; | |
unsigned long policy; | |
struct mm_struct *mm; | |
int processor; | |
/* | |
* cpus_runnable is ~0 if the process is not running on any | |
* CPU. It's (1 << cpu) if it's running on a CPU. This mask | |
* is updated under the runqueue lock. | |
* | |
* To determine whether a process might run on a CPU, this | |
* mask is AND-ed with cpus_allowed. | |
*/ | |
unsigned long cpus_runnable, cpus_allowed; | |
/* | |
* (only the 'next' pointer fits into the cacheline, but | |
* that's just fine.) | |
*/ | |
struct list_head run_list; | |
unsigned long sleep_time; | |
struct task_struct *next_task, *prev_task; | |
struct mm_struct *active_mm; | |
struct list_head local_pages; | |
unsigned int allocation_order, nr_local_pages; | |
/* task state */ | |
struct linux_binfmt *binfmt; | |
int exit_code, exit_signal; | |
int pdeath_signal; /* The signal sent when the parent dies */ | |
/* ??? */ | |
unsigned long personality; | |
int did_exec:1; | |
pid_t pid; | |
pid_t pgrp; | |
pid_t tty_old_pgrp; | |
pid_t session; | |
pid_t tgid; | |
/* boolean value for session group leader */ | |
int leader; | |
/* | |
* pointers to (original) parent process, youngest child, younger sibling, | |
* older sibling, respectively. (p->father can be replaced with | |
* p->p_pptr->pid) | |
*/ | |
struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; | |
struct list_head thread_group; | |
/* PID hash table linkage. */ | |
struct task_struct *pidhash_next; | |
struct task_struct **pidhash_pprev; | |
wait_queue_head_t wait_chldexit; /* for wait4() */ | |
struct completion *vfork_done; /* for vfork() */ | |
unsigned long rt_priority; | |
unsigned long it_real_value, it_prof_value, it_virt_value; | |
unsigned long it_real_incr, it_prof_incr, it_virt_incr; | |
struct timer_list real_timer; | |
struct tms times; | |
unsigned long start_time; | |
long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; | |
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ | |
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; | |
int swappable:1; | |
/* process credentials */ | |
uid_t uid,euid,suid,fsuid; | |
gid_t gid,egid,sgid,fsgid; | |
int ngroups; | |
gid_t groups[NGROUPS]; | |
kernel_cap_t cap_effective, cap_inheritable, cap_permitted; | |
int keep_capabilities:1; | |
struct user_struct *user; | |
/* limits */ | |
struct rlimit rlim[RLIM_NLIMITS]; | |
unsigned short used_math; | |
char comm[16]; | |
/* file system info */ | |
int link_count, total_link_count; | |
struct tty_struct *tty; /* NULL if no tty */ | |
unsigned int locks; /* How many file locks are being held */ | |
/* ipc stuff */ | |
struct sem_undo *semundo; | |
struct sem_queue *semsleeping; | |
/* CPU-specific state of this task */ | |
struct thread_struct thread; | |
/* filesystem information */ | |
struct fs_struct *fs; | |
/* open file information */ | |
struct files_struct *files; | |
/* namespace */ | |
struct namespace *namespace; | |
/* signal handlers */ | |
spinlock_t sigmask_lock; /* Protects signal and blocked */ | |
struct signal_struct *sig; | |
sigset_t blocked; | |
struct sigpending pending; | |
unsigned long sas_ss_sp; | |
size_t sas_ss_size; | |
int (*notifier)(void *priv); | |
void *notifier_data; | |
sigset_t *notifier_mask; | |
/* Thread group tracking */ | |
u32 parent_exec_id; | |
u32 self_exec_id; | |
/* Protection of (de-)allocation: mm, files, fs, tty */ | |
spinlock_t alloc_lock; | |
/* journalling filesystem info */ | |
void *journal_info; | |
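/* Added note: the two fields below are not part of a stock 2.4 task_struct | |
 * and appear to have been added for the lottery-scheduler exercise this | |
 * gist belongs to. */ | |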
int tn; | |
unsigned long cpustart; | |
}; | |
/* | |
* Per process flags | |
*/ | |
#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ | |
/* Not implemented yet, only for 486*/ | |
#define PF_STARTING 0x00000002 /* being created */ | |
#define PF_EXITING 0x00000004 /* getting shut down */ | |
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | |
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | |
#define PF_DUMPCORE 0x00000200 /* dumped core */ | |
#define PF_SIGNALED 0x00000400 /* killed by a signal */ | |
#define PF_MEMALLOC 0x00000800 /* Allocating memory */ | |
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ | |
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ | |
#define PF_NOIO 0x00004000 /* avoid generating further I/O */ | |
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ | |
/* | |
* Ptrace flags | |
*/ | |
#define PT_PTRACED 0x00000001 | |
#define PT_TRACESYS 0x00000002 | |
#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ | |
#define PT_TRACESYSGOOD 0x00000008 | |
#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ | |
/* | |
* Limit the stack to some sane default: root can always | |
* increase this limit if needed.. 8MB seems reasonable. | |
*/ | |
#define _STK_LIM (8*1024*1024) | |
#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ | |
#define MAX_COUNTER (20*HZ/100) | |
#define DEF_NICE (0) | |
extern void yield(void); | |
/* | |
* The default (Linux) execution domain. | |
*/ | |
extern struct exec_domain default_exec_domain; | |
/* | |
* INIT_TASK is used to set up the first task table, touch at | |
* your own risk! Base=0, limit=0x1fffff (=2MB) | |
*/ | |
#define INIT_TASK(tsk) \ | |
{ \ | |
state: 0, \ | |
flags: 0, \ | |
sigpending: 0, \ | |
addr_limit: KERNEL_DS, \ | |
exec_domain: &default_exec_domain, \ | |
lock_depth: -1, \ | |
counter: DEF_COUNTER, \ | |
nice: DEF_NICE, \ | |
policy: SCHED_OTHER, \ | |
mm: NULL, \ | |
active_mm: &init_mm, \ | |
cpus_runnable: -1, \ | |
cpus_allowed: -1, \ | |
run_list: LIST_HEAD_INIT(tsk.run_list), \ | |
next_task: &tsk, \ | |
prev_task: &tsk, \ | |
p_opptr: &tsk, \ | |
p_pptr: &tsk, \ | |
thread_group: LIST_HEAD_INIT(tsk.thread_group), \ | |
wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ | |
real_timer: { \ | |
function: it_real_fn \ | |
}, \ | |
cap_effective: CAP_INIT_EFF_SET, \ | |
cap_inheritable: CAP_INIT_INH_SET, \ | |
cap_permitted: CAP_FULL_SET, \ | |
keep_capabilities: 0, \ | |
rlim: INIT_RLIMITS, \ | |
user: INIT_USER, \ | |
comm: "swapper", \ | |
thread: INIT_THREAD, \ | |
fs: &init_fs, \ | |
files: &init_files, \ | |
sigmask_lock: SPIN_LOCK_UNLOCKED, \ | |
sig: &init_signals, \ | |
pending: { NULL, &tsk.pending.head, {{0}}}, \ | |
blocked: {{0}}, \ | |
alloc_lock: SPIN_LOCK_UNLOCKED, \ | |
journal_info: NULL, \ | |
} | |
#ifndef INIT_TASK_SIZE | |
# define INIT_TASK_SIZE 2048*sizeof(long) | |
#endif | |
union task_union { | |
struct task_struct task; | |
unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; | |
}; | |
extern union task_union init_task_union; | |
extern struct mm_struct init_mm; | |
extern struct task_struct *init_tasks[NR_CPUS]; | |
/* PID hashing. (shouldn't this be dynamic?) */ | |
#define PIDHASH_SZ (4096 >> 2) | |
extern struct task_struct *pidhash[PIDHASH_SZ]; | |
#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) | |
static inline void hash_pid(struct task_struct *p) | |
{ | |
struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; | |
if((p->pidhash_next = *htable) != NULL) | |
(*htable)->pidhash_pprev = &p->pidhash_next; | |
*htable = p; | |
p->pidhash_pprev = htable; | |
} | |
static inline void unhash_pid(struct task_struct *p) | |
{ | |
if(p->pidhash_next) | |
p->pidhash_next->pidhash_pprev = p->pidhash_pprev; | |
*p->pidhash_pprev = p->pidhash_next; | |
} | |
static inline struct task_struct *find_task_by_pid(int pid) | |
{ | |
struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; | |
for(p = *htable; p && p->pid != pid; p = p->pidhash_next) | |
; | |
return p; | |
} | |
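/* | |
 * Illustrative usage (added comment, not in the original header): pid-hash | |
 * lookups must be made with tasklist_lock held for reading, and the result | |
 * only used while the lock is held, e.g.: | |
 * | |
 *     struct task_struct *p; | |
 *     read_lock(&tasklist_lock); | |
 *     p = find_task_by_pid(pid); | |
 *     if (p) | |
 *         send_sig(SIGTERM, p, 1); | |
 *     read_unlock(&tasklist_lock); | |
 */ | |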
#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) | |
static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) | |
{ | |
tsk->processor = cpu; | |
tsk->cpus_runnable = 1UL << cpu; | |
} | |
static inline void task_release_cpu(struct task_struct *tsk) | |
{ | |
tsk->cpus_runnable = ~0UL; | |
} | |
/* per-UID process charging. */ | |
extern struct user_struct * alloc_uid(uid_t); | |
extern void free_uid(struct user_struct *); | |
#include <asm/current.h> | |
extern unsigned long volatile jiffies; | |
extern unsigned long itimer_ticks; | |
extern unsigned long itimer_next; | |
extern struct timeval xtime; | |
extern void do_timer(struct pt_regs *); | |
extern unsigned int * prof_buffer; | |
extern unsigned long prof_len; | |
extern unsigned long prof_shift; | |
#define CURRENT_TIME (xtime.tv_sec) | |
extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); | |
extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); | |
extern void FASTCALL(sleep_on(wait_queue_head_t *q)); | |
extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, | |
signed long timeout)); | |
extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); | |
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, | |
signed long timeout)); | |
extern int FASTCALL(wake_up_process(struct task_struct * tsk)); | |
#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) | |
#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) | |
#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) | |
#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) | |
#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) | |
#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) | |
#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) | |
#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) | |
#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) | |
#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) | |
asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); | |
extern int in_group_p(gid_t); | |
extern int in_egroup_p(gid_t); | |
extern void proc_caches_init(void); | |
extern void flush_signals(struct task_struct *); | |
extern void flush_signal_handlers(struct task_struct *); | |
extern void sig_exit(int, int, struct siginfo *); | |
extern int dequeue_signal(sigset_t *, siginfo_t *); | |
extern void block_all_signals(int (*notifier)(void *priv), void *priv, | |
sigset_t *mask); | |
extern void unblock_all_signals(void); | |
extern int send_sig_info(int, struct siginfo *, struct task_struct *); | |
extern int force_sig_info(int, struct siginfo *, struct task_struct *); | |
extern int kill_pg_info(int, struct siginfo *, pid_t); | |
extern int kill_sl_info(int, struct siginfo *, pid_t); | |
extern int kill_proc_info(int, struct siginfo *, pid_t); | |
extern void notify_parent(struct task_struct *, int); | |
extern void do_notify_parent(struct task_struct *, int); | |
extern void force_sig(int, struct task_struct *); | |
extern int send_sig(int, struct task_struct *, int); | |
extern int kill_pg(pid_t, int, int); | |
extern int kill_sl(pid_t, int, int); | |
extern int kill_proc(pid_t, int, int); | |
extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); | |
extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); | |
static inline int signal_pending(struct task_struct *p) | |
{ | |
return (p->sigpending != 0); | |
} | |
/* | |
* Re-calculate pending state from the set of locally pending | |
* signals, globally pending signals, and blocked signals. | |
*/ | |
static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |
{ | |
unsigned long ready; | |
long i; | |
switch (_NSIG_WORDS) { | |
default: | |
for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) | |
ready |= signal->sig[i] &~ blocked->sig[i]; | |
break; | |
case 4: ready = signal->sig[3] &~ blocked->sig[3]; | |
ready |= signal->sig[2] &~ blocked->sig[2]; | |
ready |= signal->sig[1] &~ blocked->sig[1]; | |
ready |= signal->sig[0] &~ blocked->sig[0]; | |
break; | |
case 2: ready = signal->sig[1] &~ blocked->sig[1]; | |
ready |= signal->sig[0] &~ blocked->sig[0]; | |
break; | |
case 1: ready = signal->sig[0] &~ blocked->sig[0]; | |
} | |
return ready != 0; | |
} | |
/* Reevaluate whether the task has signals pending delivery. | |
This is required every time the blocked sigset_t changes. | |
All callers should have t->sigmask_lock. */ | |
static inline void recalc_sigpending(struct task_struct *t) | |
{ | |
t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); | |
} | |
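/* | |
 * Usage sketch (added, not in the original header): whenever a task's | |
 * blocked set changes, sigpending must be recomputed under sigmask_lock; | |
 * `sig' below is a hypothetical signal number: | |
 * | |
 *     spin_lock_irq(&current->sigmask_lock); | |
 *     sigaddset(&current->blocked, sig); | |
 *     recalc_sigpending(current); | |
 *     spin_unlock_irq(&current->sigmask_lock); | |
 */ | |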
/* True if we are on the alternate signal stack. */ | |
static inline int on_sig_stack(unsigned long sp) | |
{ | |
return (sp - current->sas_ss_sp < current->sas_ss_size); | |
} | |
static inline int sas_ss_flags(unsigned long sp) | |
{ | |
return (current->sas_ss_size == 0 ? SS_DISABLE | |
: on_sig_stack(sp) ? SS_ONSTACK : 0); | |
} | |
extern int request_irq(unsigned int, | |
void (*handler)(int, void *, struct pt_regs *), | |
unsigned long, const char *, void *); | |
extern void free_irq(unsigned int, void *); | |
/* | |
* This has now become a routine instead of a macro; it sets a flag if | |
* it returns true (to do BSD-style accounting where the process is flagged | |
* if it uses root privs). The implication of this is that you should do | |
* normal permissions checks first, and check suser() last. | |
* | |
* [Dec 1997 -- Chris Evans] | |
* For correctness, the above considerations need to be extended to | |
* fsuser(). This is done, along with moving fsuser() checks to be | |
* last. | |
* | |
* These will be removed, but in the mean time, when the SECURE_NOROOT | |
* flag is set, uids don't grant privilege. | |
*/ | |
static inline int suser(void) | |
{ | |
if (!issecure(SECURE_NOROOT) && current->euid == 0) { | |
current->flags |= PF_SUPERPRIV; | |
return 1; | |
} | |
return 0; | |
} | |
static inline int fsuser(void) | |
{ | |
if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { | |
current->flags |= PF_SUPERPRIV; | |
return 1; | |
} | |
return 0; | |
} | |
/* | |
* capable() checks for a particular capability. | |
* New privilege checks should use this interface, rather than suser() or | |
* fsuser(). See include/linux/capability.h for defined capabilities. | |
*/ | |
static inline int capable(int cap) | |
{ | |
#if 1 /* ok now */ | |
if (cap_raised(current->cap_effective, cap)) | |
#else | |
if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0) | |
#endif | |
{ | |
current->flags |= PF_SUPERPRIV; | |
return 1; | |
} | |
return 0; | |
} | |
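/* | |
 * Typical use (added note): a privileged operation tests one specific | |
 * capability and fails with -EPERM otherwise, e.g.: | |
 * | |
 *     if (!capable(CAP_SYS_ADMIN)) | |
 *         return -EPERM; | |
 */ | |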
/* | |
* Routines for handling mm_structs | |
*/ | |
extern struct mm_struct * mm_alloc(void); | |
extern struct mm_struct * start_lazy_tlb(void); | |
extern void end_lazy_tlb(struct mm_struct *mm); | |
/* mmdrop drops the mm and the page tables */ | |
extern inline void FASTCALL(__mmdrop(struct mm_struct *)); | |
static inline void mmdrop(struct mm_struct * mm) | |
{ | |
if (atomic_dec_and_test(&mm->mm_count)) | |
__mmdrop(mm); | |
} | |
/* mmput gets rid of the mappings and all user-space */ | |
extern void mmput(struct mm_struct *); | |
/* Remove the current task's stale references to the old mm_struct */ | |
extern void mm_release(void); | |
/* | |
* Routines for handling the fd arrays | |
*/ | |
extern struct file ** alloc_fd_array(int); | |
extern int expand_fd_array(struct files_struct *, int nr); | |
extern void free_fd_array(struct file **, int); | |
extern fd_set *alloc_fdset(int); | |
extern int expand_fdset(struct files_struct *, int nr); | |
extern void free_fdset(fd_set *, int); | |
extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | |
extern void flush_thread(void); | |
extern void exit_thread(void); | |
extern void exit_mm(struct task_struct *); | |
extern void exit_files(struct task_struct *); | |
extern void exit_sighand(struct task_struct *); | |
extern void reparent_to_init(void); | |
extern void daemonize(void); | |
extern int do_execve(char *, char **, char **, struct pt_regs *); | |
extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); | |
extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); | |
extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); | |
extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); | |
#define __wait_event(wq, condition) \ | |
do { \ | |
wait_queue_t __wait; \ | |
init_waitqueue_entry(&__wait, current); \ | |
\ | |
add_wait_queue(&wq, &__wait); \ | |
for (;;) { \ | |
set_current_state(TASK_UNINTERRUPTIBLE); \ | |
if (condition) \ | |
break; \ | |
schedule(); \ | |
} \ | |
current->state = TASK_RUNNING; \ | |
remove_wait_queue(&wq, &__wait); \ | |
} while (0) | |
#define wait_event(wq, condition) \ | |
do { \ | |
if (condition) \ | |
break; \ | |
__wait_event(wq, condition); \ | |
} while (0) | |
#define __wait_event_interruptible(wq, condition, ret) \ | |
do { \ | |
wait_queue_t __wait; \ | |
init_waitqueue_entry(&__wait, current); \ | |
\ | |
add_wait_queue(&wq, &__wait); \ | |
for (;;) { \ | |
set_current_state(TASK_INTERRUPTIBLE); \ | |
if (condition) \ | |
break; \ | |
if (!signal_pending(current)) { \ | |
schedule(); \ | |
continue; \ | |
} \ | |
ret = -ERESTARTSYS; \ | |
break; \ | |
} \ | |
current->state = TASK_RUNNING; \ | |
remove_wait_queue(&wq, &__wait); \ | |
} while (0) | |
#define wait_event_interruptible(wq, condition) \ | |
({ \ | |
int __ret = 0; \ | |
if (!(condition)) \ | |
__wait_event_interruptible(wq, condition, __ret); \ | |
__ret; \ | |
}) | |
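/* | |
 * Illustrative producer/consumer sketch (added; not in the original header). | |
 * The names my_waitq and my_flag are hypothetical: | |
 * | |
 *     static DECLARE_WAIT_QUEUE_HEAD(my_waitq); | |
 *     static int my_flag; | |
 * | |
 *     waiter:   if (wait_event_interruptible(my_waitq, my_flag != 0)) | |
 *                   return -ERESTARTSYS;    (woken early by a signal) | |
 * | |
 *     waker:    my_flag = 1; | |
 *               wake_up_interruptible(&my_waitq); | |
 */ | |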
#define REMOVE_LINKS(p) do { \ | |
(p)->next_task->prev_task = (p)->prev_task; \ | |
(p)->prev_task->next_task = (p)->next_task; \ | |
if ((p)->p_osptr) \ | |
(p)->p_osptr->p_ysptr = (p)->p_ysptr; \ | |
if ((p)->p_ysptr) \ | |
(p)->p_ysptr->p_osptr = (p)->p_osptr; \ | |
else \ | |
(p)->p_pptr->p_cptr = (p)->p_osptr; \ | |
} while (0) | |
#define SET_LINKS(p) do { \ | |
(p)->next_task = &init_task; \ | |
(p)->prev_task = init_task.prev_task; \ | |
init_task.prev_task->next_task = (p); \ | |
init_task.prev_task = (p); \ | |
(p)->p_ysptr = NULL; \ | |
if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ | |
(p)->p_osptr->p_ysptr = p; \ | |
(p)->p_pptr->p_cptr = p; \ | |
} while (0) | |
#define for_each_task(p) \ | |
for (p = &init_task ; (p = p->next_task) != &init_task ; ) | |
#define for_each_thread(task) \ | |
for (task = next_thread(current) ; task != current ; task = next_thread(task)) | |
#define next_thread(p) \ | |
list_entry((p)->thread_group.next, struct task_struct, thread_group) | |
#define thread_group_leader(p) (p->pid == p->tgid) | |
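/* | |
 * Illustrative traversal (added comment): the global task list is walked | |
 * under tasklist_lock, exactly as show_state() does in sched.c, e.g. to | |
 * count runnable tasks: | |
 * | |
 *     struct task_struct *p; | |
 *     int runnable = 0; | |
 *     read_lock(&tasklist_lock); | |
 *     for_each_task(p) | |
 *         if (p->state == TASK_RUNNING) | |
 *             runnable++; | |
 *     read_unlock(&tasklist_lock); | |
 */ | |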
static inline void del_from_runqueue(struct task_struct * p) | |
{ | |
nr_running--; | |
p->sleep_time = jiffies; | |
list_del(&p->run_list); | |
p->run_list.next = NULL; | |
} | |
static inline int task_on_runqueue(struct task_struct *p) | |
{ | |
return (p->run_list.next != NULL); | |
} | |
static inline void unhash_process(struct task_struct *p) | |
{ | |
if (task_on_runqueue(p)) | |
out_of_line_bug(); | |
write_lock_irq(&tasklist_lock); | |
nr_threads--; | |
unhash_pid(p); | |
REMOVE_LINKS(p); | |
list_del(&p->thread_group); | |
write_unlock_irq(&tasklist_lock); | |
} | |
/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ | |
static inline void task_lock(struct task_struct *p) | |
{ | |
spin_lock(&p->alloc_lock); | |
} | |
static inline void task_unlock(struct task_struct *p) | |
{ | |
spin_unlock(&p->alloc_lock); | |
} | |
/* write full pathname into buffer and return start of pathname */ | |
static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, | |
char *buf, int buflen) | |
{ | |
char *res; | |
struct vfsmount *rootmnt; | |
struct dentry *root; | |
read_lock(¤t->fs->lock); | |
rootmnt = mntget(current->fs->rootmnt); | |
root = dget(current->fs->root); | |
read_unlock(¤t->fs->lock); | |
spin_lock(&dcache_lock); | |
res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); | |
spin_unlock(&dcache_lock); | |
dput(root); | |
mntput(rootmnt); | |
return res; | |
} | |
static inline int need_resched(void) | |
{ | |
return (unlikely(current->need_resched)); | |
} | |
extern void __cond_resched(void); | |
static inline void cond_resched(void) | |
{ | |
if (need_resched()) | |
__cond_resched(); | |
} | |
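/* | |
 * Added note: long-running kernel loops can call cond_resched() each | |
 * iteration so that a pending reschedule (current->need_resched) is | |
 * honoured promptly; process_one_item() below is a hypothetical helper: | |
 * | |
 *     for (i = 0; i < nr_items; i++) { | |
 *         process_one_item(i); | |
 *         cond_resched(); | |
 *     } | |
 */ | |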
#endif /* __KERNEL__ */ | |
#endif |