lottery
/* | |
* linux/kernel/fork.c | |
* | |
* Copyright (C) 1991, 1992 Linus Torvalds | |
*/ | |
/* | |
* 'fork.c' contains the help-routines for the 'fork' system call | |
* (see also entry.S and others). | |
* Fork is rather simple, once you get the hang of it, but the memory | |
* management can be a bitch. See 'mm/memory.c': 'copy_page_range()' | |
*/ | |
#include <linux/config.h> | |
#include <linux/slab.h> | |
#include <linux/init.h> | |
#include <linux/unistd.h> | |
#include <linux/smp_lock.h> | |
#include <linux/module.h> | |
#include <linux/vmalloc.h> | |
#include <linux/completion.h> | |
#include <linux/namespace.h> | |
#include <linux/personality.h> | |
#include <linux/compiler.h> | |
#include <asm/pgtable.h> | |
#include <asm/pgalloc.h> | |
#include <asm/uaccess.h> | |
#include <asm/mmu_context.h> | |
/* The idle threads do not count.. */ | |
int nr_threads; | |
int nr_running; | |
int max_threads; | |
unsigned long total_forks; /* Handle normal Linux uptimes. */ | |
int last_pid; | |
struct task_struct *pidhash[PIDHASH_SZ]; | |
void add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) | |
{ | |
unsigned long flags; | |
wait->flags &= ~WQ_FLAG_EXCLUSIVE; | |
wq_write_lock_irqsave(&q->lock, flags); | |
__add_wait_queue(q, wait); | |
wq_write_unlock_irqrestore(&q->lock, flags); | |
} | |
void add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait) | |
{ | |
unsigned long flags; | |
wait->flags |= WQ_FLAG_EXCLUSIVE; | |
wq_write_lock_irqsave(&q->lock, flags); | |
__add_wait_queue_tail(q, wait); | |
wq_write_unlock_irqrestore(&q->lock, flags); | |
} | |
void remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait) | |
{ | |
unsigned long flags; | |
wq_write_lock_irqsave(&q->lock, flags); | |
__remove_wait_queue(q, wait); | |
wq_write_unlock_irqrestore(&q->lock, flags); | |
} | |
void __init fork_init(unsigned long mempages) | |
{ | |
/* | |
* The default maximum number of threads is set to a safe | |
* value: the thread structures can take up at most half | |
* of memory. | |
*/ | |
max_threads = mempages / (THREAD_SIZE/PAGE_SIZE) / 8; | |
init_task.rlim[RLIMIT_NPROC].rlim_cur = max_threads/2; | |
init_task.rlim[RLIMIT_NPROC].rlim_max = max_threads/2; | |
} | |
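/*
* A worked example of the limit above, assuming x86-style values of
* PAGE_SIZE = 4KB and THREAD_SIZE = 8KB (so THREAD_SIZE/PAGE_SIZE == 2):
* with 128MB of RAM, mempages == 32768, giving max_threads = 32768/2/8 =
* 2048 and an RLIMIT_NPROC of 1024 for the init task.
*/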
/* Protects next_safe and last_pid. */ | |
spinlock_t lastpid_lock = SPIN_LOCK_UNLOCKED; | |
static int get_pid(unsigned long flags) | |
{ | |
static int next_safe = PID_MAX; | |
struct task_struct *p; | |
int pid, beginpid; | |
if (flags & CLONE_PID) | |
return current->pid; | |
spin_lock(&lastpid_lock); | |
beginpid = last_pid; | |
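/* The mask below catches any PID of 32768 (0x8000) or above: PIDs are kept
* to 15 bits, and on wrap-around allocation restarts at 300 so the low PIDs
* typically held by long-lived daemons are skipped. */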
if((++last_pid) & 0xffff8000) { | |
last_pid = 300; /* Skip daemons etc. */ | |
goto inside; | |
} | |
if(last_pid >= next_safe) { | |
inside: | |
next_safe = PID_MAX; | |
read_lock(&tasklist_lock); | |
repeat: | |
for_each_task(p) { | |
if(p->pid == last_pid || | |
p->pgrp == last_pid || | |
p->tgid == last_pid || | |
p->session == last_pid) { | |
if(++last_pid >= next_safe) { | |
if(last_pid & 0xffff8000) | |
last_pid = 300; | |
next_safe = PID_MAX; | |
} | |
if(unlikely(last_pid == beginpid)) | |
goto nomorepids; | |
goto repeat; | |
} | |
if(p->pid > last_pid && next_safe > p->pid) | |
next_safe = p->pid; | |
if(p->pgrp > last_pid && next_safe > p->pgrp) | |
next_safe = p->pgrp; | |
if(p->tgid > last_pid && next_safe > p->tgid) | |
next_safe = p->tgid; | |
if(p->session > last_pid && next_safe > p->session) | |
next_safe = p->session; | |
} | |
read_unlock(&tasklist_lock); | |
} | |
pid = last_pid; | |
spin_unlock(&lastpid_lock); | |
return pid; | |
nomorepids: | |
read_unlock(&tasklist_lock); | |
spin_unlock(&lastpid_lock); | |
return 0; | |
} | |
static inline int dup_mmap(struct mm_struct * mm) | |
{ | |
struct vm_area_struct * mpnt, *tmp, **pprev; | |
int retval; | |
flush_cache_mm(current->mm); | |
mm->locked_vm = 0; | |
mm->mmap = NULL; | |
mm->mmap_cache = NULL; | |
mm->map_count = 0; | |
mm->rss = 0; | |
mm->cpu_vm_mask = 0; | |
mm->swap_address = 0; | |
pprev = &mm->mmap; | |
/* | |
* Add it to the mmlist after the parent. | |
* Doing it this way means that we can order the list, | |
* and fork() won't mess up the ordering significantly. | |
* Add it first so that swapoff can see any swap entries. | |
*/ | |
spin_lock(&mmlist_lock); | |
list_add(&mm->mmlist, ¤t->mm->mmlist); | |
mmlist_nr++; | |
spin_unlock(&mmlist_lock); | |
for (mpnt = current->mm->mmap ; mpnt ; mpnt = mpnt->vm_next) { | |
struct file *file; | |
retval = -ENOMEM; | |
if(mpnt->vm_flags & VM_DONTCOPY) | |
continue; | |
tmp = kmem_cache_alloc(vm_area_cachep, SLAB_KERNEL); | |
if (!tmp) | |
goto fail_nomem; | |
*tmp = *mpnt; | |
tmp->vm_flags &= ~VM_LOCKED; | |
tmp->vm_mm = mm; | |
tmp->vm_next = NULL; | |
file = tmp->vm_file; | |
if (file) { | |
struct inode *inode = file->f_dentry->d_inode; | |
get_file(file); | |
if (tmp->vm_flags & VM_DENYWRITE) | |
atomic_dec(&inode->i_writecount); | |
/* insert tmp into the share list, just after mpnt */ | |
spin_lock(&inode->i_mapping->i_shared_lock); | |
if((tmp->vm_next_share = mpnt->vm_next_share) != NULL) | |
mpnt->vm_next_share->vm_pprev_share = | |
&tmp->vm_next_share; | |
mpnt->vm_next_share = tmp; | |
tmp->vm_pprev_share = &mpnt->vm_next_share; | |
spin_unlock(&inode->i_mapping->i_shared_lock); | |
} | |
/* | |
* Link in the new vma and copy the page table entries: | |
* link in first so that swapoff can see swap entries. | |
*/ | |
spin_lock(&mm->page_table_lock); | |
*pprev = tmp; | |
pprev = &tmp->vm_next; | |
mm->map_count++; | |
retval = copy_page_range(mm, current->mm, tmp); | |
spin_unlock(&mm->page_table_lock); | |
if (tmp->vm_ops && tmp->vm_ops->open) | |
tmp->vm_ops->open(tmp); | |
if (retval) | |
goto fail_nomem; | |
} | |
retval = 0; | |
build_mmap_rb(mm); | |
fail_nomem: | |
flush_tlb_mm(current->mm); | |
return retval; | |
} | |
spinlock_t mmlist_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; | |
int mmlist_nr; | |
#define allocate_mm() (kmem_cache_alloc(mm_cachep, SLAB_KERNEL)) | |
#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) | |
static struct mm_struct * mm_init(struct mm_struct * mm) | |
{ | |
atomic_set(&mm->mm_users, 1); | |
atomic_set(&mm->mm_count, 1); | |
init_rwsem(&mm->mmap_sem); | |
mm->page_table_lock = SPIN_LOCK_UNLOCKED; | |
mm->pgd = pgd_alloc(mm); | |
mm->def_flags = 0; | |
if (mm->pgd) | |
return mm; | |
free_mm(mm); | |
return NULL; | |
} | |
/* | |
* Allocate and initialize an mm_struct. | |
*/ | |
struct mm_struct * mm_alloc(void) | |
{ | |
struct mm_struct * mm; | |
mm = allocate_mm(); | |
if (mm) { | |
memset(mm, 0, sizeof(*mm)); | |
return mm_init(mm); | |
} | |
return NULL; | |
} | |
/* | |
* Called when the last reference to the mm | |
* is dropped: either by a lazy thread or by | |
* mmput. Free the page directory and the mm. | |
*/ | |
inline void __mmdrop(struct mm_struct *mm) | |
{ | |
BUG_ON(mm == &init_mm); | |
pgd_free(mm->pgd); | |
destroy_context(mm); | |
free_mm(mm); | |
} | |
/* | |
* Decrement the use count and release all resources for an mm. | |
*/ | |
void mmput(struct mm_struct *mm) | |
{ | |
if (atomic_dec_and_lock(&mm->mm_users, &mmlist_lock)) { | |
extern struct mm_struct *swap_mm; | |
if (swap_mm == mm) | |
swap_mm = list_entry(mm->mmlist.next, struct mm_struct, mmlist); | |
list_del(&mm->mmlist); | |
mmlist_nr--; | |
spin_unlock(&mmlist_lock); | |
exit_mmap(mm); | |
mmdrop(mm); | |
} | |
} | |
/* Please note the differences between mmput and mm_release. | |
* mmput is called whenever we stop holding onto a mm_struct, | |
* error, success, whatever.
* | |
* mm_release is called after a mm_struct has been removed | |
* from the current process. | |
* | |
* This difference is important for error handling, when we | |
* only half set up a mm_struct for a new process and need to restore | |
* the old one. Because we mmput the new mm_struct before | |
* restoring the old one. . . | |
* Eric Biederman 10 January 1998 | |
*/ | |
void mm_release(void) | |
{ | |
struct task_struct *tsk = current; | |
struct completion *vfork_done = tsk->vfork_done; | |
/* notify parent sleeping on vfork() */ | |
if (vfork_done) { | |
tsk->vfork_done = NULL; | |
complete(vfork_done); | |
} | |
} | |
static int copy_mm(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
struct mm_struct * mm, *oldmm; | |
int retval; | |
tsk->min_flt = tsk->maj_flt = 0; | |
tsk->cmin_flt = tsk->cmaj_flt = 0; | |
tsk->nswap = tsk->cnswap = 0; | |
tsk->mm = NULL; | |
tsk->active_mm = NULL; | |
/* | |
* Are we cloning a kernel thread? | |
* | |
* We need to steal an active VM for that.
*/ | |
oldmm = current->mm; | |
if (!oldmm) | |
return 0; | |
if (clone_flags & CLONE_VM) { | |
atomic_inc(&oldmm->mm_users); | |
mm = oldmm; | |
goto good_mm; | |
} | |
retval = -ENOMEM; | |
mm = allocate_mm(); | |
if (!mm) | |
goto fail_nomem; | |
/* Copy the current MM stuff.. */ | |
memcpy(mm, oldmm, sizeof(*mm)); | |
if (!mm_init(mm)) | |
goto fail_nomem; | |
if (init_new_context(tsk,mm)) | |
goto free_pt; | |
down_write(&oldmm->mmap_sem); | |
retval = dup_mmap(mm); | |
up_write(&oldmm->mmap_sem); | |
if (retval) | |
goto free_pt; | |
/* | |
* child gets a private LDT (if there was an LDT in the parent) | |
*/ | |
copy_segments(tsk, mm); | |
good_mm: | |
tsk->mm = mm; | |
tsk->active_mm = mm; | |
return 0; | |
free_pt: | |
mmput(mm); | |
fail_nomem: | |
return retval; | |
} | |
static inline struct fs_struct *__copy_fs_struct(struct fs_struct *old) | |
{ | |
struct fs_struct *fs = kmem_cache_alloc(fs_cachep, GFP_KERNEL); | |
/* We don't need to lock fs - think why ;-) */ | |
if (fs) { | |
atomic_set(&fs->count, 1); | |
fs->lock = RW_LOCK_UNLOCKED; | |
fs->umask = old->umask; | |
read_lock(&old->lock); | |
fs->rootmnt = mntget(old->rootmnt); | |
fs->root = dget(old->root); | |
fs->pwdmnt = mntget(old->pwdmnt); | |
fs->pwd = dget(old->pwd); | |
if (old->altroot) { | |
fs->altrootmnt = mntget(old->altrootmnt); | |
fs->altroot = dget(old->altroot); | |
} else { | |
fs->altrootmnt = NULL; | |
fs->altroot = NULL; | |
} | |
read_unlock(&old->lock); | |
} | |
return fs; | |
} | |
struct fs_struct *copy_fs_struct(struct fs_struct *old) | |
{ | |
return __copy_fs_struct(old); | |
} | |
static inline int copy_fs(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
if (clone_flags & CLONE_FS) { | |
atomic_inc(¤t->fs->count); | |
return 0; | |
} | |
tsk->fs = __copy_fs_struct(current->fs); | |
if (!tsk->fs) | |
return -1; | |
return 0; | |
} | |
static int count_open_files(struct files_struct *files, int size) | |
{ | |
int i; | |
/* Find the last open fd */ | |
for (i = size/(8*sizeof(long)); i > 0; ) { | |
if (files->open_fds->fds_bits[--i]) | |
break; | |
} | |
i = (i+1) * 8 * sizeof(long); | |
return i; | |
} | |
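/*
* A small worked example, assuming 32-bit longs: with size == 256 the loop
* scans 256/32 == 8 bitmap words from the top; if the highest word with an
* open fd is fds_bits[2], the result is (2+1) * 32 == 96, i.e. the count is
* rounded up to a whole bitmap word.
*/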
static int copy_files(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
struct files_struct *oldf, *newf; | |
struct file **old_fds, **new_fds; | |
int open_files, nfds, size, i, error = 0; | |
/* | |
* A background process may not have any files ... | |
*/ | |
oldf = current->files; | |
if (!oldf) | |
goto out; | |
if (clone_flags & CLONE_FILES) { | |
atomic_inc(&oldf->count); | |
goto out; | |
} | |
tsk->files = NULL; | |
error = -ENOMEM; | |
newf = kmem_cache_alloc(files_cachep, SLAB_KERNEL); | |
if (!newf) | |
goto out; | |
atomic_set(&newf->count, 1); | |
newf->file_lock = RW_LOCK_UNLOCKED; | |
newf->next_fd = 0; | |
newf->max_fds = NR_OPEN_DEFAULT; | |
newf->max_fdset = __FD_SETSIZE; | |
newf->close_on_exec = &newf->close_on_exec_init; | |
newf->open_fds = &newf->open_fds_init; | |
newf->fd = &newf->fd_array[0]; | |
/* We don't yet have the oldf readlock, but even if the old | |
fdset gets grown now, we'll only copy up to "size" fds */ | |
size = oldf->max_fdset; | |
if (size > __FD_SETSIZE) { | |
newf->max_fdset = 0; | |
write_lock(&newf->file_lock); | |
error = expand_fdset(newf, size-1); | |
write_unlock(&newf->file_lock); | |
if (error) | |
goto out_release; | |
} | |
read_lock(&oldf->file_lock); | |
open_files = count_open_files(oldf, size); | |
/* | |
* Check whether we need to allocate a larger fd array. | |
* Note: we're not a clone task, so the open count won't | |
* change. | |
*/ | |
nfds = NR_OPEN_DEFAULT; | |
if (open_files > nfds) { | |
read_unlock(&oldf->file_lock); | |
newf->max_fds = 0; | |
write_lock(&newf->file_lock); | |
error = expand_fd_array(newf, open_files-1); | |
write_unlock(&newf->file_lock); | |
if (error) | |
goto out_release; | |
nfds = newf->max_fds; | |
read_lock(&oldf->file_lock); | |
} | |
old_fds = oldf->fd; | |
new_fds = newf->fd; | |
memcpy(newf->open_fds->fds_bits, oldf->open_fds->fds_bits, open_files/8); | |
memcpy(newf->close_on_exec->fds_bits, oldf->close_on_exec->fds_bits, open_files/8); | |
for (i = open_files; i != 0; i--) { | |
struct file *f = *old_fds++; | |
if (f) | |
get_file(f); | |
*new_fds++ = f; | |
} | |
read_unlock(&oldf->file_lock); | |
/* compute the remainder to be cleared */ | |
size = (newf->max_fds - open_files) * sizeof(struct file *); | |
/* This is long word aligned thus could use an optimized version */
memset(new_fds, 0, size); | |
if (newf->max_fdset > open_files) { | |
int left = (newf->max_fdset-open_files)/8; | |
int start = open_files / (8 * sizeof(unsigned long)); | |
memset(&newf->open_fds->fds_bits[start], 0, left); | |
memset(&newf->close_on_exec->fds_bits[start], 0, left); | |
} | |
tsk->files = newf; | |
error = 0; | |
out: | |
return error; | |
out_release: | |
free_fdset (newf->close_on_exec, newf->max_fdset); | |
free_fdset (newf->open_fds, newf->max_fdset); | |
kmem_cache_free(files_cachep, newf); | |
goto out; | |
} | |
static inline int copy_sighand(unsigned long clone_flags, struct task_struct * tsk) | |
{ | |
struct signal_struct *sig; | |
if (clone_flags & CLONE_SIGHAND) { | |
atomic_inc(¤t->sig->count); | |
return 0; | |
} | |
sig = kmem_cache_alloc(sigact_cachep, GFP_KERNEL); | |
tsk->sig = sig; | |
if (!sig) | |
return -1; | |
spin_lock_init(&sig->siglock); | |
atomic_set(&sig->count, 1); | |
memcpy(tsk->sig->action, current->sig->action, sizeof(tsk->sig->action)); | |
return 0; | |
} | |
static inline void copy_flags(unsigned long clone_flags, struct task_struct *p) | |
{ | |
unsigned long new_flags = p->flags; | |
new_flags &= ~(PF_SUPERPRIV | PF_USEDFPU); | |
new_flags |= PF_FORKNOEXEC; | |
if (!(clone_flags & CLONE_PTRACE)) | |
p->ptrace = 0; | |
p->flags = new_flags; | |
} | |
/* | |
* Ok, this is the main fork-routine. It copies the system process | |
* information (task[nr]) and sets up the necessary registers. It also | |
* copies the data segment in its entirety. The "stack_start" and | |
* "stack_top" arguments are simply passed along to the platform | |
* specific copy_thread() routine. Most platforms ignore stack_top. | |
* For an example that's using stack_top, see | |
* arch/ia64/kernel/process.c. | |
*/ | |
int do_fork(unsigned long clone_flags, unsigned long stack_start, | |
struct pt_regs *regs, unsigned long stack_size) | |
{ | |
int retval; | |
struct task_struct *p; | |
struct completion vfork; | |
if ((clone_flags & (CLONE_NEWNS|CLONE_FS)) == (CLONE_NEWNS|CLONE_FS)) | |
return -EINVAL; | |
retval = -EPERM; | |
/* | |
* CLONE_PID is only allowed for the initial SMP swapper | |
* calls | |
*/ | |
if (clone_flags & CLONE_PID) { | |
if (current->pid) | |
goto fork_out; | |
} | |
retval = -ENOMEM; | |
p = alloc_task_struct(); | |
if (!p) | |
goto fork_out; | |
*p = *current; | |
retval = -EAGAIN; | |
/* | |
* Check if we are over our maximum process limit, but be sure to | |
* exclude root. This is needed to make it possible for login and | |
* friends to set the per-user process limit to something lower | |
* than the amount of processes root is running. -- Rik | |
*/ | |
if (atomic_read(&p->user->processes) >= p->rlim[RLIMIT_NPROC].rlim_cur | |
&& !capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_RESOURCE)) | |
goto bad_fork_free; | |
atomic_inc(&p->user->__count); | |
atomic_inc(&p->user->processes); | |
/* | |
* Counter increases are protected by | |
* the kernel lock so nr_threads can't | |
* increase under us (but it may decrease). | |
*/ | |
if (nr_threads >= max_threads) | |
goto bad_fork_cleanup_count; | |
get_exec_domain(p->exec_domain); | |
if (p->binfmt && p->binfmt->module) | |
__MOD_INC_USE_COUNT(p->binfmt->module); | |
p->did_exec = 0; | |
p->swappable = 0; | |
p->state = TASK_UNINTERRUPTIBLE; | |
copy_flags(clone_flags, p); | |
p->pid = get_pid(clone_flags); | |
if (p->pid == 0 && current->pid != 0) | |
goto bad_fork_cleanup; | |
p->run_list.next = NULL; | |
p->run_list.prev = NULL; | |
p->p_cptr = NULL; | |
init_waitqueue_head(&p->wait_chldexit); | |
p->vfork_done = NULL; | |
if (clone_flags & CLONE_VFORK) { | |
p->vfork_done = &vfork; | |
init_completion(&vfork); | |
} | |
spin_lock_init(&p->alloc_lock); | |
p->sigpending = 0; | |
init_sigpending(&p->pending); | |
p->it_real_value = p->it_virt_value = p->it_prof_value = 0; | |
p->it_real_incr = p->it_virt_incr = p->it_prof_incr = 0; | |
init_timer(&p->real_timer); | |
p->real_timer.data = (unsigned long) p; | |
p->leader = 0; /* session leadership doesn't inherit */ | |
p->tty_old_pgrp = 0; | |
p->times.tms_utime = p->times.tms_stime = 0; | |
p->times.tms_cutime = p->times.tms_cstime = 0; | |
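/* Lottery-scheduler fields used by sched.c (opsyspolisi == 2): every new
* task starts with 5 tickets, and cpustart holds a jiffies*10 timestamp
* that the scheduler later uses when adjusting the ticket count. */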
p->tn=5; | |
p->cpustart=jiffies*10; | |
#ifdef CONFIG_SMP | |
{ | |
int i; | |
p->cpus_runnable = ~0UL; | |
p->processor = current->processor; | |
/* ?? should we just memset this ?? */ | |
for(i = 0; i < smp_num_cpus; i++) | |
p->per_cpu_utime[i] = p->per_cpu_stime[i] = 0; | |
spin_lock_init(&p->sigmask_lock); | |
} | |
#endif | |
p->lock_depth = -1; /* -1 = no lock */ | |
p->start_time = jiffies; | |
INIT_LIST_HEAD(&p->local_pages); | |
retval = -ENOMEM; | |
/* copy all the process information */ | |
if (copy_files(clone_flags, p)) | |
goto bad_fork_cleanup; | |
if (copy_fs(clone_flags, p)) | |
goto bad_fork_cleanup_files; | |
if (copy_sighand(clone_flags, p)) | |
goto bad_fork_cleanup_fs; | |
if (copy_mm(clone_flags, p)) | |
goto bad_fork_cleanup_sighand; | |
if (copy_namespace(clone_flags, p)) | |
goto bad_fork_cleanup_mm; | |
retval = copy_thread(0, clone_flags, stack_start, stack_size, p, regs); | |
if (retval) | |
goto bad_fork_cleanup_namespace; | |
p->semundo = NULL; | |
/* Our parent execution domain becomes current domain | |
These must match for thread signalling to apply */ | |
p->parent_exec_id = p->self_exec_id; | |
/* ok, now we should be set up.. */ | |
p->swappable = 1; | |
p->exit_signal = clone_flags & CSIGNAL; | |
p->pdeath_signal = 0; | |
/* | |
* "share" dynamic priority between parent and child, thus the | |
* total amount of dynamic priorities in the system doesn't change, | |
* more scheduling fairness. This is only important in the first | |
* timeslice, on the long run the scheduling behaviour is unchanged. | |
*/ | |
p->counter = (current->counter + 1) >> 1; | |
current->counter >>= 1; | |
if (!current->counter) | |
current->need_resched = 1; | |
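/*
* A worked example of the split above: if the parent had counter == 7, the
* child gets (7 + 1) >> 1 == 4 ticks and the parent keeps 7 >> 1 == 3, so
* one timeslice's worth of ticks is divided between the two.
*/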
/* | |
* Ok, add it to the run-queues and make it | |
* visible to the rest of the system. | |
* | |
* Let it rip! | |
*/ | |
retval = p->pid; | |
p->tgid = retval; | |
INIT_LIST_HEAD(&p->thread_group); | |
/* Need tasklist lock for parent etc handling! */ | |
write_lock_irq(&tasklist_lock); | |
/* CLONE_PARENT re-uses the old parent */ | |
p->p_opptr = current->p_opptr; | |
p->p_pptr = current->p_pptr; | |
if (!(clone_flags & CLONE_PARENT)) { | |
p->p_opptr = current; | |
if (!(p->ptrace & PT_PTRACED)) | |
p->p_pptr = current; | |
} | |
if (clone_flags & CLONE_THREAD) { | |
p->tgid = current->tgid; | |
list_add(&p->thread_group, ¤t->thread_group); | |
} | |
SET_LINKS(p); | |
hash_pid(p); | |
nr_threads++; | |
write_unlock_irq(&tasklist_lock); | |
if (p->ptrace & PT_PTRACED) | |
send_sig(SIGSTOP, p, 1); | |
wake_up_process(p); /* do this last */ | |
++total_forks; | |
if (clone_flags & CLONE_VFORK) | |
wait_for_completion(&vfork); | |
fork_out: | |
return retval; | |
bad_fork_cleanup_namespace: | |
exit_namespace(p); | |
bad_fork_cleanup_mm: | |
exit_mm(p); | |
bad_fork_cleanup_sighand: | |
exit_sighand(p); | |
bad_fork_cleanup_fs: | |
exit_fs(p); /* blocking */ | |
bad_fork_cleanup_files: | |
exit_files(p); /* blocking */ | |
bad_fork_cleanup: | |
put_exec_domain(p->exec_domain); | |
if (p->binfmt && p->binfmt->module) | |
__MOD_DEC_USE_COUNT(p->binfmt->module); | |
bad_fork_cleanup_count: | |
atomic_dec(&p->user->processes); | |
free_uid(p->user); | |
bad_fork_free: | |
free_task_struct(p); | |
goto fork_out; | |
} | |
/* SLAB cache for signal_struct structures (tsk->sig) */ | |
kmem_cache_t *sigact_cachep; | |
/* SLAB cache for files_struct structures (tsk->files) */ | |
kmem_cache_t *files_cachep; | |
/* SLAB cache for fs_struct structures (tsk->fs) */ | |
kmem_cache_t *fs_cachep; | |
/* SLAB cache for vm_area_struct structures */ | |
kmem_cache_t *vm_area_cachep; | |
/* SLAB cache for mm_struct structures (tsk->mm) */ | |
kmem_cache_t *mm_cachep; | |
void __init proc_caches_init(void) | |
{ | |
sigact_cachep = kmem_cache_create("signal_act", | |
sizeof(struct signal_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if (!sigact_cachep) | |
panic("Cannot create signal action SLAB cache"); | |
files_cachep = kmem_cache_create("files_cache", | |
sizeof(struct files_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if (!files_cachep) | |
panic("Cannot create files SLAB cache"); | |
fs_cachep = kmem_cache_create("fs_cache", | |
sizeof(struct fs_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if (!fs_cachep) | |
panic("Cannot create fs_struct SLAB cache"); | |
vm_area_cachep = kmem_cache_create("vm_area_struct", | |
sizeof(struct vm_area_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if(!vm_area_cachep) | |
panic("vma_init: Cannot alloc vm_area_struct SLAB cache"); | |
mm_cachep = kmem_cache_create("mm_struct", | |
sizeof(struct mm_struct), 0, | |
SLAB_HWCACHE_ALIGN, NULL, NULL); | |
if(!mm_cachep) | |
panic("vma_init: Cannot alloc mm_struct SLAB cache"); | |
} |
/* | |
* linux/kernel/sched.c | |
* | |
* Kernel scheduler and related syscalls | |
* | |
* Copyright (C) 1991, 1992 Linus Torvalds | |
* | |
* 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and | |
* make semaphores SMP safe | |
* 1998-11-19 Implemented schedule_timeout() and related stuff | |
* by Andrea Arcangeli | |
* 1998-12-28 Implemented better SMP scheduling by Ingo Molnar | |
*/ | |
/* | |
* 'sched.c' is the main kernel file. It contains scheduling primitives | |
* (sleep_on, wakeup, schedule etc) as well as a number of simple system | |
* call functions (type getpid()), which just extract a field from | |
* current-task | |
*/ | |
#include <linux/config.h> | |
#include <linux/mm.h> | |
#include <linux/init.h> | |
#include <linux/smp_lock.h> | |
#include <linux/nmi.h> | |
#include <linux/interrupt.h> | |
#include <linux/kernel_stat.h> | |
#include <linux/completion.h> | |
#include <linux/prefetch.h> | |
#include <linux/compiler.h>
#include <linux/random.h> /* for get_random_bytes(), used by the lottery path below */
#include <asm/uaccess.h> | |
#include <asm/mmu_context.h> | |
extern void timer_bh(void); | |
extern void tqueue_bh(void); | |
extern void immediate_bh(void); | |
extern int opsyspolisi; | |
/* | |
* scheduler variables | |
*/ | |
unsigned securebits = SECUREBITS_DEFAULT; /* systemwide security settings */ | |
extern void mem_use(void); | |
/* | |
* Scheduling quanta. | |
* | |
* NOTE! The unix "nice" value influences how long a process | |
* gets. The nice value ranges from -20 to +19, where a -20 | |
* is a "high-priority" task, and a "+10" is a low-priority | |
* task. | |
* | |
* We want the time-slice to be around 50ms or so, so this | |
* calculation depends on the value of HZ. | |
*/ | |
#if HZ < 200 | |
#define TICK_SCALE(x) ((x) >> 2) | |
#elif HZ < 400 | |
#define TICK_SCALE(x) ((x) >> 1) | |
#elif HZ < 800 | |
#define TICK_SCALE(x) (x) | |
#elif HZ < 1600 | |
#define TICK_SCALE(x) ((x) << 1) | |
#else | |
#define TICK_SCALE(x) ((x) << 2) | |
#endif | |
#define NICE_TO_TICKS(nice) (TICK_SCALE(20-(nice))+1) | |
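/*
* A worked example, assuming the common 2.4-era HZ == 100 (so TICK_SCALE(x)
* is (x) >> 2): NICE_TO_TICKS(0) = (20 >> 2) + 1 = 6 ticks = 60ms,
* NICE_TO_TICKS(-20) = (40 >> 2) + 1 = 11 ticks, and NICE_TO_TICKS(19) =
* (1 >> 2) + 1 = 1 tick, so the default timeslice sits close to the ~50ms
* target mentioned above.
*/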
/* | |
* Init task must be ok at boot for the ix86 as we will check its signals | |
* via the SMP irq return path. | |
*/ | |
struct task_struct * init_tasks[NR_CPUS] = {&init_task, }; | |
/* | |
* The tasklist_lock protects the linked list of processes. | |
* | |
* The runqueue_lock locks the parts that actually access | |
* and change the run-queues, and have to be interrupt-safe. | |
* | |
* If both locks are to be concurrently held, the runqueue_lock | |
* nests inside the tasklist_lock. | |
* | |
* task->alloc_lock nests inside tasklist_lock. | |
*/ | |
spinlock_t runqueue_lock __cacheline_aligned = SPIN_LOCK_UNLOCKED; /* inner */ | |
rwlock_t tasklist_lock __cacheline_aligned = RW_LOCK_UNLOCKED; /* outer */ | |
static LIST_HEAD(runqueue_head); | |
/* | |
* We align per-CPU scheduling data on cacheline boundaries, | |
* to prevent cacheline ping-pong. | |
*/ | |
static union { | |
struct schedule_data { | |
struct task_struct * curr; | |
cycles_t last_schedule; | |
} schedule_data; | |
char __pad [SMP_CACHE_BYTES]; | |
} aligned_data [NR_CPUS] __cacheline_aligned = { {{&init_task,0}}}; | |
#define cpu_curr(cpu) aligned_data[(cpu)].schedule_data.curr | |
#define last_schedule(cpu) aligned_data[(cpu)].schedule_data.last_schedule | |
struct kernel_stat kstat; | |
extern struct task_struct *child_reaper; | |
#ifdef CONFIG_SMP | |
#define idle_task(cpu) (init_tasks[cpu_number_map(cpu)]) | |
#define can_schedule(p,cpu) \ | |
((p)->cpus_runnable & (p)->cpus_allowed & (1 << cpu)) | |
#else | |
#define idle_task(cpu) (&init_task) | |
#define can_schedule(p,cpu) (1) | |
#endif | |
void scheduling_functions_start_here(void) { } | |
/* | |
* This is the function that decides how desirable a process is.. | |
* You can weigh different processes against each other depending | |
* on what CPU they've run on lately etc to try to handle cache | |
* and TLB miss penalties. | |
* | |
* Return values: | |
* -1000: never select this | |
* 0: out of time, recalculate counters (but it might still be | |
* selected) | |
* +ve: "goodness" value (the larger, the better) | |
* +1000: realtime process, select this. | |
*/ | |
static inline int goodness(struct task_struct * p, int this_cpu, struct mm_struct *this_mm) | |
{ | |
int weight; | |
/* | |
* select the current process after every other | |
* runnable process, but before the idle thread. | |
* Also, don't trigger a counter recalculation.
*/ | |
weight = -1; | |
if (p->policy & SCHED_YIELD) | |
goto out; | |
/* | |
* Non-RT process - normal case first. | |
*/ | |
if (p->policy == SCHED_OTHER) { | |
/* | |
* Give the process a first-approximation goodness value | |
* according to the number of clock-ticks it has left. | |
* | |
* Don't do any other calculations if the time slice is | |
* over.. | |
*/ | |
weight = p->counter; | |
if (!weight) | |
goto out; | |
#ifdef CONFIG_SMP | |
/* Give a largish advantage to the same processor... */ | |
/* (this is equivalent to penalizing other processors) */ | |
if (p->processor == this_cpu) | |
weight += PROC_CHANGE_PENALTY; | |
#endif | |
/* .. and a slight advantage to the current MM */ | |
if (p->mm == this_mm || !p->mm) | |
weight += 1; | |
weight += 20 - p->nice; | |
goto out; | |
} | |
/* | |
* Realtime process, select the first one on the | |
* runqueue (taking priorities within processes | |
* into account). | |
*/ | |
weight = 1000 + p->rt_priority; | |
out: | |
return weight; | |
} | |
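/*
* A worked example: a SCHED_OTHER task with counter == 6, nice == 0 and the
* same mm as the previous task scores 6 + 1 + (20 - 0) = 27 on UP (SMP adds
* PROC_CHANGE_PENALTY if it last ran on this CPU); an exhausted task with
* counter == 0 scores 0, and a realtime task with rt_priority == 50 scores
* 1000 + 50 = 1050.
*/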
/* | |
* the 'goodness value' of replacing a process on a given CPU. | |
* positive value means 'replace', zero or negative means 'dont'. | |
*/ | |
static inline int preemption_goodness(struct task_struct * prev, struct task_struct * p, int cpu) | |
{ | |
return goodness(p, cpu, prev->active_mm) - goodness(prev, cpu, prev->active_mm); | |
} | |
/* | |
* This is ugly, but reschedule_idle() is very timing-critical. | |
* We are called with the runqueue spinlock held and we must | |
* not claim the tasklist_lock. | |
*/ | |
static FASTCALL(void reschedule_idle(struct task_struct * p)); | |
static void reschedule_idle(struct task_struct * p) | |
{ | |
#ifdef CONFIG_SMP | |
int this_cpu = smp_processor_id(); | |
struct task_struct *tsk, *target_tsk; | |
int cpu, best_cpu, i, max_prio; | |
cycles_t oldest_idle; | |
/* | |
* shortcut if the woken up task's last CPU is | |
* idle now. | |
*/ | |
best_cpu = p->processor; | |
if (can_schedule(p, best_cpu)) { | |
tsk = idle_task(best_cpu); | |
if (cpu_curr(best_cpu) == tsk) { | |
int need_resched; | |
send_now_idle: | |
/* | |
* If need_resched == -1 then we can skip sending | |
* the IPI altogether, tsk->need_resched is | |
* actively watched by the idle thread. | |
*/ | |
need_resched = tsk->need_resched; | |
tsk->need_resched = 1; | |
if ((best_cpu != this_cpu) && !need_resched) | |
smp_send_reschedule(best_cpu); | |
return; | |
} | |
} | |
/* | |
* We know that the preferred CPU has a cache-affine current | |
* process, lets try to find a new idle CPU for the woken-up | |
* process. Select the least recently active idle CPU. (that | |
* one will have the least active cache context.) Also find | |
* the executing process which has the least priority. | |
*/ | |
oldest_idle = (cycles_t) -1; | |
target_tsk = NULL; | |
max_prio = 0; | |
for (i = 0; i < smp_num_cpus; i++) { | |
cpu = cpu_logical_map(i); | |
if (!can_schedule(p, cpu)) | |
continue; | |
tsk = cpu_curr(cpu); | |
/* | |
* We use the first available idle CPU. This creates | |
* a priority list between idle CPUs, but this is not | |
* a problem. | |
*/ | |
if (tsk == idle_task(cpu)) { | |
#if defined(__i386__) && defined(CONFIG_SMP) | |
/* | |
* Check if two siblings are idle in the same | |
* physical package. Use them if found. | |
*/ | |
if (smp_num_siblings == 2) { | |
if (cpu_curr(cpu_sibling_map[cpu]) == | |
idle_task(cpu_sibling_map[cpu])) { | |
oldest_idle = last_schedule(cpu); | |
target_tsk = tsk; | |
break; | |
} | |
} | |
#endif | |
if (last_schedule(cpu) < oldest_idle) { | |
oldest_idle = last_schedule(cpu); | |
target_tsk = tsk; | |
} | |
} else { | |
if (oldest_idle == -1ULL) { | |
int prio = preemption_goodness(tsk, p, cpu); | |
if (prio > max_prio) { | |
max_prio = prio; | |
target_tsk = tsk; | |
} | |
} | |
} | |
} | |
tsk = target_tsk; | |
if (tsk) { | |
if (oldest_idle != -1ULL) { | |
best_cpu = tsk->processor; | |
goto send_now_idle; | |
} | |
tsk->need_resched = 1; | |
if (tsk->processor != this_cpu) | |
smp_send_reschedule(tsk->processor); | |
} | |
return; | |
#else /* UP */ | |
int this_cpu = smp_processor_id(); | |
struct task_struct *tsk; | |
tsk = cpu_curr(this_cpu); | |
if (preemption_goodness(tsk, p, this_cpu) > 0) | |
tsk->need_resched = 1; | |
#endif | |
} | |
/* | |
* Careful! | |
* | |
* This has to add the process to the _end_ of the | |
* run-queue, not the beginning. The goodness value will | |
* determine whether this process will run next. This is | |
* important to get SCHED_FIFO and SCHED_RR right, where | |
* a process that is either pre-empted or its time slice | |
* has expired, should be moved to the tail of the run | |
* queue for its priority - Bhavesh Davda | |
*/ | |
static inline void add_to_runqueue(struct task_struct * p) | |
{ | |
list_add_tail(&p->run_list, &runqueue_head); | |
nr_running++; | |
} | |
static inline void move_last_runqueue(struct task_struct * p) | |
{ | |
list_del(&p->run_list); | |
list_add_tail(&p->run_list, &runqueue_head); | |
} | |
/* | |
* Wake up a process. Put it on the run-queue if it's not | |
* already there. The "current" process is always on the | |
* run-queue (except when the actual re-schedule is in | |
* progress), and as such you're allowed to do the simpler | |
* "current->state = TASK_RUNNING" to mark yourself runnable | |
* without the overhead of this. | |
*/ | |
static inline int try_to_wake_up(struct task_struct * p, int synchronous) | |
{ | |
unsigned long flags; | |
int success = 0; | |
/* | |
* We want the common case fall through straight, thus the goto. | |
*/ | |
spin_lock_irqsave(&runqueue_lock, flags); | |
p->state = TASK_RUNNING; | |
if (task_on_runqueue(p)) | |
goto out; | |
add_to_runqueue(p); | |
if (!synchronous || !(p->cpus_allowed & (1 << smp_processor_id()))) | |
reschedule_idle(p); | |
success = 1; | |
out: | |
spin_unlock_irqrestore(&runqueue_lock, flags); | |
return success; | |
} | |
inline int wake_up_process(struct task_struct * p) | |
{ | |
return try_to_wake_up(p, 0); | |
} | |
static void process_timeout(unsigned long __data) | |
{ | |
struct task_struct * p = (struct task_struct *) __data; | |
wake_up_process(p); | |
} | |
/** | |
* schedule_timeout - sleep until timeout | |
* @timeout: timeout value in jiffies | |
* | |
* Make the current task sleep until @timeout jiffies have | |
* elapsed. The routine will return immediately unless | |
* the current task state has been set (see set_current_state()). | |
* | |
* You can set the task state as follows - | |
* | |
* %TASK_UNINTERRUPTIBLE - at least @timeout jiffies are guaranteed to | |
* pass before the routine returns. The routine will return 0 | |
* | |
* %TASK_INTERRUPTIBLE - the routine may return early if a signal is | |
* delivered to the current task. In this case the remaining time | |
* in jiffies will be returned, or 0 if the timer expired in time | |
* | |
* The current task state is guaranteed to be TASK_RUNNING when this | |
* routine returns. | |
* | |
* Specifying a @timeout value of %MAX_SCHEDULE_TIMEOUT will schedule | |
* the CPU away without a bound on the timeout. In this case the return | |
* value will be %MAX_SCHEDULE_TIMEOUT. | |
* | |
* In all cases the return value is guaranteed to be non-negative. | |
*/ | |
signed long schedule_timeout(signed long timeout) | |
{ | |
struct timer_list timer; | |
unsigned long expire; | |
switch (timeout) | |
{ | |
case MAX_SCHEDULE_TIMEOUT: | |
/* | |
* These two special cases are useful to be comfortable | |
* in the caller. Nothing more. We could take | |
* MAX_SCHEDULE_TIMEOUT from one of the negative values,
* but I'd like to return a valid offset (>=0) to allow
* the caller to do everything it wants with the retval.
*/ | |
schedule(); | |
goto out; | |
default: | |
/* | |
* Another bit of PARANOID. Note that the retval will be | |
* 0 since no piece of kernel is supposed to do a check | |
* for a negative retval of schedule_timeout() (since it | |
* should never happen anyway). You just have the printk()
* that will tell you if something has gone wrong and where.
*/ | |
if (timeout < 0) | |
{ | |
printk(KERN_ERR "schedule_timeout: wrong timeout " | |
"value %lx from %p\n", timeout, | |
__builtin_return_address(0)); | |
current->state = TASK_RUNNING; | |
goto out; | |
} | |
} | |
expire = timeout + jiffies; | |
init_timer(&timer); | |
timer.expires = expire; | |
timer.data = (unsigned long) current; | |
timer.function = process_timeout; | |
add_timer(&timer); | |
schedule(); | |
del_timer_sync(&timer); | |
timeout = expire - jiffies; | |
out: | |
return timeout < 0 ? 0 : timeout; | |
} | |
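/*
* Typical usage, per the kernel-doc above: sleep for up to one second but
* wake early if a signal arrives:
*
*	set_current_state(TASK_INTERRUPTIBLE);
*	remaining = schedule_timeout(HZ);
*
* where "remaining" is the unslept jiffies, or 0 if the full second passed.
*/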
/* | |
* schedule_tail() is getting called from the fork return path. This | |
* cleans up all remaining scheduler things, without impacting the | |
* common case. | |
*/ | |
static inline void __schedule_tail(struct task_struct *prev) | |
{ | |
#ifdef CONFIG_SMP | |
int policy; | |
/* | |
* prev->policy can be written from here only before `prev' | |
* can be scheduled (before setting prev->cpus_runnable to ~0UL). | |
* Of course it must also be read before allowing prev | |
* to be rescheduled, but since the write depends on the read | |
* to complete, wmb() is enough. (the spin_lock() acquired | |
* before setting cpus_runnable is not enough because the spin_lock() | |
* common code semantics allows code outside the critical section | |
* to enter inside the critical section) | |
*/ | |
policy = prev->policy; | |
prev->policy = policy & ~SCHED_YIELD; | |
wmb(); | |
/* | |
* fast path falls through. We have to clear cpus_runnable before | |
* checking prev->state to avoid a wakeup race. Protect against | |
* the task exiting early. | |
*/ | |
task_lock(prev); | |
task_release_cpu(prev); | |
mb(); | |
if (prev->state == TASK_RUNNING) | |
goto needs_resched; | |
out_unlock: | |
task_unlock(prev); /* Synchronise here with release_task() if prev is TASK_ZOMBIE */ | |
return; | |
/* | |
* Slow path - we 'push' the previous process and | |
* reschedule_idle() will attempt to find a new | |
* processor for it. (but it might preempt the | |
* current process as well.) We must take the runqueue | |
* lock and re-check prev->state to be correct. It might | |
* still happen that this process has a preemption | |
* 'in progress' already - but this is not a problem and | |
* might happen in other circumstances as well. | |
*/ | |
needs_resched: | |
{ | |
unsigned long flags; | |
/* | |
* Avoid taking the runqueue lock in cases where | |
* no preemption-check is necessary:
*/ | |
if ((prev == idle_task(smp_processor_id())) || | |
(policy & SCHED_YIELD)) | |
goto out_unlock; | |
spin_lock_irqsave(&runqueue_lock, flags); | |
if ((prev->state == TASK_RUNNING) && !task_has_cpu(prev)) | |
reschedule_idle(prev); | |
spin_unlock_irqrestore(&runqueue_lock, flags); | |
goto out_unlock; | |
} | |
#else | |
prev->policy &= ~SCHED_YIELD; | |
#endif /* CONFIG_SMP */ | |
} | |
asmlinkage void schedule_tail(struct task_struct *prev) | |
{ | |
__schedule_tail(prev); | |
} | |
/* | |
* 'schedule()' is the scheduler function. It's a very simple and nice | |
* scheduler: it's not perfect, but certainly works for most things. | |
* | |
* The goto is "interesting". | |
* | |
* NOTE!! Task 0 is the 'idle' task, which gets called when no other | |
* tasks can run. It can not be killed, and it cannot sleep. The 'state' | |
* information in task[0] is never used. | |
*/ | |
asmlinkage void schedule(void) | |
{ | |
struct schedule_data * sched_data; | |
struct task_struct *prev, *next, *p; | |
struct list_head *tmp; | |
int this_cpu, c; | |
spin_lock_prefetch(&runqueue_lock); | |
BUG_ON(!current->active_mm); | |
need_resched_back: | |
prev = current; | |
this_cpu = prev->processor; | |
if (unlikely(in_interrupt())) { | |
printk("Scheduling in interrupt\n"); | |
BUG(); | |
} | |
release_kernel_lock(prev, this_cpu); | |
/* | |
* 'sched_data' is protected by the fact that we can run | |
* only one process per CPU. | |
*/ | |
sched_data = & aligned_data[this_cpu].schedule_data; | |
spin_lock_irq(&runqueue_lock); | |
/* move an exhausted RR process to be last.. */ | |
if (unlikely(prev->policy == SCHED_RR)) | |
if (!prev->counter) { | |
prev->counter = NICE_TO_TICKS(prev->nice); | |
move_last_runqueue(prev); | |
} | |
switch (prev->state) { | |
case TASK_INTERRUPTIBLE: | |
if (signal_pending(prev)) { | |
prev->state = TASK_RUNNING; | |
break; | |
} | |
default: | |
del_from_runqueue(prev); | |
case TASK_RUNNING:; | |
} | |
prev->need_resched = 0; | |
/* | |
* this is the scheduler proper: | |
*/ | |
repeat_schedule: | |
if(opsyspolisi==1) | |
{ | |
/* | |
* Default process to select.. | |
*/ | |
next = idle_task(this_cpu); | |
c = -1000; | |
list_for_each(tmp, &runqueue_head) | |
{ | |
p = list_entry(tmp, struct task_struct, run_list); | |
if (can_schedule(p, this_cpu)) | |
{ | |
int weight = goodness(p, this_cpu, prev->active_mm); | |
if (weight > c) | |
c = weight, next = p; | |
} | |
} | |
/* Do we need to re-calculate counters? */ | |
if (unlikely(!c)) { | |
struct task_struct *p; | |
spin_unlock_irq(&runqueue_lock); | |
read_lock(&tasklist_lock); | |
for_each_task(p) | |
p->counter = (p->counter >> 1) + NICE_TO_TICKS(p->nice); | |
read_unlock(&tasklist_lock); | |
spin_lock_irq(&runqueue_lock); | |
goto repeat_schedule; | |
} | |
} | |
else if(opsyspolisi==2)
{
/*
* Lottery scheduling: every schedulable task on the runqueue holds
* p->tn tickets. Count the tickets, draw a random winning ticket,
* then walk the runqueue until the cumulative ticket count passes
* the draw; that task runs next. If no task holds any tickets, the
* idle task remains selected.
*/
unsigned int searchbit=0;
unsigned int rndmbit=0;
unsigned int randomtn=0;
unsigned int ttn=0;
next = idle_task(this_cpu);
list_for_each(tmp, &runqueue_head)
{
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu))
{
ttn+=p->tn;
}
}
if(ttn)
{
get_random_bytes(&rndmbit,sizeof(unsigned int));
randomtn=rndmbit % ttn;
list_for_each(tmp, &runqueue_head)
{
p = list_entry(tmp, struct task_struct, run_list);
if (can_schedule(p, this_cpu))
{
searchbit+=p->tn;
if(searchbit>randomtn)
{
next=p;
break;
}
}
}
}
/*
* Recalculate tickets: if fewer than 20 units of (jiffies*10) have
* passed since prev->cpustart, drop one ticket (minimum 1); if more
* than 200 have passed, add one (maximum 9).
*/
if((jiffies*10)-prev->cpustart<20)
{
if(prev->tn>1)
{
prev->tn=prev->tn-1;
prev->cpustart=0;
}
}
else if((jiffies*10)-prev->cpustart>200)
{
if(prev->tn<9)
{
prev->tn=prev->tn+1;
prev->cpustart=0;
}
}
}
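/*
* A worked example of the draw above: with three runnable tasks holding
* 5, 3 and 2 tickets, ttn == 10 and randomtn lies in [0, 10). Walking the
* runqueue, the first task wins for draws 0-4, the second for 5-7 and the
* third for 8-9, so each task is selected with probability proportional
* to its ticket count. The adjustment step then moves prev->tn up or down
* within 1..9 depending on the elapsed (jiffies*10 - cpustart) interval.
*/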
/* | |
* from this point on nothing can prevent us from | |
* switching to the next task, save this fact in | |
* sched_data. | |
*/ | |
sched_data->curr = next; | |
task_set_cpu(next, this_cpu); | |
spin_unlock_irq(&runqueue_lock); | |
if (unlikely(prev == next)) { | |
/* We won't go through the normal tail, so do this by hand */ | |
prev->policy &= ~SCHED_YIELD; | |
goto same_process; | |
} | |
#ifdef CONFIG_SMP | |
/* | |
* maintain the per-process 'last schedule' value. | |
* (this has to be recalculated even if we reschedule to | |
* the same process) Currently this is only used on SMP, | |
* and it's approximate, so we do not have to maintain | |
* it while holding the runqueue spinlock. | |
*/ | |
sched_data->last_schedule = get_cycles(); | |
/* | |
* We drop the scheduler lock early (it's a global spinlock), | |
* thus we have to lock the previous process from getting | |
* rescheduled during switch_to(). | |
*/ | |
#endif /* CONFIG_SMP */ | |
kstat.context_swtch++; | |
/* | |
* there are 3 processes which are affected by a context switch: | |
* | |
* prev == .... ==> (last => next) | |
* | |
* It's the 'much more previous' 'prev' that is on next's stack, | |
* but prev is set to (the just run) 'last' process by switch_to(). | |
* This might sound slightly confusing but makes tons of sense. | |
*/ | |
prepare_to_switch(); | |
{ | |
struct mm_struct *mm = next->mm; | |
struct mm_struct *oldmm = prev->active_mm; | |
if (!mm) { | |
BUG_ON(next->active_mm); | |
next->active_mm = oldmm; | |
atomic_inc(&oldmm->mm_count); | |
enter_lazy_tlb(oldmm, next, this_cpu); | |
} else { | |
BUG_ON(next->active_mm != mm); | |
switch_mm(oldmm, mm, next, this_cpu); | |
} | |
if (!prev->mm) { | |
prev->active_mm = NULL; | |
mmdrop(oldmm); | |
} | |
} | |
/* | |
* This just switches the register state and the | |
* stack. | |
*/ | |
switch_to(prev, next, prev); | |
__schedule_tail(prev); | |
same_process: | |
reacquire_kernel_lock(current); | |
if (current->need_resched) | |
goto need_resched_back; | |
return; | |
} | |
/* | |
* The core wakeup function. Non-exclusive wakeups (nr_exclusive == 0) just wake everything | |
* up. If it's an exclusive wakeup (nr_exclusive == small +ve number) then we wake all the | |
* non-exclusive tasks and one exclusive task. | |
* | |
* There are circumstances in which we can try to wake a task which has already | |
* started to run but is not in state TASK_RUNNING. try_to_wake_up() returns zero | |
* in this (rare) case, and we handle it by continuing to scan the queue.
*/ | |
static inline void __wake_up_common (wait_queue_head_t *q, unsigned int mode, | |
int nr_exclusive, const int sync) | |
{ | |
struct list_head *tmp; | |
struct task_struct *p; | |
CHECK_MAGIC_WQHEAD(q); | |
WQ_CHECK_LIST_HEAD(&q->task_list); | |
list_for_each(tmp,&q->task_list) { | |
unsigned int state; | |
wait_queue_t *curr = list_entry(tmp, wait_queue_t, task_list); | |
CHECK_MAGIC(curr->__magic); | |
p = curr->task; | |
state = p->state; | |
if (state & mode) { | |
WQ_NOTE_WAKER(curr); | |
if (try_to_wake_up(p, sync) && (curr->flags&WQ_FLAG_EXCLUSIVE) && !--nr_exclusive) | |
break; | |
} | |
} | |
} | |
void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr) | |
{ | |
if (q) { | |
unsigned long flags; | |
wq_read_lock_irqsave(&q->lock, flags); | |
__wake_up_common(q, mode, nr, 0); | |
wq_read_unlock_irqrestore(&q->lock, flags); | |
} | |
} | |
void __wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr) | |
{ | |
if (q) { | |
unsigned long flags; | |
wq_read_lock_irqsave(&q->lock, flags); | |
__wake_up_common(q, mode, nr, 1); | |
wq_read_unlock_irqrestore(&q->lock, flags); | |
} | |
} | |
void complete(struct completion *x) | |
{ | |
unsigned long flags; | |
spin_lock_irqsave(&x->wait.lock, flags); | |
x->done++; | |
__wake_up_common(&x->wait, TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1, 0); | |
spin_unlock_irqrestore(&x->wait.lock, flags); | |
} | |
void wait_for_completion(struct completion *x) | |
{ | |
spin_lock_irq(&x->wait.lock); | |
if (!x->done) { | |
DECLARE_WAITQUEUE(wait, current); | |
wait.flags |= WQ_FLAG_EXCLUSIVE; | |
__add_wait_queue_tail(&x->wait, &wait); | |
do { | |
__set_current_state(TASK_UNINTERRUPTIBLE); | |
spin_unlock_irq(&x->wait.lock); | |
schedule(); | |
spin_lock_irq(&x->wait.lock); | |
} while (!x->done); | |
__remove_wait_queue(&x->wait, &wait); | |
} | |
x->done--; | |
spin_unlock_irq(&x->wait.lock); | |
} | |
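/*
* Usage pattern from this tree: for CLONE_VFORK, do_fork() above sets
* p->vfork_done = &vfork, does init_completion(&vfork) and then blocks in
* wait_for_completion(&vfork); the child signals it via complete(vfork_done)
* in mm_release() once it is done with the shared mm.
*/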
#define SLEEP_ON_VAR \ | |
unsigned long flags; \ | |
wait_queue_t wait; \ | |
init_waitqueue_entry(&wait, current); | |
#define SLEEP_ON_HEAD \ | |
wq_write_lock_irqsave(&q->lock,flags); \ | |
__add_wait_queue(q, &wait); \ | |
wq_write_unlock(&q->lock); | |
#define SLEEP_ON_TAIL \ | |
wq_write_lock_irq(&q->lock); \ | |
__remove_wait_queue(q, &wait); \ | |
wq_write_unlock_irqrestore(&q->lock,flags); | |
void interruptible_sleep_on(wait_queue_head_t *q) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_INTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
schedule(); | |
SLEEP_ON_TAIL | |
} | |
long interruptible_sleep_on_timeout(wait_queue_head_t *q, long timeout) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_INTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
timeout = schedule_timeout(timeout); | |
SLEEP_ON_TAIL | |
return timeout; | |
} | |
void sleep_on(wait_queue_head_t *q) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_UNINTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
schedule(); | |
SLEEP_ON_TAIL | |
} | |
long sleep_on_timeout(wait_queue_head_t *q, long timeout) | |
{ | |
SLEEP_ON_VAR | |
current->state = TASK_UNINTERRUPTIBLE; | |
SLEEP_ON_HEAD | |
timeout = schedule_timeout(timeout); | |
SLEEP_ON_TAIL | |
return timeout; | |
} | |
void scheduling_functions_end_here(void) { } | |
#ifndef __alpha__ | |
/* | |
* This has been replaced by sys_setpriority. Maybe it should be | |
* moved into the arch dependent tree for those ports that require | |
* it for backward compatibility? | |
*/ | |
asmlinkage long sys_nice(int increment) | |
{ | |
long newprio; | |
/* | |
* Setpriority might change our priority at the same moment. | |
* We don't have to worry. Conceptually one call occurs first | |
* and we have a single winner. | |
*/ | |
if (increment < 0) { | |
if (!capable(CAP_SYS_NICE)) | |
return -EPERM; | |
if (increment < -40) | |
increment = -40; | |
} | |
if (increment > 40) | |
increment = 40; | |
newprio = current->nice + increment; | |
if (newprio < -20) | |
newprio = -20; | |
if (newprio > 19) | |
newprio = 19; | |
current->nice = newprio; | |
return 0; | |
} | |
#endif | |
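/*
* Example of the clamping in sys_nice() above: a task at nice 10 calling
* sys_nice(-50) needs CAP_SYS_NICE; the increment is first limited to -40,
* newprio becomes 10 - 40 = -30, and that is then clamped to the valid
* minimum of -20.
*/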
static inline struct task_struct *find_process_by_pid(pid_t pid) | |
{ | |
struct task_struct *tsk = current; | |
if (pid) | |
tsk = find_task_by_pid(pid); | |
return tsk; | |
} | |
static int setscheduler(pid_t pid, int policy, | |
struct sched_param *param) | |
{ | |
struct sched_param lp; | |
struct task_struct *p; | |
int retval; | |
retval = -EINVAL; | |
if (!param || pid < 0) | |
goto out_nounlock; | |
retval = -EFAULT; | |
if (copy_from_user(&lp, param, sizeof(struct sched_param))) | |
goto out_nounlock; | |
/* | |
* We play safe to avoid deadlocks. | |
*/ | |
read_lock_irq(&tasklist_lock); | |
spin_lock(&runqueue_lock); | |
p = find_process_by_pid(pid); | |
retval = -ESRCH; | |
if (!p) | |
goto out_unlock; | |
if (policy < 0) | |
policy = p->policy; | |
else { | |
retval = -EINVAL; | |
if (policy != SCHED_FIFO && policy != SCHED_RR && | |
policy != SCHED_OTHER) | |
goto out_unlock; | |
} | |
/* | |
* Valid priorities for SCHED_FIFO and SCHED_RR are 1..99, valid | |
* priority for SCHED_OTHER is 0. | |
*/ | |
retval = -EINVAL; | |
if (lp.sched_priority < 0 || lp.sched_priority > 99) | |
goto out_unlock; | |
if ((policy == SCHED_OTHER) != (lp.sched_priority == 0)) | |
goto out_unlock; | |
retval = -EPERM; | |
if ((policy == SCHED_FIFO || policy == SCHED_RR) && | |
!capable(CAP_SYS_NICE)) | |
goto out_unlock; | |
if ((current->euid != p->euid) && (current->euid != p->uid) && | |
!capable(CAP_SYS_NICE)) | |
goto out_unlock; | |
retval = 0; | |
p->policy = policy; | |
p->rt_priority = lp.sched_priority; | |
current->need_resched = 1; | |
out_unlock: | |
spin_unlock(&runqueue_lock); | |
read_unlock_irq(&tasklist_lock); | |
out_nounlock: | |
return retval; | |
} | |
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy, | |
struct sched_param *param) | |
{ | |
return setscheduler(pid, policy, param); | |
} | |
asmlinkage long sys_sched_setparam(pid_t pid, struct sched_param *param) | |
{ | |
return setscheduler(pid, -1, param); | |
} | |
asmlinkage long sys_sched_getscheduler(pid_t pid) | |
{ | |
struct task_struct *p; | |
int retval; | |
retval = -EINVAL; | |
if (pid < 0) | |
goto out_nounlock; | |
retval = -ESRCH; | |
read_lock(&tasklist_lock); | |
p = find_process_by_pid(pid); | |
if (p) | |
retval = p->policy & ~SCHED_YIELD; | |
read_unlock(&tasklist_lock); | |
out_nounlock: | |
return retval; | |
} | |
asmlinkage long sys_sched_getparam(pid_t pid, struct sched_param *param) | |
{ | |
struct task_struct *p; | |
struct sched_param lp; | |
int retval; | |
retval = -EINVAL; | |
if (!param || pid < 0) | |
goto out_nounlock; | |
read_lock(&tasklist_lock); | |
p = find_process_by_pid(pid); | |
retval = -ESRCH; | |
if (!p) | |
goto out_unlock; | |
lp.sched_priority = p->rt_priority; | |
read_unlock(&tasklist_lock); | |
/* | |
* This one might sleep, we cannot do it with a spinlock held ... | |
*/ | |
retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; | |
out_nounlock: | |
return retval; | |
out_unlock: | |
read_unlock(&tasklist_lock); | |
return retval; | |
} | |
asmlinkage long sys_sched_yield(void) | |
{ | |
/* | |
* Trick. sched_yield() first counts the number of truly | |
* 'pending' runnable processes, then returns if it's | |
* only the current process. (This test does not have
* to be atomic.) In threaded applications this optimization | |
* gets triggered quite often. | |
*/ | |
int nr_pending = nr_running; | |
#if CONFIG_SMP | |
int i; | |
// Subtract non-idle processes running on other CPUs. | |
for (i = 0; i < smp_num_cpus; i++) { | |
int cpu = cpu_logical_map(i); | |
if (aligned_data[cpu].schedule_data.curr != idle_task(cpu)) | |
nr_pending--; | |
} | |
#else | |
// on UP this process is on the runqueue as well | |
nr_pending--; | |
#endif | |
if (nr_pending) { | |
/* | |
* This process can only be rescheduled by us, | |
* so this is safe without any locking. | |
*/ | |
if (current->policy == SCHED_OTHER) | |
current->policy |= SCHED_YIELD; | |
current->need_resched = 1; | |
spin_lock_irq(&runqueue_lock); | |
move_last_runqueue(current); | |
spin_unlock_irq(&runqueue_lock); | |
} | |
return 0; | |
} | |
/** | |
* yield - yield the current processor to other threads. | |
* | |
* this is a shortcut for kernel-space yielding - it marks the | |
* thread runnable and calls sys_sched_yield(). | |
*/ | |
void yield(void) | |
{ | |
set_current_state(TASK_RUNNING); | |
sys_sched_yield(); | |
schedule(); | |
} | |
void __cond_resched(void) | |
{ | |
set_current_state(TASK_RUNNING); | |
schedule(); | |
} | |
asmlinkage long sys_sched_get_priority_max(int policy) | |
{ | |
int ret = -EINVAL; | |
switch (policy) { | |
case SCHED_FIFO: | |
case SCHED_RR: | |
ret = 99; | |
break; | |
case SCHED_OTHER: | |
ret = 0; | |
break; | |
} | |
return ret; | |
} | |
asmlinkage long sys_sched_get_priority_min(int policy) | |
{ | |
int ret = -EINVAL; | |
switch (policy) { | |
case SCHED_FIFO: | |
case SCHED_RR: | |
ret = 1; | |
break; | |
case SCHED_OTHER: | |
ret = 0; | |
} | |
return ret; | |
} | |
asmlinkage long sys_sched_rr_get_interval(pid_t pid, struct timespec *interval) | |
{ | |
struct timespec t; | |
struct task_struct *p; | |
int retval = -EINVAL; | |
if (pid < 0) | |
goto out_nounlock; | |
retval = -ESRCH; | |
read_lock(&tasklist_lock); | |
p = find_process_by_pid(pid); | |
if (p) | |
jiffies_to_timespec(p->policy & SCHED_FIFO ? 0 : NICE_TO_TICKS(p->nice), | |
&t); | |
read_unlock(&tasklist_lock); | |
if (p) | |
retval = copy_to_user(interval, &t, sizeof(t)) ? -EFAULT : 0; | |
out_nounlock: | |
return retval; | |
} | |
static void show_task(struct task_struct * p) | |
{ | |
unsigned long free = 0; | |
int state; | |
static const char * stat_nam[] = { "R", "S", "D", "Z", "T", "W" }; | |
printk("%-13.13s ", p->comm); | |
state = p->state ? ffz(~p->state) + 1 : 0; | |
if (((unsigned) state) < sizeof(stat_nam)/sizeof(char *)) | |
printk(stat_nam[state]); | |
else | |
printk(" "); | |
#if (BITS_PER_LONG == 32) | |
if (p == current) | |
printk(" current "); | |
else | |
printk(" %08lX ", thread_saved_pc(&p->thread)); | |
#else | |
if (p == current) | |
printk(" current task "); | |
else | |
printk(" %016lx ", thread_saved_pc(&p->thread)); | |
#endif | |
{ | |
unsigned long * n = (unsigned long *) (p+1); | |
while (!*n) | |
n++; | |
free = (unsigned long) n - (unsigned long)(p+1); | |
} | |
printk("%5lu %5d %6d ", free, p->pid, p->p_pptr->pid); | |
if (p->p_cptr) | |
printk("%5d ", p->p_cptr->pid); | |
else | |
printk(" "); | |
if (p->p_ysptr) | |
printk("%7d", p->p_ysptr->pid); | |
else | |
printk(" "); | |
if (p->p_osptr) | |
printk(" %5d", p->p_osptr->pid); | |
else | |
printk(" "); | |
if (!p->mm) | |
printk(" (L-TLB)\n"); | |
else | |
printk(" (NOTLB)\n"); | |
{ | |
extern void show_trace_task(struct task_struct *tsk); | |
show_trace_task(p); | |
} | |
} | |
char * render_sigset_t(sigset_t *set, char *buffer) | |
{ | |
int i = _NSIG, x; | |
do { | |
i -= 4, x = 0; | |
if (sigismember(set, i+1)) x |= 1; | |
if (sigismember(set, i+2)) x |= 2; | |
if (sigismember(set, i+3)) x |= 4; | |
if (sigismember(set, i+4)) x |= 8; | |
*buffer++ = (x < 10 ? '0' : 'a' - 10) + x; | |
} while (i >= 4); | |
*buffer = 0; | |
return buffer; | |
} | |
void show_state(void) | |
{ | |
struct task_struct *p; | |
#if (BITS_PER_LONG == 32) | |
printk("\n" | |
" free sibling\n"); | |
printk(" task PC stack pid father child younger older\n"); | |
#else | |
printk("\n" | |
" free sibling\n"); | |
printk(" task PC stack pid father child younger older\n"); | |
#endif | |
read_lock(&tasklist_lock); | |
for_each_task(p) { | |
/* | |
* reset the NMI-timeout, listing all files on a slow | |
* console might take a lot of time:
*/ | |
touch_nmi_watchdog(); | |
show_task(p); | |
} | |
read_unlock(&tasklist_lock); | |
} | |
/** | |
* reparent_to_init() - Reparent the calling kernel thread to the init task. | |
* | |
* If a kernel thread is launched as a result of a system call, or if | |
* it ever exits, it should generally reparent itself to init so that | |
* it is correctly cleaned up on exit. | |
* | |
* The various task state such as scheduling policy and priority may have | |
* been inherited from a user process, so we reset them to sane values here.
* | |
* NOTE that reparent_to_init() gives the caller full capabilities. | |
*/ | |
void reparent_to_init(void) | |
{ | |
struct task_struct *this_task = current; | |
write_lock_irq(&tasklist_lock); | |
/* Reparent to init */ | |
REMOVE_LINKS(this_task); | |
this_task->p_pptr = child_reaper; | |
this_task->p_opptr = child_reaper; | |
SET_LINKS(this_task); | |
/* Set the exit signal to SIGCHLD so we signal init on exit */ | |
this_task->exit_signal = SIGCHLD; | |
/* We also take the runqueue_lock while altering task fields | |
* which affect scheduling decisions */ | |
spin_lock(&runqueue_lock); | |
this_task->ptrace = 0; | |
this_task->nice = DEF_NICE; | |
this_task->policy = SCHED_OTHER; | |
/* cpus_allowed? */ | |
/* rt_priority? */ | |
/* signals? */ | |
this_task->cap_effective = CAP_INIT_EFF_SET; | |
this_task->cap_inheritable = CAP_INIT_INH_SET; | |
this_task->cap_permitted = CAP_FULL_SET; | |
this_task->keep_capabilities = 0; | |
memcpy(this_task->rlim, init_task.rlim, sizeof(*(this_task->rlim))); | |
this_task->user = INIT_USER; | |
spin_unlock(&runqueue_lock); | |
write_unlock_irq(&tasklist_lock); | |
} | |
/* | |
* Put all the gunge required to become a kernel thread without | |
* attached user resources in one place where it belongs. | |
*/ | |
void daemonize(void) | |
{ | |
struct fs_struct *fs; | |
/* | |
* If we were started as result of loading a module, close all of the | |
* user space pages. We don't need them, and if we didn't close them | |
* they would be locked into memory. | |
*/ | |
exit_mm(current); | |
current->session = 1; | |
current->pgrp = 1; | |
current->tty = NULL; | |
/* Become as one with the init task */ | |
exit_fs(current); /* current->fs->count--; */ | |
fs = init_task.fs; | |
current->fs = fs; | |
atomic_inc(&fs->count); | |
exit_files(current); | |
current->files = init_task.files; | |
atomic_inc(¤t->files->count); | |
} | |
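/* | |
 * Illustrative sketch (added; not part of the original file). A 2.4-style | |
 * kernel thread usually calls daemonize() (and, per the comment above, | |
 * reparent_to_init()) as its first step so it drops its user-space mm and | |
 * files and is reaped by init. The names my_kthread, my_stop_flag and | |
 * my_waitq are hypothetical: | |
 * | |
 *     static int my_kthread(void *unused) | |
 *     { | |
 *         daemonize(); | |
 *         reparent_to_init(); | |
 *         strcpy(current->comm, "mythread"); | |
 *         while (!my_stop_flag) | |
 *             interruptible_sleep_on(&my_waitq); | |
 *         return 0; | |
 *     } | |
 */ | |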
extern unsigned long wait_init_idle; | |
void __init init_idle(void) | |
{ | |
struct schedule_data * sched_data; | |
sched_data = &aligned_data[smp_processor_id()].schedule_data; | |
if (current != &init_task && task_on_runqueue(current)) { | |
printk("UGH! (%d:%d) was on the runqueue, removing.\n", | |
smp_processor_id(), current->pid); | |
del_from_runqueue(current); | |
} | |
sched_data->curr = current; | |
sched_data->last_schedule = get_cycles(); | |
clear_bit(current->processor, &wait_init_idle); | |
} | |
extern void init_timervecs (void); | |
void __init sched_init(void) | |
{ | |
/* | |
* We have to do a little magic to get the first | |
* process right in SMP mode. | |
*/ | |
int cpu = smp_processor_id(); | |
int nr; | |
init_task.processor = cpu; | |
for(nr = 0; nr < PIDHASH_SZ; nr++) | |
pidhash[nr] = NULL; | |
init_timervecs(); | |
init_bh(TIMER_BH, timer_bh); | |
init_bh(TQUEUE_BH, tqueue_bh); | |
init_bh(IMMEDIATE_BH, immediate_bh); | |
/* | |
* The boot idle thread does lazy MMU switching as well: | |
*/ | |
atomic_inc(&init_mm.mm_count); | |
enter_lazy_tlb(&init_mm, current, cpu); | |
} |
/* ==== second file in this gist: include/linux/sched.h ==== */ | |
#ifndef _LINUX_SCHED_H | |
#define _LINUX_SCHED_H | |
#include <asm/param.h> /* for HZ */ | |
extern unsigned long event; | |
#include <linux/config.h> | |
#include <linux/binfmts.h> | |
#include <linux/threads.h> | |
#include <linux/kernel.h> | |
#include <linux/types.h> | |
#include <linux/times.h> | |
#include <linux/timex.h> | |
#include <linux/rbtree.h> | |
#include <asm/system.h> | |
#include <asm/semaphore.h> | |
#include <asm/page.h> | |
#include <asm/ptrace.h> | |
#include <asm/mmu.h> | |
#include <linux/smp.h> | |
#include <linux/tty.h> | |
#include <linux/sem.h> | |
#include <linux/signal.h> | |
#include <linux/securebits.h> | |
#include <linux/fs_struct.h> | |
struct exec_domain; | |
/* | |
* cloning flags: | |
*/ | |
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */ | |
#define CLONE_VM 0x00000100 /* set if VM shared between processes */ | |
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */ | |
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */ | |
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */ | |
#define CLONE_PID 0x00001000 /* set if pid shared */ | |
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */ | |
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */ | |
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */ | |
#define CLONE_THREAD 0x00010000 /* Same thread group? */ | |
#define CLONE_NEWNS 0x00020000 /* New namespace group? */ | |
#define CLONE_SIGNAL (CLONE_SIGHAND | CLONE_THREAD) | |
/* | |
* These are the constants used to fake the fixed-point load-average | |
* counting. Some notes: | |
* - 11 bit fractions expand to 22 bits by the multiplies: this gives | |
* a load-average precision of 10 bits integer + 11 bits fractional | |
* - if you want to count load-averages more often, you need more | |
* precision, or rounding will get you. With 2-second counting freq, | |
* the EXP_n values would be 1981, 2034 and 2043 if still using only | |
* 11 bit fractions. | |
*/ | |
extern unsigned long avenrun[]; /* Load averages */ | |
#define FSHIFT 11 /* nr of bits of precision */ | |
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */ | |
#define LOAD_FREQ (5*HZ) /* 5 sec intervals */ | |
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */ | |
#define EXP_5 2014 /* 1/exp(5sec/5min) */ | |
#define EXP_15 2037 /* 1/exp(5sec/15min) */ | |
#define CALC_LOAD(load,exp,n) \ | |
load *= exp; \ | |
load += n*(FIXED_1-exp); \ | |
load >>= FSHIFT; | |
#define CT_TO_SECS(x) ((x) / HZ) | |
#define CT_TO_USECS(x) (((x) % HZ) * 1000000/HZ) | |
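/* | |
 * Worked example (added for clarity, not in the original header): with | |
 * FIXED_1 = 2048 and EXP_1 = 1884, a load average that is currently 0 and | |
 * sees n = 1*FIXED_1 (one runnable task, in fixed-point) updates as | |
 * | |
 *     load = (0*1884 + 2048*(2048 - 1884)) >> 11 = 164   (about 0.08) | |
 * | |
 * i.e. every LOAD_FREQ tick moves avenrun[] a fraction | |
 * (FIXED_1 - EXP_1)/FIXED_1, roughly 8% for the 1-minute average, of the | |
 * way toward the instantaneous number of runnable tasks. | |
 */ | |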
extern int nr_running, nr_threads; | |
extern int last_pid; | |
#include <linux/fs.h> | |
#include <linux/time.h> | |
#include <linux/param.h> | |
#include <linux/resource.h> | |
#ifdef __KERNEL__ | |
#include <linux/timer.h> | |
#endif | |
#include <asm/processor.h> | |
#define TASK_RUNNING 0 | |
#define TASK_INTERRUPTIBLE 1 | |
#define TASK_UNINTERRUPTIBLE 2 | |
#define TASK_ZOMBIE 4 | |
#define TASK_STOPPED 8 | |
#define __set_task_state(tsk, state_value) \ | |
do { (tsk)->state = (state_value); } while (0) | |
#ifdef CONFIG_SMP | |
#define set_task_state(tsk, state_value) \ | |
set_mb((tsk)->state, (state_value)) | |
#else | |
#define set_task_state(tsk, state_value) \ | |
__set_task_state((tsk), (state_value)) | |
#endif | |
#define __set_current_state(state_value) \ | |
do { current->state = (state_value); } while (0) | |
#ifdef CONFIG_SMP | |
#define set_current_state(state_value) \ | |
set_mb(current->state, (state_value)) | |
#else | |
#define set_current_state(state_value) \ | |
__set_current_state(state_value) | |
#endif | |
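/* | |
 * Usage sketch (added note, not in the original header): the state is set | |
 * *before* the wakeup condition is re-checked so a concurrent wake_up() on | |
 * another CPU cannot be missed; set_mb() supplies the required memory | |
 * barrier on SMP. The __wait_event() helpers later in this header follow | |
 * exactly this pattern: | |
 * | |
 *     set_current_state(TASK_INTERRUPTIBLE); | |
 *     if (!condition) | |
 *         schedule(); | |
 *     set_current_state(TASK_RUNNING); | |
 */ | |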
/* | |
* Scheduling policies | |
*/ | |
#define SCHED_OTHER 0 | |
#define SCHED_FIFO 1 | |
#define SCHED_RR 2 | |
/* | |
* This is an additional bit set when we want to | |
* yield the CPU for one re-schedule.. | |
*/ | |
#define SCHED_YIELD 0x10 | |
struct sched_param { | |
int sched_priority; | |
}; | |
struct completion; | |
#ifdef __KERNEL__ | |
#include <linux/spinlock.h> | |
/* | |
* This serializes "schedule()" and also protects | |
* the run-queue from deletions/modifications (but | |
* _adding_ to the beginning of the run-queue has | |
* a separate lock). | |
*/ | |
extern rwlock_t tasklist_lock; | |
extern spinlock_t runqueue_lock; | |
extern spinlock_t mmlist_lock; | |
extern void sched_init(void); | |
extern void init_idle(void); | |
extern void show_state(void); | |
extern void cpu_init (void); | |
extern void trap_init(void); | |
extern void update_process_times(int user); | |
extern void update_one_process(struct task_struct *p, unsigned long user, | |
unsigned long system, int cpu); | |
#define MAX_SCHEDULE_TIMEOUT LONG_MAX | |
extern signed long FASTCALL(schedule_timeout(signed long timeout)); | |
asmlinkage void schedule(void); | |
extern int schedule_task(struct tq_struct *task); | |
extern void flush_scheduled_tasks(void); | |
extern int start_context_thread(void); | |
extern int current_is_keventd(void); | |
/* | |
* The default fd array needs to be at least BITS_PER_LONG, | |
* as this is the granularity returned by copy_fdset(). | |
*/ | |
#define NR_OPEN_DEFAULT BITS_PER_LONG | |
struct namespace; | |
/* | |
* Open file table structure | |
*/ | |
struct files_struct { | |
atomic_t count; | |
rwlock_t file_lock; /* Protects all the below members. Nests inside tsk->alloc_lock */ | |
int max_fds; | |
int max_fdset; | |
int next_fd; | |
struct file ** fd; /* current fd array */ | |
fd_set *close_on_exec; | |
fd_set *open_fds; | |
fd_set close_on_exec_init; | |
fd_set open_fds_init; | |
struct file * fd_array[NR_OPEN_DEFAULT]; | |
}; | |
#define INIT_FILES \ | |
{ \ | |
count: ATOMIC_INIT(1), \ | |
file_lock: RW_LOCK_UNLOCKED, \ | |
max_fds: NR_OPEN_DEFAULT, \ | |
max_fdset: __FD_SETSIZE, \ | |
next_fd: 0, \ | |
fd: &init_files.fd_array[0], \ | |
close_on_exec: &init_files.close_on_exec_init, \ | |
open_fds: &init_files.open_fds_init, \ | |
close_on_exec_init: { { 0, } }, \ | |
open_fds_init: { { 0, } }, \ | |
fd_array: { NULL, } \ | |
} | |
/* Maximum number of active map areas.. This is a random (large) number */ | |
#define DEFAULT_MAX_MAP_COUNT (65536) | |
extern int max_map_count; | |
struct mm_struct { | |
struct vm_area_struct * mmap; /* list of VMAs */ | |
rb_root_t mm_rb; | |
struct vm_area_struct * mmap_cache; /* last find_vma result */ | |
pgd_t * pgd; | |
atomic_t mm_users; /* How many users with user space? */ | |
atomic_t mm_count; /* How many references to "struct mm_struct" (users count as 1) */ | |
int map_count; /* number of VMAs */ | |
struct rw_semaphore mmap_sem; | |
spinlock_t page_table_lock; /* Protects task page tables and mm->rss */ | |
struct list_head mmlist; /* List of all active mm's. These are globally strung | |
* together off init_mm.mmlist, and are protected | |
* by mmlist_lock | |
*/ | |
unsigned long start_code, end_code, start_data, end_data; | |
unsigned long start_brk, brk, start_stack; | |
unsigned long arg_start, arg_end, env_start, env_end; | |
unsigned long rss, total_vm, locked_vm; | |
unsigned long def_flags; | |
unsigned long cpu_vm_mask; | |
unsigned long swap_address; | |
unsigned dumpable:1; | |
/* Architecture-specific MM context */ | |
mm_context_t context; | |
}; | |
extern int mmlist_nr; | |
#define INIT_MM(name) \ | |
{ \ | |
mm_rb: RB_ROOT, \ | |
pgd: swapper_pg_dir, \ | |
mm_users: ATOMIC_INIT(2), \ | |
mm_count: ATOMIC_INIT(1), \ | |
mmap_sem: __RWSEM_INITIALIZER(name.mmap_sem), \ | |
page_table_lock: SPIN_LOCK_UNLOCKED, \ | |
mmlist: LIST_HEAD_INIT(name.mmlist), \ | |
} | |
struct signal_struct { | |
atomic_t count; | |
struct k_sigaction action[_NSIG]; | |
spinlock_t siglock; | |
}; | |
#define INIT_SIGNALS { \ | |
count: ATOMIC_INIT(1), \ | |
action: { {{0,}}, }, \ | |
siglock: SPIN_LOCK_UNLOCKED \ | |
} | |
/* | |
* Some day this will be a full-fledged user tracking system.. | |
*/ | |
struct user_struct { | |
atomic_t __count; /* reference count */ | |
atomic_t processes; /* How many processes does this user have? */ | |
atomic_t files; /* How many open files does this user have? */ | |
/* Hash table maintenance information */ | |
struct user_struct *next, **pprev; | |
uid_t uid; | |
}; | |
#define get_current_user() ({ \ | |
struct user_struct *__user = current->user; \ | |
atomic_inc(&__user->__count); \ | |
__user; }) | |
extern struct user_struct root_user; | |
#define INIT_USER (&root_user) | |
struct task_struct { | |
/* | |
* offsets of these are hardcoded elsewhere - touch with care | |
*/ | |
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */ | |
unsigned long flags; /* per process flags, defined below */ | |
int sigpending; | |
mm_segment_t addr_limit; /* thread address space: | |
0-0xBFFFFFFF for user-thread | |
0-0xFFFFFFFF for kernel-thread | |
*/ | |
struct exec_domain *exec_domain; | |
volatile long need_resched; | |
unsigned long ptrace; | |
int lock_depth; /* Lock depth */ | |
/* | |
* offset 32 begins here on 32-bit platforms. We keep | |
* all fields in a single cacheline that are needed for | |
* the goodness() loop in schedule(). | |
*/ | |
long counter; | |
long nice; | |
unsigned long policy; | |
struct mm_struct *mm; | |
int processor; | |
/* | |
* cpus_runnable is ~0 if the process is not running on any | |
* CPU. It's (1 << cpu) if it's running on a CPU. This mask | |
* is updated under the runqueue lock. | |
* | |
* To determine whether a process might run on a CPU, this | |
* mask is AND-ed with cpus_allowed. | |
*/ | |
unsigned long cpus_runnable, cpus_allowed; | |
/* | |
* (only the 'next' pointer fits into the cacheline, but | |
* that's just fine.) | |
*/ | |
struct list_head run_list; | |
unsigned long sleep_time; | |
struct task_struct *next_task, *prev_task; | |
struct mm_struct *active_mm; | |
struct list_head local_pages; | |
unsigned int allocation_order, nr_local_pages; | |
/* task state */ | |
struct linux_binfmt *binfmt; | |
int exit_code, exit_signal; | |
int pdeath_signal; /* The signal sent when the parent dies */ | |
/* ??? */ | |
unsigned long personality; | |
int did_exec:1; | |
pid_t pid; | |
pid_t pgrp; | |
pid_t tty_old_pgrp; | |
pid_t session; | |
pid_t tgid; | |
/* boolean value for session group leader */ | |
int leader; | |
/* | |
* pointers to (original) parent process, youngest child, younger sibling, | |
* older sibling, respectively. (p->father can be replaced with | |
* p->p_pptr->pid) | |
*/ | |
struct task_struct *p_opptr, *p_pptr, *p_cptr, *p_ysptr, *p_osptr; | |
struct list_head thread_group; | |
/* PID hash table linkage. */ | |
struct task_struct *pidhash_next; | |
struct task_struct **pidhash_pprev; | |
wait_queue_head_t wait_chldexit; /* for wait4() */ | |
struct completion *vfork_done; /* for vfork() */ | |
unsigned long rt_priority; | |
unsigned long it_real_value, it_prof_value, it_virt_value; | |
unsigned long it_real_incr, it_prof_incr, it_virt_incr; | |
struct timer_list real_timer; | |
struct tms times; | |
unsigned long start_time; | |
long per_cpu_utime[NR_CPUS], per_cpu_stime[NR_CPUS]; | |
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */ | |
unsigned long min_flt, maj_flt, nswap, cmin_flt, cmaj_flt, cnswap; | |
int swappable:1; | |
/* process credentials */ | |
uid_t uid,euid,suid,fsuid; | |
gid_t gid,egid,sgid,fsgid; | |
int ngroups; | |
gid_t groups[NGROUPS]; | |
kernel_cap_t cap_effective, cap_inheritable, cap_permitted; | |
int keep_capabilities:1; | |
struct user_struct *user; | |
/* limits */ | |
struct rlimit rlim[RLIM_NLIMITS]; | |
unsigned short used_math; | |
char comm[16]; | |
/* file system info */ | |
int link_count, total_link_count; | |
struct tty_struct *tty; /* NULL if no tty */ | |
unsigned int locks; /* How many file locks are being held */ | |
/* ipc stuff */ | |
struct sem_undo *semundo; | |
struct sem_queue *semsleeping; | |
/* CPU-specific state of this task */ | |
struct thread_struct thread; | |
/* filesystem information */ | |
struct fs_struct *fs; | |
/* open file information */ | |
struct files_struct *files; | |
/* namespace */ | |
struct namespace *namespace; | |
/* signal handlers */ | |
spinlock_t sigmask_lock; /* Protects signal and blocked */ | |
struct signal_struct *sig; | |
sigset_t blocked; | |
struct sigpending pending; | |
unsigned long sas_ss_sp; | |
size_t sas_ss_size; | |
int (*notifier)(void *priv); | |
void *notifier_data; | |
sigset_t *notifier_mask; | |
/* Thread group tracking */ | |
u32 parent_exec_id; | |
u32 self_exec_id; | |
/* Protection of (de-)allocation: mm, files, fs, tty */ | |
spinlock_t alloc_lock; | |
/* journalling filesystem info */ | |
void *journal_info; | |
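/* Added note: the two fields below are not part of a stock 2.4 task_struct | |
 * and appear to have been added for the lottery-scheduler exercise this | |
 * gist belongs to. */ | |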
int tn; | |
unsigned long cpustart; | |
}; | |
/* | |
* Per process flags | |
*/ | |
#define PF_ALIGNWARN 0x00000001 /* Print alignment warning msgs */ | |
/* Not implemented yet, only for 486*/ | |
#define PF_STARTING 0x00000002 /* being created */ | |
#define PF_EXITING 0x00000004 /* getting shut down */ | |
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */ | |
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */ | |
#define PF_DUMPCORE 0x00000200 /* dumped core */ | |
#define PF_SIGNALED 0x00000400 /* killed by a signal */ | |
#define PF_MEMALLOC 0x00000800 /* Allocating memory */ | |
#define PF_MEMDIE 0x00001000 /* Killed for out-of-memory */ | |
#define PF_FREE_PAGES 0x00002000 /* per process page freeing */ | |
#define PF_NOIO 0x00004000 /* avoid generating further I/O */ | |
#define PF_USEDFPU 0x00100000 /* task used FPU this quantum (SMP) */ | |
/* | |
* Ptrace flags | |
*/ | |
#define PT_PTRACED 0x00000001 | |
#define PT_TRACESYS 0x00000002 | |
#define PT_DTRACE 0x00000004 /* delayed trace (used on m68k, i386) */ | |
#define PT_TRACESYSGOOD 0x00000008 | |
#define PT_PTRACE_CAP 0x00000010 /* ptracer can follow suid-exec */ | |
/* | |
* Limit the stack to some sane default: root can always | |
* increase this limit if needed.. 8MB seems reasonable. | |
*/ | |
#define _STK_LIM (8*1024*1024) | |
#define DEF_COUNTER (10*HZ/100) /* 100 ms time slice */ | |
#define MAX_COUNTER (20*HZ/100) | |
#define DEF_NICE (0) | |
extern void yield(void); | |
/* | |
* The default (Linux) execution domain. | |
*/ | |
extern struct exec_domain default_exec_domain; | |
/* | |
* INIT_TASK is used to set up the first task table, touch at | |
* your own risk! Base=0, limit=0x1fffff (=2MB) | |
*/ | |
#define INIT_TASK(tsk) \ | |
{ \ | |
state: 0, \ | |
flags: 0, \ | |
sigpending: 0, \ | |
addr_limit: KERNEL_DS, \ | |
exec_domain: &default_exec_domain, \ | |
lock_depth: -1, \ | |
counter: DEF_COUNTER, \ | |
nice: DEF_NICE, \ | |
policy: SCHED_OTHER, \ | |
mm: NULL, \ | |
active_mm: &init_mm, \ | |
cpus_runnable: -1, \ | |
cpus_allowed: -1, \ | |
run_list: LIST_HEAD_INIT(tsk.run_list), \ | |
next_task: &tsk, \ | |
prev_task: &tsk, \ | |
p_opptr: &tsk, \ | |
p_pptr: &tsk, \ | |
thread_group: LIST_HEAD_INIT(tsk.thread_group), \ | |
wait_chldexit: __WAIT_QUEUE_HEAD_INITIALIZER(tsk.wait_chldexit),\ | |
real_timer: { \ | |
function: it_real_fn \ | |
}, \ | |
cap_effective: CAP_INIT_EFF_SET, \ | |
cap_inheritable: CAP_INIT_INH_SET, \ | |
cap_permitted: CAP_FULL_SET, \ | |
keep_capabilities: 0, \ | |
rlim: INIT_RLIMITS, \ | |
user: INIT_USER, \ | |
comm: "swapper", \ | |
thread: INIT_THREAD, \ | |
fs: &init_fs, \ | |
files: &init_files, \ | |
sigmask_lock: SPIN_LOCK_UNLOCKED, \ | |
sig: &init_signals, \ | |
pending: { NULL, &tsk.pending.head, {{0}}}, \ | |
blocked: {{0}}, \ | |
alloc_lock: SPIN_LOCK_UNLOCKED, \ | |
journal_info: NULL, \ | |
} | |
#ifndef INIT_TASK_SIZE | |
# define INIT_TASK_SIZE 2048*sizeof(long) | |
#endif | |
union task_union { | |
struct task_struct task; | |
unsigned long stack[INIT_TASK_SIZE/sizeof(long)]; | |
}; | |
extern union task_union init_task_union; | |
extern struct mm_struct init_mm; | |
extern struct task_struct *init_tasks[NR_CPUS]; | |
/* PID hashing. (shouldn't this be dynamic?) */ | |
#define PIDHASH_SZ (4096 >> 2) | |
extern struct task_struct *pidhash[PIDHASH_SZ]; | |
#define pid_hashfn(x) ((((x) >> 8) ^ (x)) & (PIDHASH_SZ - 1)) | |
static inline void hash_pid(struct task_struct *p) | |
{ | |
struct task_struct **htable = &pidhash[pid_hashfn(p->pid)]; | |
if((p->pidhash_next = *htable) != NULL) | |
(*htable)->pidhash_pprev = &p->pidhash_next; | |
*htable = p; | |
p->pidhash_pprev = htable; | |
} | |
static inline void unhash_pid(struct task_struct *p) | |
{ | |
if(p->pidhash_next) | |
p->pidhash_next->pidhash_pprev = p->pidhash_pprev; | |
*p->pidhash_pprev = p->pidhash_next; | |
} | |
static inline struct task_struct *find_task_by_pid(int pid) | |
{ | |
struct task_struct *p, **htable = &pidhash[pid_hashfn(pid)]; | |
for(p = *htable; p && p->pid != pid; p = p->pidhash_next) | |
; | |
return p; | |
} | |
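/* | |
 * Illustrative usage (added comment, not in the original header): pid-hash | |
 * lookups must be made with tasklist_lock held for reading, and the result | |
 * only used while the lock is held, e.g.: | |
 * | |
 *     struct task_struct *p; | |
 *     read_lock(&tasklist_lock); | |
 *     p = find_task_by_pid(pid); | |
 *     if (p) | |
 *         send_sig(SIGTERM, p, 1); | |
 *     read_unlock(&tasklist_lock); | |
 */ | |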
#define task_has_cpu(tsk) ((tsk)->cpus_runnable != ~0UL) | |
static inline void task_set_cpu(struct task_struct *tsk, unsigned int cpu) | |
{ | |
tsk->processor = cpu; | |
tsk->cpus_runnable = 1UL << cpu; | |
} | |
static inline void task_release_cpu(struct task_struct *tsk) | |
{ | |
tsk->cpus_runnable = ~0UL; | |
} | |
/* per-UID process charging. */ | |
extern struct user_struct * alloc_uid(uid_t); | |
extern void free_uid(struct user_struct *); | |
#include <asm/current.h> | |
extern unsigned long volatile jiffies; | |
extern unsigned long itimer_ticks; | |
extern unsigned long itimer_next; | |
extern struct timeval xtime; | |
extern void do_timer(struct pt_regs *); | |
extern unsigned int * prof_buffer; | |
extern unsigned long prof_len; | |
extern unsigned long prof_shift; | |
#define CURRENT_TIME (xtime.tv_sec) | |
extern void FASTCALL(__wake_up(wait_queue_head_t *q, unsigned int mode, int nr)); | |
extern void FASTCALL(__wake_up_sync(wait_queue_head_t *q, unsigned int mode, int nr)); | |
extern void FASTCALL(sleep_on(wait_queue_head_t *q)); | |
extern long FASTCALL(sleep_on_timeout(wait_queue_head_t *q, | |
signed long timeout)); | |
extern void FASTCALL(interruptible_sleep_on(wait_queue_head_t *q)); | |
extern long FASTCALL(interruptible_sleep_on_timeout(wait_queue_head_t *q, | |
signed long timeout)); | |
extern int FASTCALL(wake_up_process(struct task_struct * tsk)); | |
#define wake_up(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) | |
#define wake_up_nr(x, nr) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) | |
#define wake_up_all(x) __wake_up((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 0) | |
#define wake_up_sync(x) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, 1) | |
#define wake_up_sync_nr(x, nr) __wake_up_sync((x),TASK_UNINTERRUPTIBLE | TASK_INTERRUPTIBLE, nr) | |
#define wake_up_interruptible(x) __wake_up((x),TASK_INTERRUPTIBLE, 1) | |
#define wake_up_interruptible_nr(x, nr) __wake_up((x),TASK_INTERRUPTIBLE, nr) | |
#define wake_up_interruptible_all(x) __wake_up((x),TASK_INTERRUPTIBLE, 0) | |
#define wake_up_interruptible_sync(x) __wake_up_sync((x),TASK_INTERRUPTIBLE, 1) | |
#define wake_up_interruptible_sync_nr(x, nr) __wake_up_sync((x),TASK_INTERRUPTIBLE, nr) | |
asmlinkage long sys_wait4(pid_t pid,unsigned int * stat_addr, int options, struct rusage * ru); | |
extern int in_group_p(gid_t); | |
extern int in_egroup_p(gid_t); | |
extern void proc_caches_init(void); | |
extern void flush_signals(struct task_struct *); | |
extern void flush_signal_handlers(struct task_struct *); | |
extern void sig_exit(int, int, struct siginfo *); | |
extern int dequeue_signal(sigset_t *, siginfo_t *); | |
extern void block_all_signals(int (*notifier)(void *priv), void *priv, | |
sigset_t *mask); | |
extern void unblock_all_signals(void); | |
extern int send_sig_info(int, struct siginfo *, struct task_struct *); | |
extern int force_sig_info(int, struct siginfo *, struct task_struct *); | |
extern int kill_pg_info(int, struct siginfo *, pid_t); | |
extern int kill_sl_info(int, struct siginfo *, pid_t); | |
extern int kill_proc_info(int, struct siginfo *, pid_t); | |
extern void notify_parent(struct task_struct *, int); | |
extern void do_notify_parent(struct task_struct *, int); | |
extern void force_sig(int, struct task_struct *); | |
extern int send_sig(int, struct task_struct *, int); | |
extern int kill_pg(pid_t, int, int); | |
extern int kill_sl(pid_t, int, int); | |
extern int kill_proc(pid_t, int, int); | |
extern int do_sigaction(int, const struct k_sigaction *, struct k_sigaction *); | |
extern int do_sigaltstack(const stack_t *, stack_t *, unsigned long); | |
static inline int signal_pending(struct task_struct *p) | |
{ | |
return (p->sigpending != 0); | |
} | |
/* | |
* Re-calculate pending state from the set of locally pending | |
* signals, globally pending signals, and blocked signals. | |
*/ | |
static inline int has_pending_signals(sigset_t *signal, sigset_t *blocked) | |
{ | |
unsigned long ready; | |
long i; | |
switch (_NSIG_WORDS) { | |
default: | |
for (i = _NSIG_WORDS, ready = 0; --i >= 0 ;) | |
ready |= signal->sig[i] &~ blocked->sig[i]; | |
break; | |
case 4: ready = signal->sig[3] &~ blocked->sig[3]; | |
ready |= signal->sig[2] &~ blocked->sig[2]; | |
ready |= signal->sig[1] &~ blocked->sig[1]; | |
ready |= signal->sig[0] &~ blocked->sig[0]; | |
break; | |
case 2: ready = signal->sig[1] &~ blocked->sig[1]; | |
ready |= signal->sig[0] &~ blocked->sig[0]; | |
break; | |
case 1: ready = signal->sig[0] &~ blocked->sig[0]; | |
} | |
return ready != 0; | |
} | |
/* Reevaluate whether the task has signals pending delivery. | |
This is required every time the blocked sigset_t changes. | |
All callers should have t->sigmask_lock. */ | |
static inline void recalc_sigpending(struct task_struct *t) | |
{ | |
t->sigpending = has_pending_signals(&t->pending.signal, &t->blocked); | |
} | |
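/* | |
 * Usage sketch (added, not in the original header): whenever a task's | |
 * blocked set changes, sigpending must be recomputed under sigmask_lock; | |
 * `sig' below is a hypothetical signal number: | |
 * | |
 *     spin_lock_irq(&current->sigmask_lock); | |
 *     sigaddset(&current->blocked, sig); | |
 *     recalc_sigpending(current); | |
 *     spin_unlock_irq(&current->sigmask_lock); | |
 */ | |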
/* True if we are on the alternate signal stack. */ | |
static inline int on_sig_stack(unsigned long sp) | |
{ | |
return (sp - current->sas_ss_sp < current->sas_ss_size); | |
} | |
static inline int sas_ss_flags(unsigned long sp) | |
{ | |
return (current->sas_ss_size == 0 ? SS_DISABLE | |
: on_sig_stack(sp) ? SS_ONSTACK : 0); | |
} | |
extern int request_irq(unsigned int, | |
void (*handler)(int, void *, struct pt_regs *), | |
unsigned long, const char *, void *); | |
extern void free_irq(unsigned int, void *); | |
/* | |
* This has now become a routine instead of a macro; it sets a flag if | |
* it returns true (to do BSD-style accounting where the process is flagged | |
* if it uses root privs). The implication of this is that you should do | |
* normal permissions checks first, and check suser() last. | |
* | |
* [Dec 1997 -- Chris Evans] | |
* For correctness, the above considerations need to be extended to | |
* fsuser(). This is done, along with moving fsuser() checks to be | |
* last. | |
* | |
* These will be removed, but in the mean time, when the SECURE_NOROOT | |
* flag is set, uids don't grant privilege. | |
*/ | |
static inline int suser(void) | |
{ | |
if (!issecure(SECURE_NOROOT) && current->euid == 0) { | |
current->flags |= PF_SUPERPRIV; | |
return 1; | |
} | |
return 0; | |
} | |
static inline int fsuser(void) | |
{ | |
if (!issecure(SECURE_NOROOT) && current->fsuid == 0) { | |
current->flags |= PF_SUPERPRIV; | |
return 1; | |
} | |
return 0; | |
} | |
/* | |
* capable() checks for a particular capability. | |
* New privilege checks should use this interface, rather than suser() or | |
* fsuser(). See include/linux/capability.h for defined capabilities. | |
*/ | |
static inline int capable(int cap) | |
{ | |
#if 1 /* ok now */ | |
if (cap_raised(current->cap_effective, cap)) | |
#else | |
if (cap_is_fs_cap(cap) ? current->fsuid == 0 : current->euid == 0) | |
#endif | |
{ | |
current->flags |= PF_SUPERPRIV; | |
return 1; | |
} | |
return 0; | |
} | |
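/* | |
 * Typical use (added note): a privileged operation tests one specific | |
 * capability and fails with -EPERM otherwise, e.g.: | |
 * | |
 *     if (!capable(CAP_SYS_ADMIN)) | |
 *         return -EPERM; | |
 */ | |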
/* | |
* Routines for handling mm_structs | |
*/ | |
extern struct mm_struct * mm_alloc(void); | |
extern struct mm_struct * start_lazy_tlb(void); | |
extern void end_lazy_tlb(struct mm_struct *mm); | |
/* mmdrop drops the mm and the page tables */ | |
extern inline void FASTCALL(__mmdrop(struct mm_struct *)); | |
static inline void mmdrop(struct mm_struct * mm) | |
{ | |
if (atomic_dec_and_test(&mm->mm_count)) | |
__mmdrop(mm); | |
} | |
/* mmput gets rid of the mappings and all user-space */ | |
extern void mmput(struct mm_struct *); | |
/* Remove the current task's stale references to the old mm_struct */ | |
extern void mm_release(void); | |
/* | |
* Routines for handling the fd arrays | |
*/ | |
extern struct file ** alloc_fd_array(int); | |
extern int expand_fd_array(struct files_struct *, int nr); | |
extern void free_fd_array(struct file **, int); | |
extern fd_set *alloc_fdset(int); | |
extern int expand_fdset(struct files_struct *, int nr); | |
extern void free_fdset(fd_set *, int); | |
extern int copy_thread(int, unsigned long, unsigned long, unsigned long, struct task_struct *, struct pt_regs *); | |
extern void flush_thread(void); | |
extern void exit_thread(void); | |
extern void exit_mm(struct task_struct *); | |
extern void exit_files(struct task_struct *); | |
extern void exit_sighand(struct task_struct *); | |
extern void reparent_to_init(void); | |
extern void daemonize(void); | |
extern int do_execve(char *, char **, char **, struct pt_regs *); | |
extern int do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long); | |
extern void FASTCALL(add_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); | |
extern void FASTCALL(add_wait_queue_exclusive(wait_queue_head_t *q, wait_queue_t * wait)); | |
extern void FASTCALL(remove_wait_queue(wait_queue_head_t *q, wait_queue_t * wait)); | |
#define __wait_event(wq, condition) \ | |
do { \ | |
wait_queue_t __wait; \ | |
init_waitqueue_entry(&__wait, current); \ | |
\ | |
add_wait_queue(&wq, &__wait); \ | |
for (;;) { \ | |
set_current_state(TASK_UNINTERRUPTIBLE); \ | |
if (condition) \ | |
break; \ | |
schedule(); \ | |
} \ | |
current->state = TASK_RUNNING; \ | |
remove_wait_queue(&wq, &__wait); \ | |
} while (0) | |
#define wait_event(wq, condition) \ | |
do { \ | |
if (condition) \ | |
break; \ | |
__wait_event(wq, condition); \ | |
} while (0) | |
#define __wait_event_interruptible(wq, condition, ret) \ | |
do { \ | |
wait_queue_t __wait; \ | |
init_waitqueue_entry(&__wait, current); \ | |
\ | |
add_wait_queue(&wq, &__wait); \ | |
for (;;) { \ | |
set_current_state(TASK_INTERRUPTIBLE); \ | |
if (condition) \ | |
break; \ | |
if (!signal_pending(current)) { \ | |
schedule(); \ | |
continue; \ | |
} \ | |
ret = -ERESTARTSYS; \ | |
break; \ | |
} \ | |
current->state = TASK_RUNNING; \ | |
remove_wait_queue(&wq, &__wait); \ | |
} while (0) | |
#define wait_event_interruptible(wq, condition) \ | |
({ \ | |
int __ret = 0; \ | |
if (!(condition)) \ | |
__wait_event_interruptible(wq, condition, __ret); \ | |
__ret; \ | |
}) | |
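/* | |
 * Illustrative producer/consumer sketch (added; not in the original header). | |
 * The names my_waitq and my_flag are hypothetical: | |
 * | |
 *     static DECLARE_WAIT_QUEUE_HEAD(my_waitq); | |
 *     static int my_flag; | |
 * | |
 *     waiter:   if (wait_event_interruptible(my_waitq, my_flag != 0)) | |
 *                   return -ERESTARTSYS;    (woken early by a signal) | |
 * | |
 *     waker:    my_flag = 1; | |
 *               wake_up_interruptible(&my_waitq); | |
 */ | |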
#define REMOVE_LINKS(p) do { \ | |
(p)->next_task->prev_task = (p)->prev_task; \ | |
(p)->prev_task->next_task = (p)->next_task; \ | |
if ((p)->p_osptr) \ | |
(p)->p_osptr->p_ysptr = (p)->p_ysptr; \ | |
if ((p)->p_ysptr) \ | |
(p)->p_ysptr->p_osptr = (p)->p_osptr; \ | |
else \ | |
(p)->p_pptr->p_cptr = (p)->p_osptr; \ | |
} while (0) | |
#define SET_LINKS(p) do { \ | |
(p)->next_task = &init_task; \ | |
(p)->prev_task = init_task.prev_task; \ | |
init_task.prev_task->next_task = (p); \ | |
init_task.prev_task = (p); \ | |
(p)->p_ysptr = NULL; \ | |
if (((p)->p_osptr = (p)->p_pptr->p_cptr) != NULL) \ | |
(p)->p_osptr->p_ysptr = p; \ | |
(p)->p_pptr->p_cptr = p; \ | |
} while (0) | |
#define for_each_task(p) \ | |
for (p = &init_task ; (p = p->next_task) != &init_task ; ) | |
#define for_each_thread(task) \ | |
for (task = next_thread(current) ; task != current ; task = next_thread(task)) | |
#define next_thread(p) \ | |
list_entry((p)->thread_group.next, struct task_struct, thread_group) | |
#define thread_group_leader(p) (p->pid == p->tgid) | |
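/* | |
 * Illustrative traversal (added comment): the global task list is walked | |
 * under tasklist_lock, exactly as show_state() does in sched.c, e.g. to | |
 * count runnable tasks: | |
 * | |
 *     struct task_struct *p; | |
 *     int runnable = 0; | |
 *     read_lock(&tasklist_lock); | |
 *     for_each_task(p) | |
 *         if (p->state == TASK_RUNNING) | |
 *             runnable++; | |
 *     read_unlock(&tasklist_lock); | |
 */ | |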
static inline void del_from_runqueue(struct task_struct * p) | |
{ | |
nr_running--; | |
p->sleep_time = jiffies; | |
list_del(&p->run_list); | |
p->run_list.next = NULL; | |
} | |
static inline int task_on_runqueue(struct task_struct *p) | |
{ | |
return (p->run_list.next != NULL); | |
} | |
static inline void unhash_process(struct task_struct *p) | |
{ | |
if (task_on_runqueue(p)) | |
out_of_line_bug(); | |
write_lock_irq(&tasklist_lock); | |
nr_threads--; | |
unhash_pid(p); | |
REMOVE_LINKS(p); | |
list_del(&p->thread_group); | |
write_unlock_irq(&tasklist_lock); | |
} | |
/* Protects ->fs, ->files, ->mm, and synchronises with wait4(). Nests inside tasklist_lock */ | |
static inline void task_lock(struct task_struct *p) | |
{ | |
spin_lock(&p->alloc_lock); | |
} | |
static inline void task_unlock(struct task_struct *p) | |
{ | |
spin_unlock(&p->alloc_lock); | |
} | |
/* write full pathname into buffer and return start of pathname */ | |
static inline char * d_path(struct dentry *dentry, struct vfsmount *vfsmnt, | |
char *buf, int buflen) | |
{ | |
char *res; | |
struct vfsmount *rootmnt; | |
struct dentry *root; | |
read_lock(¤t->fs->lock); | |
rootmnt = mntget(current->fs->rootmnt); | |
root = dget(current->fs->root); | |
read_unlock(¤t->fs->lock); | |
spin_lock(&dcache_lock); | |
res = __d_path(dentry, vfsmnt, root, rootmnt, buf, buflen); | |
spin_unlock(&dcache_lock); | |
dput(root); | |
mntput(rootmnt); | |
return res; | |
} | |
static inline int need_resched(void) | |
{ | |
return (unlikely(current->need_resched)); | |
} | |
extern void __cond_resched(void); | |
static inline void cond_resched(void) | |
{ | |
if (need_resched()) | |
__cond_resched(); | |
} | |
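/* | |
 * Added note: long-running kernel loops can call cond_resched() each | |
 * iteration so that a pending reschedule (current->need_resched) is | |
 * honoured promptly; process_one_item() below is a hypothetical helper: | |
 * | |
 *     for (i = 0; i < nr_items; i++) { | |
 *         process_one_item(i); | |
 *         cond_resched(); | |
 *     } | |
 */ | |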
#endif /* __KERNEL__ */ | |
#endif |